Example #1
def keep_task_run_time(start_time=None):
    """
    Record the run time of a Spark task.
    :param start_time: start time of the task, as a datetime
    :return: None
    """
    # todo
    from datetime import datetime
    from pyspark.shell import spark

    # from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DataType  # import the column types
    date_ = datetime.now()
    origin_data = spark.sparkContext.parallelize([
        (date_, 'test', (datetime.now() - start_time).seconds),
    ])
    # Use given schema
    # schema = StructType([
    #     StructField("generateTime", DataType(), True),
    #     StructField("name", StringType(), True),
    #     StructField("runTime", IntegerType(), True)
    # ])

    # Use the schema spark auto given
    spark_df = spark.createDataFrame(
        origin_data, schema=['generateTime', 'name', 'runTime'])

    # Save the data, overwriting any existing output
    file_ = r"/srv/BigData/dbdata_service/ffk/test_run_record/test.parquet"
    spark_df.write.parquet(path=file_, mode='overwrite')
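A hedged usage sketch for the function above (the surrounding job code is assumed): pass the moment the job started so the elapsed seconds are recorded.

from datetime import datetime

job_start = datetime.now()
# ... run the actual Spark work here ...
keep_task_run_time(start_time=job_start)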
Example #2
def mP(a, tab, Aligner, sc):
    manager = multiprocessing.Manager()
    alignments = manager.list()
    cores = int(input('Enter the number of processors to use: '))
    if cores > multiprocessing.cpu_count():
        cores = multiprocessing.cpu_count()
        print("Requested more processors than are available; using", str(cores))
    else:
        print(str(cores), "processors in use")
    processes = []
    data = ReadFile.SPARKreadFile(sc)
    dict = [x["SEQ"] for x in data.rdd.collect()]
    #dict = ReadFile.HengLireadFile() #Heng Li
    chunk_size = len(dict) / cores
    slices = Chunks(dict, math.ceil(chunk_size))
    for i, s in enumerate(slices):
        procname = 'processor' + str(i)
        p = multiprocessing.Process(target=Alignment.mPalignment,
                                    args=(a, tab, Aligner, s, alignments,
                                          procname))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    DF = spark.createDataFrame(alignments)
    DataFrame = DF.join(data, on=['seq'], how='inner')
    return DataFrame
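The function above relies on a Chunks helper that is not shown; a minimal sketch, assuming it simply yields fixed-size slices of the sequence list (one slice per worker process):

def Chunks(seq, size):
    # Yield successive slices of `seq`, each at most `size` items long.
    for i in range(0, len(seq), size):
        yield seq[i:i + size]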
Example #3
 def createInterestVector(self):
     self.predict_interest=pd.read_csv(r'../analyed_data/fourth_predict_interest.csv',parse_dates=['report_date'])
     self.predict_interest = spark.createDataFrame(self.predict_interest)
     # vecAssembler = VectorAssembler(
     #     inputCols=['mfd_daily_yield', 'mfd_7daily_yield', 'Interest_O_N', 'Interest_1_W', 'Interest_2_W',
     #                'Interest_1_M', 'Interest_3_M', 'Interest_6_M', 'Interest_9_M', 'Interest_1_Y'],
     #     outputCol='features')
     vecAssembler = VectorAssembler(
         inputCols=[ 'Interest_3_M', 'Interest_6_M', 'Interest_9_M', 'Interest_1_Y'],
         outputCol='features')
     self.predict_interest = vecAssembler.transform(self.predict_interest)
Example #4
def write_log(level_log, program_name, procedure_name, message):
    log_row = namedtuple('log_row', 'TIME_LOG LEVEL_LOG PROGRAM_NAME PROCEDURE_NAME MESSAGE'.split())
    data = log_row(datetime.datetime.today(), level_log, program_name, procedure_name, message)
    result = spark.createDataFrame([data])
    result.write \
        .format('jdbc') \
        .mode('append') \
        .option('driver', 'oracle.jdbc.OracleDriver') \
        .option('url', DATABASE_SOURCE['url']) \
        .option('dbtable', 'log_table') \
        .option('user', DATABASE_SOURCE['user']) \
        .option('password', DATABASE_SOURCE['password']) \
        .save()
Example #5
def write_log(level_log, program_name, procedure_name, message):
    log_row = namedtuple('log_row', 'TIME_LOG LEVEL_LOG PROGRAM_NAME PROCEDURE_NAME MESSAGE'.split())
    data = log_row(datetime.datetime.today(), level_log, program_name, procedure_name, message)
    result = spark.createDataFrame([data])
    result.write \
        .format('jdbc') \
        .mode('append') \
        .option('driver', DRIVER) \
        .option('url', URL_SOURCE_DB) \
        .option('dbtable', LOG_TABLE_NAME) \
        .option('user', SOURCE_DB_USER_NAME) \
        .option('password', SOURCE_DB_USER_PASSWORD) \
        .save()
Example #6
def load_classifier_model():
    # model = PipelineModel.load("./movie-robot-model")
    # print(model)
    # if model != None:
    #     return model

    data_set = ModelProcessUtil.create_train_vectors()

    df = spark.createDataFrame(data_set)

    df.show()
    nb = NaiveBayes(modelType="bernoulli")
    nb_model = nb.fit(df)
    nb_model.setFeaturesCol("features")
    # nb_model.save("./movie-robot-model")
    nb_model.write().overwrite().save("./movie-robot-model")

    return nb_model
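For completeness, a hedged sketch of reloading the persisted model on a later run (mirroring the commented-out load idea at the top of the snippet); NaiveBayesModel.load is the matching loader for a model saved this way.

from pyspark.ml.classification import NaiveBayesModel

def load_saved_classifier_model():
    # Reload the model written by nb_model.write().overwrite().save(...)
    return NaiveBayesModel.load("./movie-robot-model")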
Example #7
    def createBalanceVector(self):
        # Read the training set
        self.rawdata=pd.read_csv(r'../analyed_data/data_user_balance.csv',index_col='report_date',parse_dates=['report_date'])
        # Column names to drop
        # self.drop_name = list(self.rawdata.columns[2:-10])
        # # Drop the unused columns
        # self.drop_data = self.rawdata.drop(columns=self.drop_name)

        self.data = spark.createDataFrame(self.rawdata)
        # Use VectorAssembler to merge several columns into a single vector column
        # vecAssembler = VectorAssembler(
        #     inputCols=['mfd_daily_yield', 'mfd_7daily_yield', 'Interest_O_N', 'Interest_1_W', 'Interest_2_W',
        #                'Interest_1_M', 'Interest_3_M', 'Interest_6_M', 'Interest_9_M', 'Interest_1_Y'],
        #     outputCol='features')
        vecAssembler = VectorAssembler(
            inputCols=['Interest_3_M', 'Interest_6_M', 'Interest_9_M', 'Interest_1_Y'],
            outputCol='features')
        self.balanceData=vecAssembler.transform(self.data)
Example #8
def setsCreation(multiplier, dataset):
    if dataset in (1, 2):
        source_file = "creditcard_undersampled" + str(dataset)
    else:
        source_file = "creditcard_normalized" + str(dataset - 2)

    datas = []

    # Read the dataset with pandas
    p_df = pd.read_csv(
        str(Path(__file__).parent) + "/CSV_Sources/" + source_file + ".csv")

    # Convert the pandas dataset to a Spark DataFrame, with a number of partitions equal to the square root of the number of rows
    s_df = spark.createDataFrame(p_df).repartition(int(math.sqrt(len(p_df))))

    # Caching is essential when running this code on a cluster of at least 2 machines
    s_df.cache()
    for i in range(0, multiplier):
        datas.append(s_df.rdd.randomSplit([0.7, 0.3], seed=1234))

    #numTrainingData = datas[0][0].count()
    #numTestData = datas[0][1].count()
    #print(str(numTrainingData + numTestData) + " elements were split into " + str(numTrainingData) + " in trainingData and "
    #      + str(numTestData) + " in testData")
    #print("Training indices: " + str(sorted(datas[0][0].map(lambda x: x[31]).collect())))
    #print("#####")
    #print("Test indices: " + str(sorted(datas[0][1].map(lambda x: x[31]).collect())))
    # Create an RDD of LabeledPoint
    # converted_data = s_df.rdd.map(lambda x: LabeledPoint(x[30], x[:30]))

    # Split the data into a training set and a test set
    # (trainingData, testData) = converted_data.randomSplit([0.7, 0.3])

    return datas
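A hedged usage sketch (parameter names follow the function signature above): build three independent 70/30 splits of the first undersampled dataset.

splits = setsCreation(multiplier=3, dataset=1)
trainingData, testData = splits[0]  # each entry is a (training RDD, test RDD) pair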
Example #9
def write_log(level_log, program_name, procedure_name, message):
    """ Function for writing log

    :param level_log: level of logging, can be one of ["INFO", "WARN", "ERROR"];
    :param program_name: script's name;
    :param procedure_name: function's name;
    :param message: text of the log message.
    """
    log_row = namedtuple(
        'log_row',
        'TIME_LOG LEVEL_LOG PROGRAM_NAME PROCEDURE_NAME MESSAGE'.split())
    data = log_row(datetime.datetime.today(), level_log, program_name,
                   procedure_name, message)
    result = spark.createDataFrame([data])
    result.write \
        .format("jdbc") \
        .mode("append") \
        .option("driver", 'oracle.jdbc.OracleDriver') \
        .option("url", DATABASE_SOURCE['url']) \
        .option("dbtable", LOG_TABLE_NAME) \
        .option("user", DATABASE_SOURCE['user']) \
        .option("password", DATABASE_SOURCE['password']) \
        .save()
Example #10
# -*- coding:utf-8 -*-

from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.shell import spark

sentenceData = spark.createDataFrame(
    [(0.0, "Hi I heard about Spark"),
     (0.0, "I wish Java could use case classes"),
     (1.0, "Logistic regression models are neat")], ["label", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

hashingTF = HashingTF(inputCol="words",
                      outputCol="rawFeatures",
                      numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
# alternatively, CountVectorizer can also be used to get term frequency vectors

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

rescaledData.select("label", "features").show()
Example #11
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("OnlineNewsPopularity").getOrCreate()

data = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(
    "OnlineNewsPopularity/OnlineNewsPopularity.csv")

# Standardization

# Define the `input_data`; DenseVector comes from pyspark.ml.linalg
from pyspark.ml.linalg import DenseVector
input_data = data.rdd.map(lambda x: (x[0], DenseVector(x[1:5])))

# Replace `data` with the new DataFrame
data = spark.createDataFrame(input_data, ["label", "features"])

# data.show()

# Import `StandardScaler`
from pyspark.ml.feature import StandardScaler

# Initialize the `standardScaler`
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")

# Fit the DataFrame to the scaler
scaler = standardScaler.fit(data)

# Transform the data in `df` with the scaler
scaled_data = scaler.transform(data)
Example #12
from pyspark.ml.feature import StringIndexer
from pyspark.shell import spark

df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "category"])
df.show()
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
indexed = indexer.fit(df).transform(df)
indexed.show()

def StringIndexerModel(input_col, output_col,input_data):
    indexer = StringIndexer(inputCol=input_col, outputCol=output_col)
    result = indexer.fit(input_data).transform(input_data)
    return result
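A hedged usage sketch of the helper defined above, reusing the df built earlier in the snippet (the output column name is illustrative):

indexed_again = StringIndexerModel("category", "categoryIdx", df)
indexed_again.show()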
Example #13
'''
@author: xieweiwei
'''

from pyspark.ml.feature import VectorIndexer

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

# Prepare training documents from a list of (id, text, label) tuples.
# spark = SparkSession.builder.master("local").appName("Word Count").getOrCreate()
from pyspark.shell import spark

training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0)
], ["id", "text", "label"])

tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)

pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

model = pipeline.fit(training)

test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "spark hadoop spark"),
], ["id", "text"])
Example #14
# -*- coding:utf-8 -*-

from pyspark.ml.feature import Word2Vec

# Input data: Each row is a bag of words from a sentence or document.
from pyspark.shell import spark

documentDF = spark.createDataFrame([
    ("Hi I heard about Spark".split(" "),),
    ("I wish Java could use case classes".split(" "),),
    ("Logistic regression models are neat".split(" "),)
], ["text"])

# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)

result = model.transform(documentDF)
for row in result.collect():
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))
Example #15
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.shell import spark

data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]), ),
        (Vectors.dense([4.0, 5.0, 0.0, 3.0]), ),
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]), ),
        (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]), )]
df = spark.createDataFrame(data, ["features"])

r1 = Correlation.corr(df, "features").head()
print("Pearson correlation matrix:\n" + str(r1[0]))

r2 = Correlation.corr(df, "features", "spearman").head()
print("Spearman correlation matrix:\n" + str(r2[0]))
Example #16
"""
    :author: young
    :DATE: 2019/5/14 14:20
    :copyright: © 2019 young <*****@*****.**>
    :license: None, see LICENSE for more details.
"""

from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

# Prepare training data from a list of (label, features) tuples.
from pyspark.shell import spark

training = spark.createDataFrame([(1.0, Vectors.dense([0.0, 1.1, 0.1])),
                                  (0.0, Vectors.dense([2.0, 1.0, -1.0])),
                                  (0.0, Vectors.dense([2.0, 1.3, 1.0])),
                                  (1.0, Vectors.dense([0.0, 1.2, -0.5]))],
                                 ["label", "features"])

# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)

# Since model1 is a Model (i.e., a transformer produced by an Estimator),
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
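The comment above describes inspecting the parameters the model used during fit(); a hedged completion of the cut-off snippet, using the standard accessor:

print(model1.extractParamMap())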
Example #17
from pyspark.sql import Row
from pyspark.sql.functions import col, concat_ws, lit
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

from dependencies.spark import start_spark
schema = StructType([
    StructField("person.name", StringType(), True),
    StructField("person", StructType([
        StructField("name", StringType(), True),
        StructField("age", IntegerType(), True)]))
])

data = [
    ("charles", Row("chuck", 42)),
    ("lawrence", Row("larry", 73))
]
# spark = SparkSession.builder \
# .master("local") \
# .appName("Word Count") \
# .config("spark.some.config.option", "some-value") \
# .getOrCreate()

# s1 = SparkSession.builder.config("k1", "v1").getOrCreate()
# df = s1.createDataFrame(data, schema)
# df.show()
# df = sqlContext.createDataFrame(data, ["features"])
df = spark.createDataFrame(
    [("china", "asia"), ("colombia", "south america")],
    ["country.name", "continent"]
)
df.show()
Example #18
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.shell import spark
sentenceDataFrame = spark.createDataFrame(
    [(0, "Hi I heard about Spark"), (1, "I wish Java could use case classes"),
     (2, "Logistic,regression,models,are,neat")], ["id", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

regexTokenizer = RegexTokenizer(inputCol="sentence",
                                outputCol="words",
                                pattern="\\W")
# alternatively, pattern="\\w+", gaps(False)

countTokens = udf(lambda words: len(words), IntegerType())

tokenized = tokenizer.transform(sentenceDataFrame)
tokenized.select("sentence", "words") \
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

regexTokenized = regexTokenizer.transform(sentenceDataFrame)
regexTokenized.select("sentence", "words") \
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
Example #19
# Imports assumed for this fragment (spark obtained as in the other snippets here)
import pandas as pd
from pyspark.shell import spark
from pyspark.ml.feature import VectorAssembler, StandardScaler

cols = ('age', 'sex', 'chest pain', 'resting blood pressure',
        'serum cholesterol', 'fasting blood sugar',
        'resting electrocardiographic results', 'maximum heart rate achieved',
        'exercise induced angina',
        'ST depression induced by exercise relative to rest',
        'the slope of the peak exercise ST segment',
        'number of major vessels ', 'thal', 'last')

data = pd.read_csv('heart.dat', delimiter=' ', names=cols)

data = data.iloc[:, 0:13]

data['isSick'] = data['thal'].apply(isSick)

df = spark.createDataFrame(data)

features = ('age', 'sex', 'chest pain', 'resting blood pressure',
            'serum cholesterol', 'fasting blood sugar',
            'resting electrocardiographic results',
            'maximum heart rate achieved', 'exercise induced angina',
            'ST depression induced by exercise relative to rest',
            'the slope of the peak exercise ST segment',
            'number of major vessels ')

assembler = VectorAssembler(inputCols=features, outputCol="features")

raw_data = assembler.transform(df)
raw_data.select("features").show(truncate=False)

standardscaler = StandardScaler().setInputCol("features").setOutputCol("features_scaled")
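A hedged continuation (the original line above was cut off, so the scaled-output column name is a filled-in assumption): fit the scaler and apply it, following the same pattern as the other StandardScaler snippet in this collection.

scaler_model = standardscaler.fit(raw_data)
scaled_data = scaler_model.transform(raw_data)
scaled_data.select("features", "features_scaled").show(truncate=False)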
Example #20
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Prepare training documents, which are labeled.
from pyspark.shell import spark

training = spark.createDataFrame([(0, "a b c d e spark", 1.0), (1, "b d", 0.0),
                                  (2, "spark f g h", 1.0),
                                  (3, "hadoop mapreduce", 0.0),
                                  (4, "b spark who", 1.0), (5, "g d a y", 0.0),
                                  (6, "spark fly", 1.0),
                                  (7, "was mapreduce", 0.0),
                                  (8, "e spark program", 1.0),
                                  (9, "a e c l", 0.0),
                                  (10, "spark compile", 1.0),
                                  (11, "hadoop software", 0.0)],
                                 ["id", "text", "label"])

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
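The snippet is cut off here; a hedged sketch of how the grid and cross-validator are usually built from the pieces above (the grid values and fold count are illustrative):

paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2)
cvModel = crossval.fit(training)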
Example #21
# Imports assumed for this fragment (spark obtained as in the other snippets here)
import numpy as np
import pandas as pd
from pyspark.shell import spark
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS

recipes_recom = []
recom_results = []
rated_results = []

user_index = 1

recipes = np.load('recipes.npy')

ratings = pd.read_csv('ratings.csv')

ratings_pivot = ratings.pivot_table(values='rating', index=['user_id'], columns=['recipe_id'], fill_value=0,
                                    dropna=False)

ratings_values = ratings_pivot.values

ratings = spark.createDataFrame(ratings)

string_indexer1 = StringIndexer(inputCol="user_id", outputCol="user_id_index")
string_indexer2 = StringIndexer(inputCol="recipe_id", outputCol="recipe_id_index")

indexers = [string_indexer1, string_indexer2]

pipeline = Pipeline(stages=indexers)

ratings_final = pipeline.fit(ratings).transform(ratings)

als = ALS(rank=20, maxIter=20, regParam=0.1, userCol="user_id_index", itemCol="recipe_id_index", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(ratings_final)

users_recs = model.recommendForAllUsers(10)
Example #22
# featureIndexer=VectorIndexer(inputCol='features',outputCol='indexedFeatures',maxCategories=4).fit(data)
# (trainingData,testData)=data.randomSplit([0.7,0.3])
# gbt=GBTClassifier(labelCol='indexedLabel',featuresCol='indexedFeatures',maxIter=10)
# pipeline=Pipeline(labelIndexer,featureIndexer,gbt)
# model=pipeline.fit(trainingData)
# predictions=model.transform(testData)
# print(predictions)
#
# data=spark.read.format('libsvm').load('/media/four/four/spark-2.2.0-bin-hadoop2.7/data/mllib/sample_multiclass_classification_data.txt')
# print(data.show())
# splits=data.randomSplit([0.6,0.4],1234)
# train=splits[0]
# test=splits[1]
# layers=[4,5,4,3]
# trainer=MultilayerPerceptronClassifier(maxIter=100,layers=layers,blockSize=128,seed=1234)
# model=trainer.fit(train)
# result=model.transform(test)
# print (result)

# lsvc=LinearSVC(maxIter=10,regParam=0.1)
# lsvcModel=lsvc.fit(data)

from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.shell import spark
data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]), ),
        (Vectors.dense([4.0, 5.0, 0.0, 3.0]), ),
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]), ),
        (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]), )]
df = spark.createDataFrame(data, ['features'])
r1 = Correlation.corr(df, 'features', 'spearman')
print(r1.head())
Example #23
import pandas as pd
import numpy as np
from pyspark.shell import spark

length = 100
names = np.random.choice(['Bob', 'James', 'Marek', 'Johannes', None], length)
amounts = np.random.randint(0, 1000000, length)
country = np.random.choice(
    ['United Kingdom', 'Poland', 'USA', 'Germany', None], length)
df = pd.DataFrame({'name': names, 'amount': amounts, 'country': country})
print(df.head())

# Default RDD partition creation
transactions = spark.createDataFrame(df)
print('Number of partitions: {}'.format(transactions.rdd.getNumPartitions()))
print('Partitioner: {}'.format(transactions.rdd.partitioner))
print('Partitions structure: {}'.format(transactions.rdd.glom().collect()))
print('Total transactions RDD instances: {}'.format(transactions.rdd.count()))

# Re-partitioned increased to 8
repartitioned = transactions.repartition(8)
print('Number of partitions: {}'.format(repartitioned.rdd.getNumPartitions()))
print('Partitions structure: {}'.format(repartitioned.rdd.glom().collect()))
print('Total repartitioned RDD instances: {}'.format(
    repartitioned.rdd.count()))

# Re-partitioning by specifying the column
repartitioned = transactions.repartition('country')
print('Number of partitions: {}'.format(repartitioned.rdd.getNumPartitions()))
print('Partitions structure: {}'.format(repartitioned.rdd.glom().collect()))
print('Total repartitioned RDD instances based on country column: {}'.format(
    repartitioned.rdd.count()))
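A hedged companion sketch (not in the original snippet): coalesce reduces the partition count without a full shuffle, the usual counterpart to the repartition calls shown above.

coalesced = transactions.coalesce(2)
print('Number of partitions after coalesce: {}'.format(coalesced.rdd.getNumPartitions()))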
Example #24
from pyspark.shell import spark
from pyspark.sql import Row

sc = spark.sparkContext

# Load a text file and convert each line to a Row.
lines = sc.textFile("examples/src/main/resources/people.txt")
print('*****************')
print(type(lines))
parts = lines.map(lambda l: l.split(","))
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))
print(type(people))
print(people)
# Infer the schema, and register the DataFrame as a table.
schemaPeople = spark.createDataFrame(people)
schemaPeople.createOrReplaceTempView("people")

# SQL can be run over DataFrames that have been registered as a table.
teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")

# The results of SQL queries are Dataframe objects.
# rdd returns the content as an :class:`pyspark.RDD` of :class:`Row`.
teenNames = teenagers.rdd.map(lambda p: "Name: " + p.name).collect()
for name in teenNames:
    print(name)
Example #25
    # sc = SparkContext(master, "Twitter_Sentiment_Analysis")
    rdd = spark.sparkContext.wholeTextFiles(base)

    def extract_time_and_tweet(line):
        # print(line[len(base):])
        print(line[0])
        a_flume = line[1].split('\n')
        for tweet in a_flume:
            if len(tweet) > 3:
                data = json.loads(tweet)
                return 0.0, data['text'], data["timestamp_ms"]

    text2 = rdd.map(extract_time_and_tweet)
    text2.count()

    rawLabelTweetDataFrame = spark.createDataFrame(
        text2, ["label", "tweets", "time_stamp_ms"])

    regexTokenizer = RegexTokenizer(inputCol="tweets",
                                    outputCol="words",
                                    pattern="\\W")
    tokenized = regexTokenizer.transform(rawLabelTweetDataFrame)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    filteredDataFrame = remover.transform(tokenized).select(
        "label", "filtered", "time_stamp_ms")

    ngram = NGram(n=1, inputCol="filtered", outputCol="ngrams")
    ngramDataFrame = ngram.transform(filteredDataFrame)
    ngramDataFrame.show()

    ngramData = ngramDataFrame.select("label", "ngrams", "time_stamp_ms")
Example #26
from pyspark.ml.fpm import FPGrowth
from pyspark.shell import spark

df = spark.createDataFrame([(0, [1, 2, 5]), (1, [1, 2, 3, 5]), (2, [1, 2])],
                           ["id", "items"])

fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
model = fpGrowth.fit(df)

# Display frequent itemsets.
model.freqItemsets.show()

# Display generated association rules.
model.associationRules.show()

# transform examines the input items against all the association rules and summarizes the
# consequents as the prediction
model.transform(df).show()