Example 1
def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the entire fitted pipeline model is saved to a folder
    return model, words_prediction
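# A minimal sketch (not part of the original example) of reloading the pipeline
# saved by fit_kmeans in a later session; the "./kmeans" path matches the folder
# written above.
def load_kmeans_pipeline(path="./kmeans"):
    from pyspark.ml import PipelineModel
    return PipelineModel.load(path)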
def token(dataframe, in_col, out_col):
    
    tokenizer = Tokenizer(inputCol=in_col, outputCol=out_col)
    dataframe = tokenizer.transform(dataframe)
    
    dataframe.printSchema()
    
    return dataframe
def textPredict(request):
    """6.文本聚类,热度预测"""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    """处理数据集,生成特征向量"""
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)
    """决策树模型培训"""
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)
    """模型测试"""
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)
    """用户数据测试,单个新闻测试"""
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    """模型评估"""
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)

    sc.stop()
    return render(request,{'resultList':resultList})
def main():
    spark = SparkSession.builder.appName("DBPediaSpark").getOrCreate()

    args = getResolvedOptions(sys.argv, ['S3_INPUT_BUCKET',
                                         'S3_INPUT_KEY_PREFIX',
                                         'S3_OUTPUT_BUCKET',
                                         'S3_OUTPUT_KEY_PREFIX',
                                         'S3_MODEL_BUCKET',
                                         'S3_MODEL_KEY_PREFIX'])

    # This is needed to save RDDs, which is the only way to write nested DataFrames to CSV format
    spark.sparkContext._jsc.hadoopConfiguration().set("mapred.output.committer.class",
                                                      "org.apache.hadoop.mapred.FileOutputCommitter")
    
    # Defining the schema corresponding to the input data. The input data does not contain the headers
    schema = StructType([StructField("label", IntegerType(), True), 
                         StructField("title", StringType(), True), 
                         StructField("abstract", StringType(), True)])
    
    # Download the data from S3 into two separate Dataframes
    traindf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                                   'train.csv')), header=False, schema=schema, encoding='UTF-8')
    validationdf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                                          'test.csv')), header=False, schema=schema, encoding='UTF-8')

    # Tokenize the abstract column which contains the input text
    tokenizer = Tokenizer(inputCol="abstract", outputCol="tokenized_abstract")

    # Save transformed training data to CSV in S3 by converting to RDD.
    transformed_traindf = tokenizer.transform(traindf)
    transformed_train_rdd = transformed_traindf.rdd.map(lambda x: (x.label, x.tokenized_abstract))
    lines = transformed_train_rdd.map(csv_line)
    lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'train'))

    # Similar data processing for validation dataset.
    transformed_validation = tokenizer.transform(validationdf)
    transformed_validation_rdd = transformed_validation.rdd.map(lambda x: (x.label, x.tokenized_abstract))
    lines = transformed_validation_rdd.map(csv_line)
    lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'validation'))

    # Serialize the tokenizer via MLeap and upload to S3
    SimpleSparkSerializer().serializeToBundle(tokenizer, "jar:file:/tmp/model.zip", transformed_validation)

    # Unzip as SageMaker expects a .tar.gz file but MLeap produces a .zip file.
    import zipfile
    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")

    # Write back the content as a .tar.gz file
    import tarfile
    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname='bundle.json')
        tar.add("/tmp/model/root", arcname='root')

    s3 = boto3.resource('s3')
    file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz')
    s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz', file_name)
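# The csv_line helper used above is not shown in this example. A plausible
# minimal sketch, assuming each record is a (label, tokenized_abstract) tuple
# whose token list should be flattened into a comma-separated line:
def csv_line(data):
    # data[0] is the integer label, data[1] is the list of tokens
    return str(data[0]) + "," + ",".join(str(token) for token in data[1])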
Example 5
def run_tf_idf_spark_ml(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures)
    featurizedData = hashingTF.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)

    return idfModel.transform(featurizedData)
def predictLabel(label,title,model):
    """预测新闻的标签"""
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    return myprediction
def create_features(raw_data):
    #Create DataFrame
    data_df = sqlContext.createDataFrame(raw_data.map(lambda r : Row(appid=r[0], price=r[1], sentence=r[2])))
    #Transform sentence into words
    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    words_df = tokenizer.transform(data_df)
    #Calculate term frequency
    hashingTF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=5)
    featurized_df = hashingTF.transform(words_df)
    #Calculate inverse document frequency
    idf = IDF(inputCol='rawFeatures', outputCol='features')
    idfModel = idf.fit(featurized_df)
    return idfModel.transform(featurized_df)
def preprocessing_titles(path,name):
    query = preprocessData(path)
    tokenizer = Tokenizer(inputCol="title", outputCol="tokenized_title")
    wordsData = tokenizer.transform(query)
    #after Stopword removal
    remover = StopWordsRemover(inputCol="tokenized_title", outputCol="filtered")
    wordsData= remover.transform(wordsData)
    
    df = wordsData.rdd.map(lambda x: x['id']).zipWithUniqueId().toDF(["id", "index"])
    df.registerTempTable("indices")
    wordsData.registerTempTable("words")
    
    qr = sqlContext.sql("SELECT index,words.id,filtered FROM indices JOIN words ON words.id = indices.id")
    if name!='':
        exportOnS3(qr,"s3a://redit-preprocessed/",name)
    qr = qr.rdd.map(lambda row: (row['index'], row['id'], row['filtered']))
Example 9
    def _build_stages(self):
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
        self.hashing_tf = HashingTF(inputCol=self.tokenizer.getOutputCol(), outputCol="raw_features")
        self.idf_model = IDF(inputCol=self.hashing_tf.getOutputCol(), outputCol="features")
        self.lr = LogisticRegression(maxIter=10, regParam=0.01)
        return [self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr]
Example 10
    def getPipeline(self, df):
        # notify pipeline 
        self.success('Initializing ML Pipeline ...')

        # initialize our tokenizer, we're going to tokenize features
        tokenizer = Tokenizer(inputCol='tag_features', outputCol='words')
        # convert the tokenize data to vectorize data
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='features')
        # initialize logistic regression algorithm
        lr        = LogisticRegression(maxIter=10, regParam=0.01)
        # create / initialize the ml pipeline
        pipeline  = Pipeline(stages=[tokenizer, hashingTF, lr])

        # fit the pipeline on our training dataframe
        model = pipeline.fit(df)

        return model
Example 11
def get_top_words(dataset, signatures):
    # TODO: Use stemmers for the languages supported by http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft Translate API.
    sentenceData = dataset.filter(dataset['user_comments'].isNotNull() & (dataset['useragent_locale'].isNull() | (functions.instr(dataset['useragent_locale'], 'en') == 1)))

    if sentenceData.rdd.isEmpty():
        return dict()

    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)

    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(lambda p: (p['signature'], list(set(p['words'])))).reduceByKey(lambda x, y: x + y).toDF(['signature', 'words'])

    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()

    # Clean comment words by removing punctuation and stemming.
    def clean_word(w):
        return re.sub('\,|\.|\;|\:|\;|\?|\!|\[|\]|\}|\{|\/|\\\\', '', stem(w.lower()))

    wordsData = wordsData.rdd.map(lambda p: (p['signature'], [clean_word(w) for w in p['words']])).toDF(['signature', 'words'])

    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)

    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)

    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    bests_per_doc = rescaledData.filter(rescaledData.signature.isin(signatures)).rdd.map(lambda p: (p['signature'], sorted(zip(p['tfidf_features'].indices, p['tfidf_features'].values), key=lambda i: i[1], reverse=True)[:10])).collect()

    return dict([(signature, [model.vocabulary[best] for best, val in bests]) for signature, bests in bests_per_doc])
def main():
    '''
    takes one input argument :: Location of the directory for training and test data files.
    :return: Print output on console for the area under the ROC curve.
    '''

    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)

    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet("20news_train.parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training data.
    model = pipeline.fit(trainDF)

    numFeatures = (1000, 5000, 10000)
    regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, numFeatures).addGrid(lr.regParam, regParam).build()


    cv = CrossValidator().setEstimator(pipeline).setEvaluator(BinaryClassificationEvaluator()).setEstimatorParamMaps(paramGrid).setNumFolds(2)

    # Evaluate the model on testing data
    testDF = sqlCt.read.parquet("20news_test.parquet")
    prediction = model.transform(testDF)
    evaluator = BinaryClassificationEvaluator()


    model_cv = cv.fit(trainDF)
    prediction_cv = model_cv.transform(testDF)
    print(evaluator.evaluate(prediction))
    print(evaluator.evaluate(prediction_cv))
Example 13
def run_tf_idf_spark_mllib(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    words = wordsData.select("words").rdd.map(lambda x: x.words)

    hashingTF = MllibHashingTF(numFeatures)
    tf = hashingTF.transform(words)
    tf.cache()

    idf = MllibIDF().fit(tf)
    tfidf = idf.transform(tf)

    # @TODO make this nicer
    tmp = sqlContext.createDataFrame(wordsData.rdd.zip(tfidf), ["data", "features"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, wordsData.columns))
    with_features = sqlContext.sql("SELECT %s, features FROM tmp" % old_columns)
    tmp = sqlContext.createDataFrame(with_features.rdd.zip(tf), ["data", "rawFeatures"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, with_features.columns))
    return sqlContext.sql("SELECT %s, rawFeatures FROM tmp" % old_columns)
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)

    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)

    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
Example 15
class BaselinePipelineEngine(PipelineEngine):
    @keyword_only
    def __init__(self, cv):
        super(BaselinePipelineEngine, self).__init__(cv)
        self.hashing_tf_map = [pow(2, 20)]
        self.lr_map = [0.1, 0.01]
        self.stages = self._build_stages()
        self.pipeline = Pipeline(stages=[self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr])
        self.param_grid = self._build_param_grid()

    def _build_stages(self):
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
        self.hashing_tf = HashingTF(inputCol=self.tokenizer.getOutputCol(), outputCol="raw_features")
        self.idf_model = IDF(inputCol=self.hashing_tf.getOutputCol(), outputCol="features")
        self.lr = LogisticRegression(maxIter=10, regParam=0.01)
        return [self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr]

    def _build_param_grid(self):
        param_grid_builder = ParamGridBuilder()
        param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
        param_grid_builder.addGrid(self.lr.regParam, self.lr_map)
        return param_grid_builder.build()
Example 16
path = './txt_p'
files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
filecontent = len(files)
dataset = []
cont = 0
for f in files:
    j = os.path.join(path, f)
    with open(j, 'r') as myfile:
        data = myfile.read().replace('\n', '')
        cont = cont + 1
        dataset.append((cont, f, data))

rdd = sc.parallelize(dataset)
schemaData = rdd.map(lambda x: Row(num=x[0], title=x[1], text=x[2]))
dataFrame = sqlContext.createDataFrame(schemaData)
tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(dataFrame)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("title", "features").show()
# Normalization and transformation of the matrix
normalizer = Normalizer(inputCol="features", outputCol="norm")
data = normalizer.transform(rescaledData)

# Similarity computation using the norm and the dot product
mat = IndexedRowMatrix(
    data.select("num", "norm")\
        .rdd.map(lambda row: IndexedRow(row.num, row.norm.toArray()))).toBlockMatrix()
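
# A minimal sketch (not in the original snippet) of reading off pairwise cosine
# similarities: the rows were L2-normalized above, so mat * mat^T holds the dot
# products, i.e. the cosine scores.
similarities = mat.multiply(mat.transpose())
# Collect back as (row index, dense similarity vector) pairs
sim_rows = similarities.toIndexedRowMatrix().rows.map(lambda r: (r.index, r.vector))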
def test_gen_estimator_metadata(spark_session):  # pylint: disable=unused-argument
    tokenizer1 = Tokenizer(inputCol="text1", outputCol="words1")
    hashingTF1 = HashingTF(inputCol=tokenizer1.getOutputCol(),
                           outputCol="features1")

    tokenizer2 = Tokenizer(inputCol="text2", outputCol="words2")
    hashingTF2 = HashingTF(inputCol=tokenizer2.getOutputCol(),
                           outputCol="features2")

    vecAssembler = VectorAssembler(inputCols=["features1", "features2"],
                                   outputCol="features")

    lor = LogisticRegression(maxIter=10)
    ova = OneVsRest(classifier=lor)
    sub_pipeline1 = Pipeline(stages=[tokenizer1, hashingTF1])
    sub_pipeline2 = Pipeline(stages=[tokenizer2, hashingTF2])
    sub_pipeline3 = Pipeline(stages=[vecAssembler, ova])

    paramGrid = (ParamGridBuilder().addGrid(lor.maxIter, [10, 20]).addGrid(
        lor.regParam, [0.1, 0.01]).build())
    eva = MulticlassClassificationEvaluator()
    crossval = CrossValidator(estimator=sub_pipeline3,
                              estimatorParamMaps=paramGrid,
                              evaluator=eva,
                              numFolds=2)

    top_pipeline = Pipeline(stages=[sub_pipeline1, sub_pipeline2, crossval])

    metadata = _gen_estimator_metadata(top_pipeline)

    expected_hierarchy = {
        "name":
        "Pipeline_1",
        "stages": [
            {
                "name": "Pipeline_2",
                "stages": [{
                    "name": "Tokenizer_1"
                }, {
                    "name": "HashingTF_1"
                }]
            },
            {
                "name": "Pipeline_3",
                "stages": [{
                    "name": "Tokenizer_2"
                }, {
                    "name": "HashingTF_2"
                }]
            },
            {
                "name": "CrossValidator",
                "evaluator": {
                    "name": "MulticlassClassificationEvaluator"
                },
                "tuned_estimator": {
                    "name":
                    "Pipeline_4",
                    "stages": [
                        {
                            "name": "VectorAssembler"
                        },
                        {
                            "name": "OneVsRest",
                            "classifier": {
                                "name": "LogisticRegression"
                            }
                        },
                    ],
                },
            },
        ],
    }
    assert metadata.hierarchy == expected_hierarchy
    assert metadata.uid_to_indexed_name_map == {
        top_pipeline.uid: "Pipeline_1",
        sub_pipeline1.uid: "Pipeline_2",
        tokenizer1.uid: "Tokenizer_1",
        hashingTF1.uid: "HashingTF_1",
        sub_pipeline2.uid: "Pipeline_3",
        tokenizer2.uid: "Tokenizer_2",
        hashingTF2.uid: "HashingTF_2",
        crossval.uid: "CrossValidator",
        sub_pipeline3.uid: "Pipeline_4",
        vecAssembler.uid: "VectorAssembler",
        ova.uid: "OneVsRest",
        lor.uid: "LogisticRegression",
        eva.uid: "MulticlassClassificationEvaluator",
    }
    assert (metadata.uid_to_indexed_name_map[
        metadata.param_search_estimators[0].uid] == "CrossValidator")
Example 18
df_news = sqlContext.sql("SELECT Date, Top1,Top2,Top25 FROM combined_news_djia_csv")

num_word_features = 2000

#news data only goes to july 2016
df_news = sqlContext.sql("SELECT * FROM combined_news_djia_csv")
df_news = df_news.select("Date",concat(col("Top1"), lit(" "), col("Top2"), lit(" "), col("Top3"), lit(" "), col("Top4"), lit(" "), col("Top5"), lit(" "), col("Top6"), lit(" "), col("Top7"), lit(" "), col("Top8"), lit(" "), col("Top9"), lit(" "), col("Top10"), lit(" "), col("Top11"), lit(" "), col("Top12"), lit(" "), col("Top13"), lit(" "), col("Top14"), lit(" "), col("Top15"), lit(" "), col("Top16"), lit(" "), col("Top17"), lit(" "), col("Top18"), lit(" "), col("Top19"), lit(" "), col("Top20"), lit(" "), col("Top21"), lit(" "), col("Top22"), lit(" "), col("Top23"), lit(" "), col("Top24"), lit(" "), col("Top25")).alias("all_text_dirty"))

df_news = df_news.withColumn("all_text_1",regexp_replace(col("all_text_dirty"), "['\"]", ""))
df_news = df_news.withColumn("all_text",expr("substring(all_text_1, 2, length(all_text_1)+1)"))


df_news = df_news.dropna()

tokenizer = Tokenizer(inputCol="all_text", outputCol="words")
wordsData = tokenizer.transform(df_news)

remover = StopWordsRemover(inputCol="words", outputCol="wordsFil")
wordsDataFil = remover.transform(wordsData)

hashingTF = HashingTF(inputCol="wordsFil", outputCol="rawFeatures", numFeatures=num_word_features)
featurizedData = hashingTF.transform(wordsDataFil)
# alternatively, CountVectorizer can also be used to get term frequency vectors

idf = IDF(inputCol="rawFeatures", outputCol="news_features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

#df_news = rescaledData.select("Date","news_features")
Example 19
#Import csv for training data
start_data = spark.read.format("csv").option("header", "true").load("data/sepsis.csv")
#DATA CLEANUP
#Remove NULLs
start_data = start_data.na.drop(subset=["CATEGORY","COMMENT"])
#Filter to ensure that category is pulled in correctly
start_data = start_data.filter(start_data['CATEGORY'].isin('include','exclude'))

#BUILD FEATURES
# Create a length column to be used as a future feature 
from pyspark.sql.functions import length
data = start_data.withColumn('length', length(start_data['COMMENT']))

# Create all the features to the data set
include_exclude_to_num = StringIndexer(inputCol='CATEGORY',outputCol='label')
tokenizer = Tokenizer(inputCol="COMMENT", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

# Create feature vectors
# See https://spark.apache.org/docs/latest/ml-features.html#vectorassembler
# This just creates a new, single vector of features that is the concatenation
# of the TF-IDF data and the length of the comment
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

#DATA PROCESSING PIPELINE
# Create and run a data processing Pipeline
# See https://spark.apache.org/docs/latest/ml-pipeline.html#pipeline
data_prep_pipeline = Pipeline(stages=[include_exclude_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])
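
# A minimal usage sketch (not part of the original snippet): fit the prep
# pipeline on the cleaned data and materialize the feature vectors.
cleaner = data_prep_pipeline.fit(data)
clean_sepsis_data = cleaner.transform(data)
clean_sepsis_data.select('label', 'features').show(5, truncate=False)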
Example 20
# removed stop words
# applied the hashing trick
# converted the data from counts to IDF and
# trained a linear regression model.
# Each of these steps was done independently. This seems like a great application for a pipeline!

# Instructions
# 100 XP
# Create an object for splitting text into tokens.
# Create an object to remove stop words. Rather than explicitly giving the input column name, use the getOutputCol() method on the previous object.
# Create objects for applying the hashing trick and transforming the data into a TF-IDF. Use the getOutputCol() method again.
# Create a pipeline which wraps all of the above steps as well as an object to create a Logistic Regression model.

from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                           outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])
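
# A minimal usage sketch. The DataFrame name sms_df and its 'text'/'label'
# columns are assumptions; the exercise's training data is not shown here.
sms_model = pipeline.fit(sms_df)            # sms_df is hypothetical
sms_scored = sms_model.transform(sms_df)
sms_scored.select('label', 'prediction', 'probability').show(5)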
Example 21
def sentence_data(df_data):
    df_data2 = df_data.select(df_data._id,
                              removepunctuations(df_data.text_entry))
    only_words = Tokenizer(inputCol='textentry', outputCol="words")
    df_data3 = only_words.transform(df_data2)
    return df_data3
Example 22
	def trainModel(self):
		
		logger.info("Training the model...")		

		query = '''select page_id, max(page_title) as page_title from cooladata where date_range(all) and page_id is not null group by page_id;'''

		def SQLtoURL(query):
			data = query.replace('\n', ' ').replace('\t',' ').replace('   ',' ').replace('  ',' ')
			return data

		def QueryXXXXX(query, file = None):
			session = Session()
			response = session.post(data = {'tq': query,}, url = 'https://app.XXXXXX.com/api/v2/projects/115659/cql/', headers = {'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},)
			return response.content

		table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)),'utf-8'))['table']
		title_list = [x['c'] for x in table['rows']]
		table_cols = [d['label'] for d in table['cols']]  
		def convert_row(row):
			rowlist = [d['v'] for d in row]
			return rowlist

		rd = self.sc.parallelize(title_list).map(convert_row)
		titleData = self.spark.createDataFrame(rd, table_cols)
		titleData = titleData.dropna()
		
		hebrew_stopwords = stop_words()
		def rmv(words):
			for punc in punctuation:
				words = words.replace(punc,"")
			for hword in hebrew_stopwords:
				words = words.replace(hword, " ")
			return words

		self.spark.udf.register("rmv", rmv, StringType())
		titleData.registerTempTable("wordstable")
		cleanedSentenceData = self.spark.sql("select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable")
		tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words")
		wordsData = tokenizer.transform(cleanedSentenceData)

		cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF = 2.0)
		cvModel = cv.fit(wordsData)
		featurizedData = cvModel.transform(wordsData)

		idf = IDF(inputCol="rawFeatures", outputCol="features")
		idfModel = idf.fit(featurizedData)
		rescaledData = idfModel.transform(featurizedData)

		lda = LDA(k=100)
		ldaModel = lda.fit(rescaledData)
		postFactorizedData = ldaModel.transform(rescaledData)

		norm = Normalizer(inputCol = "topicDistribution", outputCol="normTopicDist")
		scaledFactorizedNormalizedData = norm.transform(postFactorizedData)
		
		self.model = scaledFactorizedNormalizedData
		
		logger.info("model is built!")
Example 23
def main():

    set_pandas_options()
    app_name = "Case Study 2: Email Analytics"

    conf = SparkConf().setAppName(app_name)
    conf = (conf.setMaster('local[*]').set(
        "spark.driver.host",
        "localhost").set('spark.executor.memory',
                         '4G').set('spark.driver.memory',
                                   '8G').set('spark.driver.maxResultSize',
                                             '10G'))
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)

    log4jLogger = sc._jvm.org.apache.log4j
    LOGGER = log4jLogger.LogManager.getLogger(__name__)
    LOGGER.info("pyspark script logger initialized")

    # 1 Load data into Spark DataFrame
    LOG = get_hdfs_filepath('*/*/*')

    # read text file
    log_txt_df = sc.wholeTextFiles(LOG).filter(lambda line: line != '').toDF()
    # Convert strings to columns
    udf1 = udf(to_utc_timestamp, TimestampType())
    df = log_txt_df
    df = df.select(df._2.alias('line'))
    udf1 = udf(to_utc_timestamp, TimestampType())
    temp = df.select(
        regexp_extract(col('line'), r'Message-ID:\s<.*>',
                       0).alias('Message_ID'),
        regexp_extract(
            col('line'),
            r'\d{1,2}\s\w{3}\s\d{4}\s\d{2}:\d{2}:\d{2}\s(\+|\-)\d{4}(.*)',
            0).alias("Date"),
        regexp_extract(col('line'), r'From:\s(.*)', 0).alias("From"),
        regexp_extract(
            col('line'),
            r"To:\s(.+)((?:\n|\r\n?)((?:(?:\n|\r\n?).+)+)){0,}(\S+@\S+)(?:\n|\r\n?)Subject:\s",
            0).alias("To"),
        regexp_extract(
            col('line'),
            r"Subject:\s(.+)((?:\n|\r\n?)((?:(?:\n|\r\n?).+)+)){0,}",
            1).alias("Subject"),
        regexp_extract(
            col('line'),
            r"Cc:\s(.+)((?:\n|\r\n?)((?:(?:\n|\r\n?).+)+)){0,}(?:\n|\r\n?)Mime-Version:\s",
            0).alias("Cc"),
        regexp_extract(col('line'), r'Mime-Version:\s(.+)',
                       1).alias("Mime_Version"),
        regexp_extract(col('line'), r'Content-Type:\s(.*)',
                       1).alias("Content_Type"),
        regexp_extract(col('line'), r"Content-Transfer-Encoding:\s(.+)",
                       1).alias("Content_Transfer_Encoding"),
        regexp_extract(col('line'), r"X-From:\s(.*)(?:\n|\r\n?)X-To:\s",
                       0).alias("X_From"),
        regexp_extract(col('line'), r'X-To:\s(.*)(?:\n|\r\n?)X-cc:\s',
                       0).alias("X_To"),
        regexp_extract(col('line'), r'X-cc:\s(.*)(?:\n|\r\n?)X-bcc:\s',
                       0).alias("X_cc"),
        regexp_extract(col('line'), r'X-bcc:\s(.*)(?:\n|\r\n?)X-Folder:\s',
                       0).alias("X_bcc"),
        regexp_extract(col('line'), r'X-Folder:\s(.*)(?:\n|\r\n?)X-Origin:\s',
                       0).alias("X_Folder"),
        regexp_extract(col('line'),
                       r"X-Origin:\s(.*)(?:\n|\r\n?)X-FileName:\s",
                       0).alias("X_Origin"),
        regexp_extract(col('line'), r"X-FileName:\s(.*)",
                       0).alias("X_FileName"),
        regexp_extract(
            col('line'),
            r"X-FileName:\s(.*)((?:\n|\r\n?){1,}(.*)){1,}((?:(?:\n|\r\n?).+)+)",
            0).alias("FYI"))
    #temp.cache()
    temp1 = temp.select(
        expr("substring(Message_ID, 14, length(Message_ID)-14)").alias(
            "Message_ID"), 'Date',
        udf1('Date').alias('UTC_timestamp'),
        expr("substring(From, 7, length(From)-6)").alias("From"),
        expr("substring(To, 5, length(To)-15)").alias("To"), "Subject",
        expr("substring(Cc, 5, length(Cc)-20)").alias("Cc"), "Mime_Version",
        "Content_Type", 'Content_Transfer_Encoding',
        expr("substring(X_From, 9, length(X_From)-16)").alias("X_From"),
        expr("substring(X_To, 7, length(X_To)-14)").alias("X_To"),
        expr("substring(X_cc, 7, length(X_cc)-15)").alias("X_cc"),
        expr("substring(X_bcc, 8, length(X_bcc)-19)").alias("X_bcc"),
        expr("substring(X_Folder, 11, length(X_Folder)-22)").alias("X_Folder"),
        expr("substring(X_Origin, 11, length(X_Origin)-24)").alias("X_Origin"),
        expr("substring(X_FileName, 13, length(X_FileName)-15)").alias(
            "X_FileName"),
        regexp_replace(
            col('FYI'),
            r"(X-FileName:\s(.*)(?:\n|\r\n?){1,})|(-*Original Message-*(.*)((?:\n|\r\n?){1,}(.*)){0,}((?:(?:\n|\r\n?).+)+))",
            '').alias('FYI'))
    #temp1.cache()
    result = temp1.select(
        "Message_ID", 'Date', 'UTC_timestamp', "From",
        regexp_replace(col('To'), r"\r\n\t", "").alias("To"), "Subject",
        regexp_replace(col('Cc'), r"\r\n\t", "").alias("Cc"), "Mime_Version",
        "Content_Type", 'Content_Transfer_Encoding', "X_From", "X_To", "X_cc",
        "X_bcc", "X_Folder", "X_Origin", "X_FileName",
        regexp_replace(col('FYI'), r"(^\s{1,})|(\n{2,})", '').alias('FYI'))
    zz = result.limit(5).toPandas()
    LOGGER.info(
        "\n\n1.\tLoad data into Spark DataFrame\tDone!\n\n{}\n".format(zz))

    # 2 Display the top 10 high-frequency users based on weekly numbers of emails sent
    df1 = result
    freq = df1.groupBy('From').agg(
        (count('UTC_timestamp') /
         ((max(unix_timestamp(col('UTC_timestamp'))) -
           min(unix_timestamp(col('UTC_timestamp')))) /
          604800)).alias('rate_per_week')).orderBy("rate_per_week",
                                                   ascending=False)
    zz = freq.limit(10).toPandas()
    LOGGER.info(
        "\n\n2.\tDisplay the top 10 high-frequency users based on weekly numbers of emails sent\tDone!\n\n{}\n"
        .format(zz))

    # 3a Extract top 20 keywords from the subject text for the top 10 high-frequency users
    top = freq.limit(10)
    top_subj = df1.join(top, df1["From"] == top["From"],
                        "inner").select(df1['From'], df1['Subject'])
    top_texts = top_subj.groupBy("From").agg(
        concat_ws(" ", collect_list("Subject")).alias("texts"))
    top_texts = top_texts.select('texts').agg(
        concat_ws(" ", collect_list("texts")).alias("subjects"))
    # Extract word
    from pyspark.ml.feature import Tokenizer
    tokenizer = Tokenizer().setInputCol("subjects").setOutputCol("words")
    transformed = tokenizer.transform(top_texts)
    # Extend the stop-word dictionary by adding your own custom stop words
    # Remove stopwords
    # custom stopwords
    stopwords = StopWordsRemover().getStopWords() + ["-", "re:", "", "fw"]
    remover = StopWordsRemover().setStopWords(stopwords).setInputCol(
        "words").setOutputCol("filtered")
    cleaned = remover.transform(transformed)
    # Extract top 20 keywords by identifying and removing the common stop words
    # Generate features
    from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
    cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol(
        "features").fit(cleaned)
    featured = cvmodel.transform(cleaned)
    counts = featured.select('features').collect()
    a = cvmodel.vocabulary
    b = counts[0]['features'].values
    d = {'words': a, 'counts': b}
    df = pd.DataFrame(d)
    zz = df.head(20)
    LOGGER.info(
        "\n\n3a.\tExtract top 20 keywords from the subject text for the top 10 high-frequency users\tDone!\n\n{}\n"
        .format(zz))
    # 3b Extract top 20 keywords from the subject text for the non-high frequency users
    w = Window().orderBy(lit('A'))
    bottom = freq.orderBy("rate_per_week",
                          ascending=False).withColumn("row_num",
                                                      row_number().over(w))
    bottom = bottom.where(col('row_num') > 10).select('From', 'rate_per_week')
    bottom_subj = df1.join(bottom, df1["From"] == bottom["From"],
                           "inner").select(df1["From"], df1["Subject"])
    bottom_texts = bottom_subj.groupBy("From").agg(
        concat_ws(" ", collect_list("Subject")).alias("texts"))
    bottom_texts = bottom_texts.select('texts').agg(
        concat_ws(" ", collect_list("texts")).alias("subjects"))
    # Extract word
    tokenizer = Tokenizer().setInputCol("subjects").setOutputCol("words")
    transformed = tokenizer.transform(bottom_texts)
    # Remove stopwords
    # custom stopwords
    stopwords = StopWordsRemover().getStopWords() + [
        "-", "re:", "fw:", "", "&"
    ]
    remover = StopWordsRemover().setStopWords(stopwords).setInputCol(
        "words").setOutputCol("filtered")
    cleaned = remover.transform(transformed)
    # Generate features
    cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol(
        "features").fit(cleaned)
    featured = cvmodel.transform(cleaned)
    counts = featured.select('features').collect()
    a = cvmodel.vocabulary
    b = counts[0]['features'].values
    d = {'words': a, 'counts': b}
    df = pd.DataFrame(d)
    zz = df.head(20)
    LOGGER.info(
        "\n\n3b.\tExtract top 20 keywords from the subject text for the non-high frequency users\tDone!\n\n{}\n"
        .format(zz))

    # 6 Introduce a new column label to identify new, replied, and forwarded messages
    df = result

    def to_label(sbj):
        l1 = "RE" if sbj.startswith("RE:") else (
            "FW" if sbj.startswith("FW:") else 'NEW')
        return l1

    udf2 = udf(to_label, StringType())
    df_with_label = df.withColumn('label', udf2("Subject"))
    zz = df_with_label.limit(5).toPandas()
    LOGGER.info(
        "\n\n6.\tIntroduce a new column label to identify new, replied, and forwarded messages\tDone!\n\n{}\n"
        .format(zz))

    # 7 Get the trend of the overall mail activity using a Spark pivot table
    pivotDF = df_with_label.groupBy(
        year("UTC_timestamp").alias('year'),
        month("UTC_timestamp").alias('month')).pivot("label").count().orderBy(
            "year", "month")
    zz = pivotDF.na.fill(0).toPandas()
    LOGGER.info(
        "\n\n7.\tGet the trend of the over mail activity using the pivot table from spark itself\tDone!\n\n{}\n"
        .format(zz))

    # 8 Use k-means clustering to create 4 clusters from the extracted keywords
    raw = result.select("Message_ID", "From", "Subject")
    # Extract word
    from pyspark.ml.feature import Tokenizer
    tokenizer = Tokenizer().setInputCol("Subject").setOutputCol("words")
    transformed = tokenizer.transform(raw)
    # Remove stopwords
    # custom stopwords
    stopwords = StopWordsRemover().getStopWords() + [
        "-", "re:", "fw:", "", "&"
    ]
    remover = StopWordsRemover().setStopWords(stopwords).setInputCol(
        "words").setOutputCol("filtered")
    cleaned = remover.transform(transformed)
    cleaned = cleaned.select("Message_ID", "words", "filtered")
    # Generate features
    from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
    cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol(
        "features").fit(cleaned)
    featured = cvmodel.transform(cleaned)
    kmeans = KMeans(k=4, seed=1)  # 4 clusters here
    model = kmeans.fit(featured.select('features'))
    transformed = model.transform(featured)
    zz = transformed.limit(5).toPandas()
    LOGGER.info(
        "\n\n8.\tUse k-means clustering to create 4 clusters from the extracted keywords\tDone!\n\n{}\n"
        .format(zz))

    # 9 Use LDA to generate 4 topics from the extracted keywords
    LOGGER.info(
        "\n\n9.\tUse LDA to generate 4 topics from the extracted keywords\tDone!\n\n{}\n{}\n{}\n{}\n"
        .format(get_topic(0, transformed), get_topic(1, transformed),
                get_topic(2, transformed), get_topic(3, transformed)))
Example 24
    #print "loading 20 newsgroups dataset..."
    tic = time()
    dataset = fetch_20newsgroups(shuffle=True, random_state=0, remove=('headers','footers','quotes'))
    train_corpus = dataset.data  # a list of 11314 documents / entries
    toc = time()
    print ("elapsed time: %.4f sec" %(toc - tic)   ) 

    #distribute data
    corpus_rdd = sc.parallelize(train_corpus)
    corpus_rdd = corpus_rdd.map(lambda doc: re.sub(r"[^A-Za-z]", " ", doc))
    corpus_rdd = corpus_rdd.map(lambda doc: u"".join(doc).encode('utf-8').strip())

    rdd_row = corpus_rdd.map(lambda doc: Row(raw_corpus=str(doc)))
    newsgroups = spark.createDataFrame(rdd_row)

    tokenizer = Tokenizer(inputCol="raw_corpus", outputCol="tokens")
    newsgroups = tokenizer.transform(newsgroups)
    newsgroups = newsgroups.drop('raw_corpus')       

    stopwords = StopWordsRemover(inputCol="tokens", outputCol="tokens_filtered")
    newsgroups = stopwords.transform(newsgroups)
    newsgroups = newsgroups.drop('tokens')

    count_vec = CountVectorizer(inputCol="tokens_filtered", outputCol="tf_features", vocabSize=num_features, minDF=2.0)
    count_vec_model = count_vec.fit(newsgroups)
    vocab = count_vec_model.vocabulary
    newsgroups = count_vec_model.transform(newsgroups)
    newsgroups = newsgroups.drop('tokens_filtered')

    #hashingTF = HashingTF(inputCol="tokens_filtered", outputCol="tf_features", numFeatures=num_features)
    #newsgroups = hashingTF.transform(newsgroups)
Example 25
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
import numpy as np

# Remove rows with null values
reviewText_data = csv_data.select('reviewText')
reviewText_data = reviewText_data.na.drop()

# Register a 'function' to clean text
cleantext = spark.udf.register("cleantext", clean_text)

# Cleaned reviewText data
clean_reviewText_data = reviewText_data.select(cleantext("reviewText").alias("reviewText"))

# Convert sentences into discrete words
tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")

# Calculate term frequency for each word
tf = CountVectorizer(inputCol="words", outputCol="tf", vocabSize=2**6, minDF=0.05, minTF=1)

# Calculate IDF given the term frequency
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=1) #minDocFreq: remove sparse terms

# Fit the cleaned reviewText data through the pipeline
pipeline = Pipeline(stages=[tokenizer, tf, idf])
pipelineFit = pipeline.fit(clean_reviewText_data)
train_df = pipelineFit.transform(clean_reviewText_data)

# Save TF_IDF as text files in datanodes
train_df.rdd.saveAsTextFile("hdfs://ec2-34-239-131-131.compute-1.amazonaws.com:9000/output15/")
Example 26
def lower_text(line):
    word_list=re.findall('[\w_]+', line.lower())
    return ' '.join(map(str, word_list))

filter_data_withColumn = filter_data.withColumn("text_lower", udf(lower_text, StringType())("Text")).select('text_lower','Score')

#Showing the result
filter_data_withColumn.show(15)


# # Tokenize

# In[11]:


tokenize = Tokenizer(inputCol="text_lower", outputCol="words")
words_Data_Frame = tokenize.transform(filter_data_withColumn)
words_Data_Frame.take(5)


# # Remove Stopword

# In[12]:


remove = StopWordsRemover(inputCol="words", outputCol="filtered_words")
words_Data_Frame1 = remove.transform(words_Data_Frame).select("filtered_words","Score")
words_Data_Frame1.show(5)


# # Stemming
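
# The stemming step itself is truncated in this excerpt. A minimal sketch of one
# common approach, a UDF wrapping NLTK's SnowballStemmer (the stemmer choice and
# the output column name are assumptions, not the original code):
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
stem_udf = udf(lambda words: [stemmer.stem(w) for w in words], ArrayType(StringType()))
words_Data_Frame2 = words_Data_Frame1.withColumn("stemmed_words", stem_udf("filtered_words"))
words_Data_Frame2.show(5)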
Example 27
spark = SQLContext(sc)

#LOADING DATA FROM HDFS TO SPARK DATAFRAME
df0=spark.read.option("sep", "\t").option('header',True).csv('hdfs://192.168.50.93:9000/user/hadoop/books2/amazon_reviews_us_Wireless_v1_00.tsv')
df0.printSchema()

#FILTERING FOR EMPTY VALUES
df01 = df0.filter((col("review_body").isNotNull()) & (col("verified_purchase").isNotNull()))

#ENCODING LABEL
stage_string = StringIndexer(inputCol="verified_purchase", outputCol="class_res")
ppl = Pipeline(stages=[stage_string])
df1 = ppl.fit(df01).transform(df01)

#CREATING TF_IDF
tokenizer = Tokenizer(inputCol="review_body", outputCol="words")
wordsData = tokenizer.transform(df1)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

#NAIVEBAYES 
nb = NaiveBayes(featuresCol="features", labelCol="class_res")

#Model training
model = nb.fit(rescaledData)

#Model Saving
model.write().overwrite().save("./NB_model")
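
# A minimal sketch (not in the original snippet) of reloading the saved model
# and scoring data that has gone through the same TF-IDF steps:
from pyspark.ml.classification import NaiveBayesModel

reloaded_nb = NaiveBayesModel.load("./NB_model")
nb_predictions = reloaded_nb.transform(rescaledData)
nb_predictions.select("class_res", "prediction").show(5)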
Example 28
trainingCount=parts.count()
f = indexedTweets.map(lambda p: Row(tindex=int(p[1]),tweet=p[0][0], label= int(float(p[0][1])), training=1))
#f = parts.map(lambda p: Row(tweet=p[0],label=int(p[1])))

linest = sc.textFile("/home/ankita/MLProject/SVM/GroundTruth.txt")

partst = linest.map(lambda l: l.split(","))
indexedTweetst = partst.zipWithIndex().map(lambda ab: (ab[0], ab[1] + trainingCount))
ft = indexedTweetst.map(lambda p: Row(tindex=int(p[1]),tweet=p[0][1], label= int(float(p[0][0])),training=0))
alldata = f.union(ft)

schemaTweets = sqlContext.createDataFrame(alldata)

schemaTweets.registerTempTable("data")

tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
wordsData = tokenizer.transform(schemaTweets)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)


idf = IDF(inputCol="rawFeatures", outputCol="features")



idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
#rescaledData.collect()
wordsvectors = rescaledData.filter(rescaledData.training == 1).select("label", "features").rdd.map(lambda row: LabeledPoint(row[0], row[1]))
model = LogisticRegressionWithLBFGS.train(wordsvectors, iterations=100)
add_stopwords = [  # NOTE: earlier entries of this stop-word list are truncated in the source
    u'more', u'most', u'must', u'my', u'myself', u'no', u'nor', u'not', u'now',
    u'o', u'of', u'off', u'on', u'once', u'only', u'or', u'other', u'our',
    u'ours', u'ourselves', u'out', u'over', u'own', u'r', u're', u's', 'said',
    u'same', u'she', u'should', u'shouldnt', u'so', u'some', u'such', u't',
    u'than', u'that', 'thats', u'the', u'their', u'theirs', u'them',
    u'themselves', u'then', u'there', u'these', u'they', u'this', u'those',
    u'through', u'to', u'too', u'under', u'until', u'up', u'very', u'was',
    u'wasnt', u'we', u'were', u'werent', u'what', u'when', u'where', u'which',
    u'while', u'who', u'whom', u'why', u'will', u'with', u'wont', u'would',
    u'y', u'you', u'your', u'yours', u'yourself', u'yourselves'
]

stopwordsRemover = StopWordsRemover(
    inputCol="words1", outputCol="filtered").setStopWords(add_stopwords)

tokenizer = Tokenizer(inputCol="Text", outputCol="tokens")
hashtf = HashingTF(numFeatures=2**16, inputCol="filtered", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features",
          minDocFreq=5)  #minDocFreq: remove sparse terms

# bag of words count

#hashtf = HashingTF(numFeatures=2**16, inputCol="tokens", outputCol='tf')
#
#idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
#
#

pipeline = Pipeline(
    stages=[regexTokenizer, stopwordsRemover, tokenizer, hashtf, idf])
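
# regexTokenizer is referenced in the pipeline above, but its definition is cut
# off in this excerpt. A plausible sketch (the pattern and column names are
# assumptions chosen to match the StopWordsRemover input above):
from pyspark.ml.feature import RegexTokenizer
regexTokenizer = RegexTokenizer(inputCol="Text", outputCol="words1", pattern="\\W+")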
Example 30
df_review = df_review.filter("cool >=3 or useful >=3 or funny >=3")
df_review = df_review.select("stars", "text")
df_review = df_review.repartition(100)

from pyspark.sql import functions as F
df_review = df_review.withColumn("target",
                                 F.when(df_review.stars <= 2, 1).otherwise(0))
df_review.cache()

(train_set, test_set) = df_review.randomSplit([0.7, 0.3], seed=1002)

from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol="text", outputCol="words")
cv = CountVectorizer(vocabSize=2**16, inputCol="words", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="features",
          minDocFreq=5)  #minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol="target", outputCol="label")
#lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, cv, idf, label_stringIdx])

pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(test_set)

lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)
#predictions = predictions.select('target','label', 'rawPrediction', 'probability', 'prediction')
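
# A minimal evaluation sketch (not in the original snippet): measure the area
# under the ROC curve on the held-out predictions.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")
print("Test AUC = {:.4f}".format(evaluator.evaluate(predictions)))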
Example 31
	except KafkaError as ke:
		logger.debug('Fail to start kafka producer, caused by %s' % ke.message)

	try:
		# Create dstream from kafka topic
		directKafkaStream = KafkaUtils.createDirectStream(ssc, kafka_topic, {'metadata.broker.list': broker_ip})
		logger.debug('Create direct dstream from kafka successfully')
	except:
		logger.debug('Unable to create dstream from kafka')

	atexit.register(shutdown_hook, kafka_producer, spark)

	# Load the tokenizer, hashing_tf, idf_model, nb_model and the tag/catId maps
	try:
		logger.debug('Loading models')
		tokenizer = Tokenizer.load(tokenizer_file)
		hashing_tf = HashingTF.load(hashing_tf_file)
		idf_model = IDFModel.load(idf_model_file)
		nb_model = NaiveBayesModel.load(nb_model_file)
		selected_tags = pd.read_csv(selected_tags_file, header=None)
		local_catId_to_tags = dict(zip(list(selected_tags.index), selected_tags[0]))
		local_tags_to_catId=dict(zip(selected_tags[0], list(selected_tags.index)))
		catId_to_tags = sc.broadcast(local_catId_to_tags)
		tags_to_catId = sc.broadcast(local_tags_to_catId)
		tags_to_catId_transform = udf(lambda tag: float(tags_to_catId.value[tag]), FloatType())
		catId_to_tags_transform = udf(lambda catId: catId_to_tags.value[catId], StringType())
		logger.debug('loaded models successfully')
	except:
		logger.debug('Fail to load models')

def transform(spark, s3_input_data, s3_output_train_data,
              s3_output_validation_data, s3_output_test_data):
    print('Processing {} => {}, {}, {}'.format(s3_input_data, s3_output_train_data,
                                               s3_output_validation_data,
                                               s3_output_test_data))

    schema = StructType([
        StructField('marketplace', StringType(), True),
        StructField('customer_id', StringType(), True),
        StructField('review_id', StringType(), True),
        StructField('product_id', StringType(), True),
        StructField('product_parent', StringType(), True),
        StructField('product_title', StringType(), True),
        StructField('product_category', StringType(), True),
        StructField('star_rating', IntegerType(), True),
        StructField('helpful_votes', IntegerType(), True),
        StructField('total_votes', IntegerType(), True),
        StructField('vine', StringType(), True),
        StructField('verified_purchase', StringType(), True),
        StructField('review_headline', StringType(), True),
        StructField('review_body', StringType(), True),
        StructField('review_date', StringType(), True)
    ])

    df_csv = spark.read.csv(path=s3_input_data,
                            sep='\t',
                            schema=schema,
                            header=True,
                            quote=None)
    df_csv.show()

    # This dataset should already be clean, but always good to double-check
    print('Showing null review_body rows...')
    df_csv.where(col('review_body').isNull()).show()

    df_csv_cleaned = df_csv.na.drop(subset=['review_body'])
    df_csv_cleaned.where(col('review_body').isNull()).show()

    tokenizer = Tokenizer(inputCol='review_body', outputCol='words')
    wordsData = tokenizer.transform(df_csv_cleaned)

    hashingTF = HashingTF(inputCol='words',
                          outputCol='raw_features',
                          numFeatures=1000)
    featurizedData = hashingTF.transform(wordsData)

    # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
    # 1) compute the IDF vector
    # 2) scale the term frequencies by IDF
    # Therefore, we cache the result of the HashingTF transformation above to speed up the 2nd pass
    featurizedData.cache()

    # spark.mllib's IDF implementation provides an option for ignoring terms
    # which occur in less than a minimum number of documents.
    # In such cases, the IDF for these terms is set to 0.
    # This feature can be used by passing the minDocFreq value to the IDF constructor.
    idf = IDF(inputCol='raw_features', outputCol='features')  #, minDocFreq=2)
    idfModel = idf.fit(featurizedData)
    features_df = idfModel.transform(featurizedData)
    features_df.select('star_rating', 'features').show()

    num_features = 300
    pca = PCA(k=num_features, inputCol='features', outputCol='pca_features')
    pca_model = pca.fit(features_df)
    pca_features_df = pca_model.transform(features_df).select(
        'star_rating', 'pca_features')
    pca_features_df.show(truncate=False)

    standard_scaler = StandardScaler(inputCol='pca_features',
                                     outputCol='scaled_pca_features')
    standard_scaler_model = standard_scaler.fit(pca_features_df)
    standard_scaler_features_df = standard_scaler_model.transform(
        pca_features_df).select('star_rating', 'scaled_pca_features')
    standard_scaler_features_df.show(truncate=False)

    expanded_features_df = (standard_scaler_features_df.withColumn(
        'f', to_array(col('scaled_pca_features'))).select(
            ['star_rating'] + [col('f')[i] for i in range(num_features)]))
    expanded_features_df.show()

    train_df, validation_df, test_df = expanded_features_df.randomSplit(
        [0.9, 0.05, 0.05])

    train_df.write.csv(path=s3_output_train_data, header=None, quote=None)
    print('Wrote to output file:  {}'.format(s3_output_train_data))

    validation_df.write.csv(path=s3_output_validation_data,
                            header=None,
                            quote=None)
    print('Wrote to output file:  {}'.format(s3_output_validation_data))

    test_df.write.csv(path=s3_output_test_data, header=None, quote=None)
    print('Wrote to output file:  {}'.format(s3_output_test_data))
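
# The to_array helper used above is not shown in this snippet. A plausible
# sketch: a UDF that turns an ML Vector column into a plain array of doubles
# (an assumption, not the original definition):
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType

def to_array(vector_col):
    return udf(lambda v: v.toArray().tolist() if v is not None else None,
               ArrayType(DoubleType()))(vector_col)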
def tokenize(inputDF):
    tokenizer = Tokenizer(inputCol='sentences', outputCol='tokenizedwords')
    tokenized = tokenizer.transform(inputDF)
    return tokenized
Example 34
train_datafile = get_args().input
train_df = spark.read.csv(train_datafile,header=True,sep='\t').limit(80000)

# limit the rows to keep a smaller subset for debugging
train_sents1 = train_df.select('genre', 'sentence1')
train_sents2 = train_df.select('genre', 'sentence2')
# train_sents1.show(5)

udf_lower = F.udf(lower_folding, StringType() )
train_sents1_lower = train_sents1.withColumn('lower_sents', udf_lower('sentence1') )
# train_sents1_lower.show(5)

udf_rv_punc = F.udf(remove_punctuation_re, StringType() )
train_sents1_rv_punc = train_sents1_lower.withColumn('rv_punc_sents', udf_rv_punc('lower_sents') )

tokenizer = Tokenizer(inputCol="rv_punc_sents", outputCol="tokens")
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
w2v = Word2Vec(vectorSize=300, minCount=0, inputCol="filtered_tokens", outputCol="avg_word_embed")

doc2vec_pipeline = Pipeline(stages=[tokenizer,remover,w2v])
doc2vec_model = doc2vec_pipeline.fit(train_sents1_rv_punc)
doc2vecs_df = doc2vec_model.transform(train_sents1_rv_punc)
w2v_train_df, w2v_test_df = doc2vecs_df.randomSplit([0.8, 0.2])

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

genre2label = StringIndexer(inputCol="genre", outputCol="label")
rf_classifier = MultilayerPerceptronClassifier(labelCol="label", featuresCol="avg_word_embed")
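
# A possible continuation (an assumption, not in the original snippet): the MLP needs an
# explicit `layers` spec whose first size matches the Word2Vec vectorSize (300) and whose
# last size matches the number of distinct genre labels (5 below is only a placeholder).
mlp = MultilayerPerceptronClassifier(labelCol="label", featuresCol="avg_word_embed",
                                     layers=[300, 64, 5])
clf_pipeline = Pipeline(stages=[genre2label, mlp])
clf_model = clf_pipeline.fit(w2v_train_df)
predictions = clf_model.transform(w2v_test_df)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
print("test accuracy:", evaluator.evaluate(predictions))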
Esempio n. 35
0
lines=sc.textFile("/Users/admin/Desktop/KBSApp/KBSApp/permissionsData/dataSets/SVMDataSet.txt")

parts = lines.map(lambda l: l.split(","))
f = parts.map(lambda p: Row(tindex=int(p[0]),packageName=p[1],packagePermissions=p[2], label= int(float(p[3])),training=1))


linest = sc.textFile("/Users/admin/Desktop/KBSApp/KBSApp/permissionsData/dataSets/SVMDataGroundTruth.txt")
partst = linest.map(lambda l: l.split(","))
ft = partst.map(lambda p: Row(tindex=int(p[0]),packageName=p[1],packagePermissions=p[2],label= int(float(p[3])),training=0))
alldata = f.union(ft)

schemaApp = sqlContext.createDataFrame(alldata)

schemaApp.registerTempTable("data")

tokenizer = Tokenizer(inputCol="packagePermissions", outputCol="perms")
permsData = tokenizer.transform(schemaApp)

hashingTF = HashingTF(inputCol="perms", outputCol="rawFeatures")
featurizedData = hashingTF.transform(permsData)


idf = IDF(inputCol="rawFeatures", outputCol="features")


idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# select the needed columns and map over the underlying RDD (a DataFrame cannot be mapped directly);
# on Spark 2+ the ml vectors may also need pyspark.mllib.linalg.Vectors.fromML() before being
# wrapped in a LabeledPoint
wordsvectors = rescaledData.select("label", "features").rdd.map(lambda row: LabeledPoint(row[0], row[1]))
model = LogisticRegressionWithLBFGS.train(wordsvectors, iterations=100)
Esempio n. 36
0
# Add the length (number of characters) of each text as a feature
from pyspark.sql.functions import length
data = data.withColumn('length', length(data['text']))
data.show()

# In[3]:
# Compare the length difference between ham and spam
data.groupby('class').mean().show()

# In[4]:
# Treat TF-IDF features for each text
# TF: Term Frequency
# IDF: Inverse Document Frequency
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer, VectorAssembler

tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens',outputCol='c_vec')
idf = IDF(inputCol="c_vec", outputCol="tf_idf")
ham_spam_to_num = StringIndexer(inputCol='class',outputCol='label')
final_feature = VectorAssembler(inputCols=['tf_idf', 'length'],outputCol='features')

from pyspark.ml import Pipeline
data_prep_pipe = Pipeline(stages=[ham_spam_to_num,tokenizer,stopremove,count_vec,idf,final_feature])
clean_data = data_prep_pipe.fit(data).transform(data)

clean_data.show()
clean_data.take(1)
clean_data.take(1)[0][-1]
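
# A possible next step (an assumption; the original snippet stops here): train a simple
# classifier on the assembled 'features'/'label' columns to separate ham from spam.
from pyspark.ml.classification import NaiveBayes

train_set, test_set = clean_data.randomSplit([0.7, 0.3])
spam_detector = NaiveBayes(featuresCol='features', labelCol='label').fit(train_set)
spam_detector.transform(test_set).select('label', 'prediction').show(5)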

Esempio n. 37
0
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("TfIdfExample")\
        .getOrCreate()

    # $example on$
    sentenceData = spark.createDataFrame([
        (0, "Hi I heard about Spark"),
        (0, "I wish Java could use case classes"),
        (1, "Logistic regression models are neat")
    ], ["label", "sentence"])

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    # alternatively, CountVectorizer can also be used to get term frequency vectors
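
    # A brief illustration (not part of the original example) of that alternative:
    # CountVectorizer learns an explicit vocabulary, so feature indices map back to terms.
    from pyspark.ml.feature import CountVectorizer
    cv = CountVectorizer(inputCol="words", outputCol="rawFeaturesCV", vocabSize=20)
    cvModel = cv.fit(wordsData)
    cvModel.transform(wordsData).select("label", "rawFeaturesCV").show(truncate=False)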

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    for features_label in rescaledData.select("features", "label").take(3):
        print(features_label)
    # $example off$

    spark.stop()
Esempio n. 38
0
def create_w2v_model():
    spark = SparkSession \
        .builder \
        .appName("SimpleApplication") \
        .config("spark.executor.memory", "2g") \
        .config("spark.driver.memory", "2g") \
        .config("spark.memory.offHeap.enabled", True) \
        .config("spark.memory.offHeap.size", "2g") \
        .getOrCreate()

    input_file = spark.sparkContext.wholeTextFiles(PATH)

    print("""
    
    Подготовка данных (1)...
    
    """)
    prepared_data = input_file.map(lambda x: (x[0], remove_punctuation(x[1])))

    print("""
    
    Подготовка данных (2)...
    
    """)
    df = prepared_data.toDF()

    print("""
    
    Подготовка данных (3)...
    
    """)
    prepared_df = df.selectExpr('_2 as text')

    print("""
    
    Разбитие на токены...
    
    """)
    tokenizer = Tokenizer(inputCol='text', outputCol='words')
    words = tokenizer.transform(prepared_df)

    print("""
    
    Очистка от стоп-слов...
    
    """)
    stop_words = StopWordsRemover.loadDefaultStopWords('russian')
    remover = StopWordsRemover(inputCol="words",
                               outputCol="filtered",
                               stopWords=stop_words)

    print("""
    
    Построение модели...
    
    """)
    word2Vec = Word2Vec(vectorSize=50,
                        inputCol='words',
                        outputCol='result',
                        minCount=2)
    model = word2Vec.fit(words)

    print("""
    
    Сохранение модели...
    
    """)
    today = datetime.datetime.today()
    model_name = today.strftime("model/kurs_model")
    print("""
    
    Model  """ + model_name + """  saved
    
    """)
    model.save(model_name)

    spark.stop()
Esempio n. 39
0

# COMMAND ----------

summary = model.summary
print(model.weights)
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
summary.probability.show()


# COMMAND ----------

from pyspark.ml.feature import Tokenizer, CountVectorizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.drop("features"))
cv = CountVectorizer()\
  .setInputCol("DescOut")\
  .setOutputCol("features")\
  .setVocabSize(500)\
  .setMinTF(0)\
  .setMinDF(0)\
  .setBinary(True)
cvFitted = cv.fit(tokenized)
prepped = cvFitted.transform(tokenized)


# COMMAND ----------

from pyspark.ml.clustering import LDA
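
# A minimal sketch (an assumption; the original cell stops at the import) of fitting LDA
# on the count vectors prepared above and inspecting a few terms per topic.
lda = LDA(k=10, maxIter=5)
ldaModel = lda.fit(prepped)
ldaModel.describeTopics(3).show()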
Esempio n. 40
0
    for sentence in review:
        word_tokens = word_tokenize(sentence)
        for w in word_tokens:
            if w not in stop_words:
                w = ps.stem(w)
                final = final + " " + w
        filtered_sentence.append(final)
        final = ""

    review = filtered_sentence
    # print("\n \n -----------------------------------------------------------------------------------------------------: \n " +  str(review))

    # doing the bag of words algorithm
    dup_vector = zip(calification, review)
    sentenceData = spark.createDataFrame(dup_vector, ["label", "sentence"])
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(
        inputCol="words", outputCol="rawFeatures", numFeatures=100
    )  # numFeatures should roughly match the number of distinct words in the corpus; a word count here would help choose it
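
    # A small sketch (not in the original code) of the word count suggested above, to help
    # size numFeatures close to the actual vocabulary:
    from pyspark.sql.functions import explode
    distinct_word_count = wordsData.select(explode("words").alias("w")).distinct().count()
    print("distinct words in the corpus:", distinct_word_count)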
    featurizedData = hashingTF.transform(wordsData)
    # alternatively, CountVectorizer can also be used to get term frequency vectors
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    # rescaledData.select("label", "features").show(20,False) # to show dataframe structure

    # print(len(review))  # print the size of both arrays to check that the indexes line up
    # print(len(calification))

    #print(review)  # just to test what does the array have
Esempio n. 41
0
import argparse

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
from pyspark.ml.feature import Tokenizer

def filter_comments(df):
    return df.filter(df['author'] != '[deleted]') \
             .filter(df['body'] != '[deleted]') \
             .filter(df['body'] != '[removed]')

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Reddit Comment Prediction')
    parser.add_argument('-i', '--input_file', type=str, 
        help="""The CSV input data file that contains the raw comment data""")
    args = parser.parse_args()

    sc = SparkContext("local", "Prediction")
    sqlContext = SQLContext(sc)
    df = sqlContext.read.json(args.input_file)
    print('Loaded input file {} with {} total comments'.format(args.input_file, df.count()))

    filtered = filter_comments(df)
    print('{} comments after filtering'.format(filtered.count()))

    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsDataFrame = tokenizer.transform(filtered)
    wordsDataFrame.select("body", "words").show() 
Esempio n. 42
0
    def clustering(self, columns='*', num_cluster=2, n_g=2):
        """
        input
            @ a column list and remove special char column
            @ number of cluster
            @ number of n_gram
        return:
            @ a data frame with clustering column
        """

        # turn the text data into a blank-space separated string so it can be tokenized
        n_gr = n_g
        data_frame_1 = self._df
        # check the data type; if the column is not a string, cast it to string
        valid_cols = [col for (col, typ) in filter(lambda typ: typ[1] == 'string', self._df.dtypes)]
        if columns not in valid_cols:
            data_frame_1 = data_frame_1.withColumn(columns + '_', data_frame_1[columns].cast("string"))
            data_frame_1 = data_frame_1.drop(columns)
            data_frame_1 = data_frame_1.withColumnRenamed(columns + '_', columns)

        # make the string tokenizable by separating each character with a blank space
        udf_space = udf(lambda z: " ".join(z))
        data_frame_1 = data_frame_1.withColumn(columns + '_split', udf_space(columns)).orderBy(columns)
        # tokenize the words and build the n-grams
        tokenizer = Tokenizer(inputCol=columns + '_split', outputCol=columns + "_token")
        data_frame_2 = tokenizer.transform(data_frame_1)
        # build n-gram features
        ngram = NGram(n=n_gr, inputCol=columns + "_token", outputCol=columns + "_ngram")
        ngramDataFrame = ngram.transform(data_frame_2)
        # vectorization: map the n-grams to count vectors
        cv = CountVectorizer(inputCol=columns + "_ngram", outputCol="features", vocabSize=10, minDF=1.0)
        # fit the vectorizer
        model = cv.fit(ngramDataFrame)
        result = model.transform(ngramDataFrame)


        # setup kmeans
        kmeans = KMeans().setK(num_cluster).setSeed(1)
        model_kmean = kmeans.fit(result)
        predictions_kmean = model_kmean.transform(result)

        df = predictions_kmean.orderBy('prediction', ascending=True).select(self._df.schema.names + ['prediction'])
        # reshape the table so it is easier for the user to read
        print('show the count of each value in each cluster')
        temp = df.groupBy(columns, 'prediction').count()
        temp = temp.withColumnRenamed('prediction', 'cluster')
        df = df.withColumnRenamed('prediction', 'cluster')
        temp = temp.withColumnRenamed('count', 'count in cluster')
        temp.show()

        # show the cluster number
        window = Window.partitionBy("cluster").orderBy(col("count in cluster").desc())
        test = (temp.withColumn('row_num', F.row_number().over(window)).where(F.col('row_num') == 1).select(columns, 'cluster'))
        print('Default replace: use the most frequent value of each cluster')
        test.orderBy('cluster', ascending=True).show()

        # collect the most frequent value of each cluster into a list
        test_list = test.select(columns).orderBy('cluster').collect()

        # name_list = [i.columns for i in test_list]
        name_list = [i[columns] for i in test_list]

        list_setting = input("Type 'yes' to enter customized replacement words, or press any key for the default replacement setting: \n")
        # let the user define the replacement words
        count = 0
        if list_setting == 'yes':
            while count < num_cluster:
                usr_replace = input('Enter what cluster {0} should be replaced with, or press enter to skip: \n'.format(count))
                if usr_replace != '':
                    name_list[count] = usr_replace
                count += 1

        # replace the words
        udf_place_name = udf(lambda z: name_list[z])
        data_frame_replace = df.withColumn('replace_' + columns, udf_place_name('cluster'))

        replace_input = input('Type yes to replace the original column, or press any key to keep it:\n')
        # replace the original column if the user confirms
        if replace_input == 'yes':
            data_frame_replace = data_frame_replace.drop(columns)
            data_frame_replace = data_frame_replace.withColumnRenamed("replace_" + columns, columns)

        data_frame_replace = data_frame_replace.drop('cluster')
        #  replace the original dataframe
        self._df = data_frame_replace
        #  show the result to the user
        self._df.show()
        return self
Esempio n. 43
0
def cleanLower(doc):
    return doc.replace("<br /><br />"," ").lower()
rdd = labeledRdd.map(lambda doc : (cleanLower(doc[0]),doc[1]))

print "Text is cleaned"


sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8,0.2])

print "Random split is done"


tokenizer = Tokenizer(inputCol='review', outputCol='reviews_words')
hashing_tf = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizer,
                            hashing_tf,
                            idf,
                            string_indexer,
                            dt])

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

# grid=(ParamGridBuilder()
#      .baseOn([evaluator.metricName,'precision'])
Esempio n. 44
0
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

conf = SparkConf().setAppName("MLPipeline")
sc = SparkContext(conf=conf)

# Read training data as a DataFrame
sqlCt = SQLContext(sc)
trainDF = sqlCt.read.parquet("20news_train.parquet")

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                      outputCol="features",
                      numFeatures=1000)
lr = LogisticRegression(maxIter=20, regParam=0.1)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training data.
model = pipeline.fit(trainDF)

#Building the cross validation model
paramGrid = (ParamGridBuilder().addGrid(
    hashingTF.numFeatures, [1000, 5000, 10000]).addGrid(
        lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]).build())

crossval = CrossValidator(estimator=pipeline,
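# A typical completion of the truncated CrossValidator call above (an assumption,
# not from the original source):
# crossval = CrossValidator(estimator=pipeline,
#                           estimatorParamMaps=paramGrid,
#                           evaluator=BinaryClassificationEvaluator(),
#                           numFolds=3)
# cvModel = crossval.fit(trainDF)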
Esempio n. 45
0
df2 = df.withColumn('date', F.unix_timestamp('date', form).cast('timestamp'))
print(df2.show(5))

df = df2

import matplotlib.pyplot as plt
dates = df.select(F.date_format(
    'date',
    'yyyy-MM-dd').alias('no_timestamp')).groupby('no_timestamp').count().sort(
        F.col('no_timestamp'))
print(dates.show(dates.count()))
dates.toPandas().plot(kind='line', x='no_timestamp', y='count')

dates.toPandas().plot(kind='bar', x='no_timestamp')

tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
prep_df = tokenizer.transform(df)
cv_prep = CountVectorizer(inputCol="words", outputCol="prep")
cv_model = cv_prep.fit(prep_df)
ready_df = cv_model.transform(prep_df)
# stopWords = [word for word in cv_prep.vocabulary if any(char.isdigit() for char in word)]
# remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords = stopwords)
# prep_df = remover.transform(prep_df)

trainable = ready_df.select(
    'tweet_id', 'prep').rdd.map(lambda row: [row['tweet_id'], Vectors.fromML(row['prep'])]).cache()
print("Trainable")
print(trainable.take(10))
print("take")
model = LDA.train(trainable, k=5, seed=1, optimizer="online")
exit(0)
Esempio n. 46
0
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SparkSession

if __name__ == "__main__":
    sparkSession = SparkSession\
        .builder\
        .getOrCreate()

    # Prepare training documents from a list of (id, text, label) tuples.
    training = sparkSession.createDataFrame([(0, "a b c d e spark", 1.0),
                                             (1, "b d", 0.0),
                                             (2, "spark f g h", 1.0),
                                             (3, "hadoop mapreduce", 0.0)],
                                            ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and logistic regression.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")

    tk = tokenizer.transform(training)
    tk.printSchema()
    tk.show()

    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    hs = hashingTF.transform(tk)
    hs.printSchema()
    hs.show()

    logistic_regression = LogisticRegression(maxIter=10, regParam=0.001)

    pipeline = Pipeline(stages=[tokenizer, hashingTF, logistic_regression])
Esempio n. 47
0
        return list()
    return [lemmtizer.lemmatize(word) for word in input_list]


spark = SparkSession.builder.appName("TfIdf-Lemmetization").getOrCreate()

lemmetize = F.udf(lemmetize)
# spark.udf.register("lemmetize", lemmetize)

documents = spark.read.text("dataset/*.txt")
documents = documents.withColumn("doc_id",
                                 F.row_number().over(Window.orderBy('value')))

documents.printSchema()
# creating tokens/words from the sentence data
tokenizer = Tokenizer(inputCol="value", outputCol="words")
wordsData = tokenizer.transform(documents)
wordsData.show()

stemmer = SnowballStemmer(language='english')
stemmer_udf = F.udf(lambda tokens: [stemmer.stem(token) for token in tokens],
                    ArrayType(StringType()))
wordsData = wordsData.withColumn("lemms", stemmer_udf("words"))

# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3,
                    minCount=0,
                    inputCol="lemms",
                    outputCol="result")
model = word2Vec.fit(wordsData)
result = model.transform(wordsData)
Esempio n. 48
0
dataSet = dataSet.withColumn('cleanReview', cleanText(
    F.col('reviews'))).filter(F.col('cleanReview') != '')
dataSet.show()

# %%
dataSet = dataSet.withColumn('class', dataSet['class'].cast(IntegerType()))
dataSet = dataSet.select('class', 'cleanReview').withColumnRenamed(
    'cleanReview', 'reviews')

# %%
trainDF, testDF = dataSet.randomSplit([0.8, 0.2])
trainDF.show()
testDF.show()

# %%
tokenizer = Tokenizer(inputCol="reviews", outputCol="tokens")
countVector = CountVectorizer(inputCol=tokenizer.getOutputCol(),
                              outputCol='features')
idf = IDF(inputCol=countVector.getOutputCol(), outputCol='idf')
pipeline = Pipeline(stages=[tokenizer, countVector, idf])
pipelineModel = pipeline.fit(trainDF)

# %%
pTrainDF = pipelineModel.transform(trainDF)
pTestDF = pipelineModel.transform(testDF)

# %%
evaluator = MulticlassClassificationEvaluator(labelCol="class",
                                              predictionCol="prediction",
                                              metricName="f1")
lr = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol='class')
Esempio n. 49
0
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc, explode
from pyspark.sql.types import *
from storage import Sqlite

PARTITIONS = 500
THRESHOLD = 50

if __name__ == "__main__":
    conf = SparkConf().setAppName("reddit")
    conf.set('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    conf.set('spark.local.dir', '/mnt/work')
    conf.set('spark.driver.maxResultSize', '12g')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    fields = [StructField("subreddit", StringType(), True),
          StructField("body", StringType(), True)]
    rawDF = sqlContext.read.json("file:///mnt/s3/2015/*", StructType(fields))
    # split comments into words
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsDataFrame = tokenizer.transform(rawDF)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    filteredDataFrame = remover.transform(wordsDataFrame)
    # explode terms into individual rows
    termDataFrame = filteredDataFrame.select(['subreddit', explode(filteredDataFrame.filtered).alias("term")])
    # group by subreddit and term, then count the occurrences of each term in each subreddit
    countsDataFrame = termDataFrame.groupBy(['subreddit', 'term']).count()

    db =  Sqlite()
    countsDataFrame.select(['subreddit', 'term', 'count']).filter('count > {}'.format(THRESHOLD)).foreachPartition(db.saveSubredditWords)
Esempio n. 50
0
labeledRdd = sc.parallelize(labeledData)


from pyspark.sql import SQLContext 
def preProcess(doc):
    clean = doc.replace("<br /><br />"," ")
    return clean.lower()
rdd = labeledRdd.map(lambda doc : (preProcess(doc[0]),doc[1]))

sqlContext = SQLContext(sc)

df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8,0.2])

from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTrainTok = tokenizer.transform(dfTrain)

import itertools
# collect the token lists (not the raw review strings) to build the word dictionary
lists = dfTrainTok.rdd.map(lambda r: r.words).collect()
dictWords=set(itertools.chain(*lists))
dictionaryWords={}
for i,word in enumerate(dictWords):
	dictionaryWords[word]=i

dict_broad=sc.broadcast(dictionaryWords)

from pyspark.mllib.linalg import SparseVector
def vectorize(row,dico):
    vector_dict={}
    for w in row.words:
Esempio n. 51
0
data = df6.select(
    'id',
    (lower(regexp_replace('comment_text', "[^a-zA-Z\\s]", "")).alias('text')),
    'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
data = data.select('id', (regexp_replace('text', "[\r\n]+", "").alias('text')),
                   'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
                   'identity_hate')
# na.drop() and na.fill() return new DataFrames; assign the results or they have no effect
data = data.na.drop()
data = data.na.fill(0)
clean = data.where(col('toxic').isNotNull()).where(
    col('severe_toxic').isNotNull()).where(col('obscene').isNotNull()).where(
        col('threat').isNotNull()).where(col('insult').isNotNull()).where(
            col('identity_hate').isNotNull())

# Token Parser
tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordToken = tokenizer.transform(clean)

# Delete Stop Words
remover = StopWordsRemover(inputCol='words', outputCol='words_clean')
dataFrameNoStop = remover.transform(wordToken)

# Term Frequency
hashTermFreq = HashingTF(inputCol="words_clean", outputCol="rawFeatures")
termFreq = hashTermFreq.transform(dataFrameNoStop)

# Term Frequency ID Frequency
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(termFreq)
tfidf = idfModel.transform(termFreq).select('features', 'toxic',
                                            'severe_toxic', 'obscene',
Esempio n. 52
0
    review_text = BeautifulSoup(raw_review).text
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                                            
    # 
    # 4. Remove stop words
    meaningful_words =  [w for w in words if not w in stops]   
    #
    # 5. Join the words back into one string separated by space, 
    # and return the result.
    return " ".join( meaningful_words)   

stops = set(stopwords.words("english")) 
lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
rows = lines.zipWithIndex().filter(lambda row_index: row_index[1] > 0).keys()  # skip the header row
parts = rows.map(lambda l: l.split("\t"))

review = parts.map(lambda p: Row(id=p[0], label=float(p[1]), 
	review=review_to_words(p[2])))
schemeReview = sqlContext.createDataFrame(review)
tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label","features")
Esempio n. 53
0
print('\n', sms.dtypes, '\n')

sms.printSchema()

# Remove punctuation (REGEX provided) and numbers
wrangled = sms.withColumn('text',
                          regexp_replace(sms.text, '[_():;,.!?\\-]', ' '))
wrangled = wrangled.withColumn('text',
                               regexp_replace(wrangled.text, '[0-9]', ' '))

# Merge multiple spaces
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, ' +',
                                                      ' '))

# Text to tokens
wrangled = Tokenizer(inputCol="text", outputCol="words").transform(wrangled)

# Remove stop words.
wrangled = StopWordsRemover(inputCol="words",
                            outputCol="terms").transform(wrangled)

# Apply the hashing trick
wrangled = HashingTF(inputCol="terms", outputCol="hash",
                     numFeatures=1024).transform(wrangled)

# Convert hashed symbols to TF-IDF
sms = IDF(inputCol="hash",
          outputCol="features").fit(wrangled).transform(wrangled)

# View the first four records
sms.show(4, truncate=False)
Esempio n. 54
0
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import Tokenizer, RegexTokenizer
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="TokenizerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    sentenceDataFrame = sqlContext.createDataFrame([
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat")
    ], ["label", "sentence"])
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsDataFrame = tokenizer.transform(sentenceDataFrame)
    for words_label in wordsDataFrame.select("words", "label").take(3):
        print(words_label)
    regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
    # alternatively, pattern="\\w+", gaps(False)
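
    # A short illustration (not in the original example) of that alternative: with gaps=False
    # the pattern matches the tokens themselves rather than the delimiters between them.
    wordPatternTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words",
                                          pattern="\\w+", gaps=False)
    for words_label in wordPatternTokenizer.transform(sentenceDataFrame).select("words", "label").take(3):
        print(words_label)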
    # $example off$

    sc.stop()
Esempio n. 55
0
auc = binary_evaluator.evaluate(prediction, {binary_evaluator.metricName: "areaUnderROC"})

# Turning Text into Tables
# term-document text
# A selection of children's books
books.show(truncate=False)

# Removing punctuation
from pyspark.sql.functions import regexp_replace
# Regular expression (REGEX) to match commas and hyphens
REGEX = '[,\\-]'
books = books.withColumn('text', regexp_replace(books.text, REGEX, ' '))

# Text to tokens
from pyspark.ml.feature import Tokenizer
books = Tokenizer(inputCol='text', outputCol='tokens').transform(books)

# Remove stop words
from pyspark.ml.feature import StopWordsRemover
stopwords = StopWordsRemover()
# Take a look at the list of stop words
stopwords.getStopWords()
# Specify the input and output column names
stopwords = stopwords.setInputCol('tokens').setOutputCol('words')
books = stopwords.transform(books)

# Feature hashing
from pyspark.ml.feature import HashingTF
hasher = HashingTF(inputCol='words', outputCol='hash', numFeatures=32)
books = hasher.transform(books)
Esempio n. 56
0
print "Create dataframe"
t0 = time()
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
print "Showing first example : "
print
print df.first()
tt = time() - t0
print
print "Dataframe created in {} second".format(round(tt,3))


# In[314]:

from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTok = tokenizer.transform(df)


# In[315]:

from pyspark.ml.feature import NGram
bigram = NGram(inputCol="words", outputCol="bigrams")
dfBigram = bigram.transform(dfTok)


# In[317]:

print "Start tokenizing, computing bigrams and splitting between test and train"
t0 = time()
dfTrain, dfTest = dfBigram.randomSplit([0.8,0.2])
Esempio n. 57
0
from pyspark.ml.feature import Tokenizer
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("tokenizer_sample") \
    .master("local[*]") \
    .getOrCreate()

data = [(0, "Tokenization is the process"), (1, "Refer to the Tokenizer")]
inputDF = spark.createDataFrame(data).toDF("id", "input")
tokenizer = Tokenizer(inputCol="input", outputCol="output")
outputDF = tokenizer.transform(inputDF)
outputDF.printSchema()
outputDF.show()

spark.stop()
Esempio n. 58
0
	for category_dir in listdir(input_dir): # Build the dataset of (docname, category, wordcounts) tuples
		distinct_labels[curr_cat] = category_dir
		next_docs = sc.wholeTextFiles(('/').join([input_dir, category_dir])) 
		# bind the current category now; the lambda is evaluated lazily when the job runs
		docs = docs.union(next_docs.map(lambda doc_lines, cat=float(curr_cat): (format_text(doc_lines[1]), cat)))
		curr_cat += 1
	
	training_rows = docs.sample(False, train_fraction)
	testing_rows = docs.subtract(training_rows)
	
	# Prepare training and test documents, which are labeled.
	LabeledDocument = Row("text", "label")
	train = training_rows.map(lambda x: LabeledDocument(*x)).toDF()
	test = testing_rows.map(lambda x: LabeledDocument(*x)).toDF()		

	# Configure an ML pipeline, which consists of four stages: tokenizer, hashingTF, idf, and lr.
	tokenizer = Tokenizer(inputCol="text", outputCol="words")
	hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures") #outputCol="features")
	idf = IDF(inputCol="rawFeatures", outputCol="features")
	
	lr = LogisticRegression(maxIter=1000, regParam=0.001)
	#pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
	p0 = Pipeline(stages=[tokenizer, hashingTF, idf ,lr])
	#m0 = p0.fit(train)
	#pipeline = Pipeline(stages=[m0, lr])
	pipeline = p0
	
	# Fit the pipeline to training documents.
	model = pipeline.fit(train)
	print('\n\n --------------- RESULT ----------------------\n\n')
	print(model.transform(test).head())
	print('\n\n ---------------------------------------------\n\n')
Esempio n. 59
0
from __future__ import print_function
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.sql import SparkSession

if __name__ == '__main__':
    spark = SparkSession\
        .builder\
        .appName('Tokenizer')\
        .getOrCreate()

    sentenceDataFrame = spark.createDataFrame([
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat")
    ], ["id", "sentence"])

    tokenizer = Tokenizer(inputCol = 'sentence', outputCol='words')
    regexTokenizer = RegexTokenizer(inputCol='sentence', outputCol='words', pattern = "\\W")
    countTokens = udf(lambda words: len(words), IntegerType())
    tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select('sentence','words')\
        .withColumn('tokens', countTokens(col('words'))).show(truncate=False)

    regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.select('sentence', 'words') \
        .withColumn('tokens', countTokens(col('words'))).show(truncate=False)

    spark.stop()