Example #1
##modeling
api_f = [
    'attributes.RestaurantsPriceRange2', 'business_id', 'stars',
    'review_count', 'categories'
]

cv = CountVectorizer(minDF=10,
                     vocabSize=5000,
                     inputCol='token',
                     outputCol='vectors')
km1 = KMeans(k=20, featuresCol='vectors', maxIter=30)

pipe_count = Pipeline(stages=[cv, km1])

idf = IDF(inputCol="vectors", outputCol="features")
km2 = KMeans(k=20, featuresCol='features', maxIter=30)
pipe_idf = Pipeline(stages=[cv, idf, km2])

###fitting
#train_vect = data_tokenizer(dataset)
#model_cv_km = pipe_count.fit(train_vect)

#model_tf_km = pipe_idf.fit(train_vect)


def cluster_user_by_review(data_review, model):
    pred = model.transform(data_review)
    data = pred.select('user_id',
                       'prediction').withColumnRenamed('prediction', 'user_cl')
    data = data.dropDuplicates()
    return data
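
# Usage sketch (an assumption, not part of the original snippet): this presumes the
# commented-out fitting above has been run, so 'train_vect' and 'model_cv_km' exist.
user_clusters = cluster_user_by_review(train_vect, model_cv_km)
user_clusters.show(5)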
data_df = spark.createDataFrame(typed_rdd, ["text", "label"])
data_set = data_df.select(data_df['label'], data_df['text'])
#splitting data to train and test
training_df, test_df = data_set.randomSplit([0.7, 0.3])
training_df.head(5)

from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures")
idf = IDF(minDocFreq=3, inputCol="rawFeatures", outputCol="features")
nb = NaiveBayes()  # default featuresCol='features', i.e. the TF-IDF output
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nb])

paramGrid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 1.0]).build()

cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=MulticlassClassificationEvaluator(),
                    numFolds=4)

cvModel = cv.fit(training_df)

result = cvModel.transform(test_df)
prediction_df = result.select("text", "label", "prediction")
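
# Hedged follow-up (not in the original): score the tuned model on the held-out split
# with the same evaluator family used inside CrossValidator.
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="f1")
print("Test F1: " + str(evaluator.evaluate(result)))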
Example #3
    wordsData = tokenizer.transform(clean_jobs)
    #   remove stopwords
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    filteredWords = remover.transform(wordsData)

    #   get ngrams
    ngram = NGram(inputCol="filtered", outputCol="featureGrams")
    gramData = ngram.transform(filteredWords)

    #   create TFIDF of these
    hashingTF = HashingTF(inputCol="featureGrams",
                          outputCol="rawFeatures",
                          numFeatures=350)
    featurizedData = hashingTF.transform(gramData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    rescaledData.cache()
    rescaledData.count()

    #   now filter all the criteria you care about:
    ds_preds = rescaledData.where(
        (col("title").like("%Machine Learn%")) |
        (col("title").like("%Data Scientist%")) |
        (col("title").like("%Artificial Intel%")) |
        (col("title").like("%Analytic%")) | (col("title").like("%Statist%")) |
        (col("title").like("%ML%")) | (col("title").like("%AI%")) |
        (col("title").like("%Data Engin%")) |
        (col("title").like("%Programmer%")))  #analytics cluster
# 1. Tokenize the title with a regular expression, ignoring emoji and other non-word characters
title_tokenizer = RegexTokenizer(inputCol='title',
                                 outputCol='title_words',
                                 pattern='\\W',
                                 toLowercase=True)

# 2.Remove stopwords from title
title_sw_remover = StopWordsRemover(inputCol='title_words',
                                    outputCol='title_sw_removed')

# 3.Compute Term frequency from title
title_count_vectorizer = CountVectorizer(inputCol='title_sw_removed',
                                         outputCol='tf_title')

# 4.Compute TF-IDF from title
title_tfidf = IDF(inputCol='tf_title', outputCol='tf_idf_title')

# 5. Tokenize the text with a regular expression, ignoring emoji and other non-word characters
text_tokenizer = RegexTokenizer(inputCol='text',
                                outputCol='text_words',
                                pattern='\\W',
                                toLowercase=True)

# 6.Remove stopwords from text
text_sw_remover = StopWordsRemover(inputCol='text_words',
                                   outputCol='text_sw_removed')

# 7.Compute Term frequency from text
text_count_vectorizer = CountVectorizer(inputCol='text_sw_removed',
                                        outputCol='tf_text')
Example #5
# list of stopwords to be removed from the posts
StopWords = list(set(stopwords.words('english')))

labelIndexer = StringIndexer(inputCol="tags", outputCol="label").fit(train)
bs_text_extractor = BsTextExtractor(inputCol="post", outputCol="untagged_post")
regex_tokenizer = RegexTokenizer(inputCol=bs_text_extractor.getOutputCol(),
                                 outputCol="words",
                                 pattern="[^0-9a-z#+_]+")
stopword_remover = StopWordsRemover(
    inputCol=regex_tokenizer.getOutputCol(),
    outputCol="filtered_words").setStopWords(StopWords)
count_vectorizer = CountVectorizer(inputCol=stopword_remover.getOutputCol(),
                                   outputCol="countFeatures",
                                   minDF=5)
idf = IDF(inputCol=count_vectorizer.getOutputCol(), outputCol="features")
rf = RandomForestClassifier(labelCol="label",
                            featuresCol=idf.getOutputCol(),
                            numTrees=100,
                            maxDepth=4)
idx_2_string = IndexToString(inputCol="prediction", outputCol="predictedValue")
idx_2_string.setLabels(labelIndexer.labels)

# creating the pipeline
pipeline = Pipeline(stages=[
    labelIndexer, bs_text_extractor, regex_tokenizer, stopword_remover,
    count_vectorizer, idf, rf, idx_2_string
])

# fitting the model
model = pipeline.fit(train)
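
# Possible continuation (a sketch; 'test' is an assumed hold-out DataFrame with the
# same schema as 'train'): score new posts and read back the original tag strings
# produced by the IndexToString stage.
predictions = model.transform(test)  # 'test' is hypothetical here
predictions.select("post", "predictedValue").show(5, truncate=False)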
Example #6
    inferSchema=True,
    sep='\t')

data.show()
data = data.withColumnRenamed('_c0', 'class').withColumnRenamed('_c1', 'text')

data.show()

data = data.withColumn('length', length(data['text']))

data.show()

tokenizer = Tokenizer(inputCol='text', outputCol='tokens')
stop_remove = StopWordsRemover(inputCol='tokens', outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='count_vec')
idf = IDF(inputCol='count_vec', outputCol='tf_idf')

label_index = StringIndexer(inputCol='class', outputCol='label')

clean_up = VectorAssembler(inputCols=['tf_idf', 'length'],
                           outputCol='features')

nb = NaiveBayes()

data_pipe = Pipeline(
    stages=[label_index, tokenizer, stop_remove, count_vec, idf, clean_up])

cleaned = data_pipe.fit(data).transform(data)

cleaned.show()
Example #7
    # import the raw data from the dataset
    data = spark.read.csv('data/training.1600000.processed.noemoticon.csv',
                          inferSchema=True)
    data = data.select(['_c0', '_c5']).withColumnRenamed(
        '_c0', 'class').withColumnRenamed('_c5', 'text')

    # build the preprocessing pipeline
    # change label value from 0, 4 to 0, 1
    stringIndexer = StringIndexer(inputCol='class', outputCol='label')
    tokenizer = Tokenizer(inputCol='text', outputCol='tokens')
    stopwordsRemover = StopWordsRemover(inputCol='tokens',
                                        outputCol='tokens_filtered')
    countVectorizer = CountVectorizer(inputCol='tokens_filtered',
                                      outputCol='count_vec')
    # hashTF = HashingTF(numFeatures=2**16, inputCol="tokens_nonstop", outputCol='tf')
    idf = IDF(inputCol='count_vec', outputCol='features',
              minDocFreq=5)  # minDocFreq: remove sparse terms
    nb = NaiveBayes()
    lr = LogisticRegression(maxIter=100)

    customFilter = CustomFilter()
    # pipeline = Pipeline(stages=[stringIndexer, tokenizer, stopwordsRemover, hashTF, idf, nb])
    # pipeline = Pipeline(stages=[stringIndexer, tokenizer, stopwordsRemover, countVectorizer, idf, nb])
    pipeline = build_ngrams(3)

    data_train, data_val = data.randomSplit([0.8, 0.2])
    model = pipeline.fit(data_train)
    train_pred = model.transform(data_train)
    val_pred = model.transform(data_val)
    # val_acc = val_pred.filter(val_pred.label==val_pred.prediction).count() / float(data_val.count())

    val_pred.show()
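
    # Hedged sketch of the commented-out accuracy check above, expressed with an
    # evaluator (import assumed to be present in the full script).
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    acc_evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                                      predictionCol='prediction',
                                                      metricName='accuracy')
    print('Validation accuracy: %g' % acc_evaluator.evaluate(val_pred))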
Example #8
#########################################################################################

#Stop words and hashing

from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Remove stop words.
wrangled = StopWordsRemover(inputCol='words', outputCol='terms')\
      .transform(sms)

# Apply the hashing trick
wrangled = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)\
      .transform(wrangled)

# Convert hashed symbols to TF-IDF
tf_idf = IDF(inputCol='hash', outputCol='features')\
      .fit(wrangled).transform(wrangled)

tf_idf.select('terms', 'features').show(4, truncate=False)

#########################################################################################

# Training a spam classifier

# Split the data into training and testing sets
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# Fit a Logistic Regression model to the training data
logistic = LogisticRegression(regParam=0.2).fit(sms_train)

# Make predictions on the testing data
prediction = logistic.transform(sms_test)
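
# Hedged follow-up (not part of the original exercise): inspect the confusion matrix
# and the area under the ROC curve for the spam predictions.
prediction.groupBy('label', 'prediction').count().show()

from pyspark.ml.evaluation import BinaryClassificationEvaluator
auc = BinaryClassificationEvaluator(metricName='areaUnderROC').evaluate(prediction)
print('AUC: %g' % auc)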
Example #9
def main(input_dir, output_dir):
    # main logic starts here
    df_schema = types.StructType([
        types.StructField('title_clean', types.StringType()),
        types.StructField('polarity_subjectivity',
                          types.ArrayType(types.FloatType())),
        types.StructField('score', types.LongType()),
        types.StructField('num_comments', types.LongType()),
    ])

    headlines_df = spark.read.json(input_dir,
                                   encoding='utf-8',
                                   schema=df_schema).repartition(80)
    split_sentiment_df = headlines_df.withColumn(
        'polarity',
        functions.element_at(headlines_df['polarity_subjectivity'],
                             1)).withColumn(
                                 'subjectivity',
                                 functions.element_at(
                                     headlines_df['polarity_subjectivity'], 2))

    df_sentiment = split_sentiment_df.withColumn(
        'label', get_label(split_sentiment_df['polarity']))

    training_set, validation_set = df_sentiment.randomSplit([0.75, 0.25])

    headline_vector_size = 3
    word_freq_vector_size = 100

    tokenizer = Tokenizer(inputCol='title_clean', outputCol='words')
    headline2Vector = Word2Vec(vectorSize=headline_vector_size,
                               minCount=0,
                               inputCol='words',
                               outputCol='headline_vector')
    hashingTF = HashingTF(inputCol='words',
                          outputCol='word_counts',
                          numFeatures=word_freq_vector_size)
    idf = IDF(inputCol='word_counts', outputCol='word_frequency', minDocFreq=5)
    headline_vector_size_hint = VectorSizeHint(
        inputCol='headline_vector',
        size=headline_vector_size)  # needed for streaming
    word_freq_vector_size_hint = VectorSizeHint(
        inputCol='word_frequency',
        size=word_freq_vector_size)  # needed for streaming
    feature_assembler = VectorAssembler(inputCols=[
        'headline_vector', 'score', 'num_comments', 'subjectivity',
        'word_frequency'
    ],
                                        outputCol='features')
    dt_classifier = DecisionTreeClassifier(featuresCol='features',
                                           labelCol='label',
                                           predictionCol='prediction',
                                           maxDepth=9)

    pipeline = Pipeline(stages=[
        tokenizer, headline2Vector, hashingTF, idf, headline_vector_size_hint,
        word_freq_vector_size_hint, feature_assembler, dt_classifier
    ])
    sentiment_model = pipeline.fit(training_set)

    validation_predictions = sentiment_model.transform(validation_set)

    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  labelCol='label')
    validation_score = evaluator.evaluate(validation_predictions)
    print('Validation score for Sentiment model F1: %g' % (validation_score, ))

    validation_score_accuracy = evaluator.evaluate(
        validation_predictions, {evaluator.metricName: "accuracy"})
    print('Validation score for Sentiment model Accuracy: %g' %
          (validation_score_accuracy, ))

    sentiment_model.write().overwrite().save(output_dir)
Example #10
[Row(id=1, sentence=u'This is an introduction to Spark MLlib')]
>>> sent_tokenized_df.take(1)
[Row(id=1, sentence=u'This is an introduction to Spark MLlib', words=[u'this', u'is', u'an', u'introduction', u'to', u'spark', u'mllib'])]

>>> hashingTF = HashingTF(inputCol="words",outputCol="rawFeatures",numFeatures=20)
>>> sent_hfTF_df = hashingTF.transform(sent_tokenized_df)
>>> sent_hfTF_df.show(10,False)
# +---+----------------------------------------------------------+------------------------------------------------------------------+------------------------------------------------------+
# |id |sentence                                                  |words                                                             |rawFeatures                                           |
# +---+----------------------------------------------------------+------------------------------------------------------------------+------------------------------------------------------+
# |1  |This is an introduction to Spark MLlib                    |[this, is, an, introduction, to, spark, mllib]                    |(20,[1,5,6,8,12,13],[2.0,1.0,1.0,1.0,1.0,1.0])        |
# |2  |MLlib includes libraries for classification and regression|[mllib, includes, libraries, for, classification, and, regression]|(20,[1,6,9,12,13,15,16],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
# |3  |It also contains supporting tools for pipelines           |[it, also, contains, supporting, tools, for, pipelines]           |(20,[0,8,10,12,15,16],[1.0,1.0,1.0,1.0,1.0,2.0])      |
# +---+----------------------------------------------------------+------------------------------------------------------------------+------------------------------------------------------+

>>> sent_hfTF_df.take(1)
[Row(id=1, sentence=u'This is an introduction to Spark MLlib', words=[u'this', u'is', u'an', u'introduction', u'to', u'spark', u'mllib'], rawFeatures=SparseVector(20, {1: 2.0, 5: 1.0, 6: 1.0, 8: 1.0, 12: 1.0, 13: 1.0}))]
>>> idf = IDF(inputCol='rawFeatures',outputCol='idf_features')
>>> idfModel = idf.fit(sent_hfTF_df)
>>> tfidf_df = idfModel.transform(sent_hfTF_df)
>>> tfidf_df.show(10,False)
# +---+----------------------------------------------------------+------------------------------------------------------------------+------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
# |id |sentence                                                  |words                                                             |rawFeatures                                           |idf_features                                                                                                                                         |
# +---+----------------------------------------------------------+------------------------------------------------------------------+------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
# |1  |This is an introduction to Spark MLlib                    |[this, is, an, introduction, to, spark, mllib]                    |(20,[1,5,6,8,12,13],[2.0,1.0,1.0,1.0,1.0,1.0])        |(20,[1,5,6,8,12,13],[0.5753641449035617,0.6931471805599453,0.28768207245178085,0.28768207245178085,0.0,0.28768207245178085])                         |
# |2  |MLlib includes libraries for classification and regression|[mllib, includes, libraries, for, classification, and, regression]|(20,[1,6,9,12,13,15,16],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|(20,[1,6,9,12,13,15,16],[0.28768207245178085,0.28768207245178085,0.6931471805599453,0.0,0.28768207245178085,0.28768207245178085,0.28768207245178085])|
# |3  |It also contains supporting tools for pipelines           |[it, also, contains, supporting, tools, for, pipelines]           |(20,[0,8,10,12,15,16],[1.0,1.0,1.0,1.0,1.0,2.0])      |(20,[0,8,10,12,15,16],[0.6931471805599453,0.28768207245178085,0.6931471805599453,0.0,0.28768207245178085,0.5753641449035617])                        |
# +---+----------------------------------------------------------+------------------------------------------------------------------+------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+

>>> tfidf_df.take(1)
# [Row(id=1, sentence=u'This is an introduction to Spark MLlib', words=[u'this', u'is', u'an', u'introduction', u'to', u'spark', u'mllib'], rawFeatures=SparseVector(20, {1: 2.0, 5: 1.0, 6: 1.0, 8: 1.0, 12: 1.0, 13: 1.0}), idf_features=SparseVector(20, {1: 0.5754, 5: 0.6931, 6: 0.2877, 8: 0.2877, 12: 0.0, 13: 0.2877}))]
def main(argv=None):
    if argv is None:
        inputs_train = sys.argv[1]
        inputs_test = sys.argv[2]

    conf = SparkConf().setAppName('sentiment-analysis-tfidf')
    sc = SparkContext(conf=conf)
    sqlCt = SQLContext(sc)

    #read train json file and prepare data (label, feature)
    text = sqlCt.read.json(inputs_train)
    train = text.select('overall',
                        'reviewText').withColumnRenamed('overall', 'label')
    train.cache()

    ## DATA PROCESSING PIPELINE
    # Split at whitespace and characters that are not letter
    tokenizer = RegexTokenizer(inputCol="reviewText",
                               outputCol="words",
                               pattern="\\P{Alpha}+")

    # stopword remover
    remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

    pipeline_data_processing = Pipeline(stages=[tokenizer, remover])
    model_data_processing = pipeline_data_processing.fit(train)
    train_processed = model_data_processing.transform(train)
    train.unpersist()
    train_processed.cache()

    ## ML PIPELINE
    # TF-IDF Features
    hashingTF = HashingTF(inputCol="filtered_words",
                          outputCol="rawFeatures",
                          numFeatures=1000)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    # linear Regression Model
    lr = LinearRegression(maxIter=20, regParam=0.1)
    # Final Pipeline
    pipeline = Pipeline(stages=[hashingTF, idf, lr])

    # FIT MODEL USING CROSS VALIDATION
    # Parameter grid for cross validation: numFeatures and regParam
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [5000, 10000, 20000, 50000]) \
        .addGrid(lr.regParam, [0.001, 0.01, 0.1, 1.0]) \
        .build()

    # 5-fold cross validation
    evaluator = RegressionEvaluator(metricName="rmse")
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=5)

    # Run cross-validation, and choose the best set of parameters.
    model = crossval.fit(train_processed)

    # RMSE on train data
    prediction_train = model.transform(train_processed)
    rmse_train = evaluator.evaluate(prediction_train)
    train_processed.unpersist()

    ## EVALUATION ON TEST DATA
    #read test json file and prepare data (label, feature)
    text = sqlCt.read.json(inputs_test)
    test = text.select('overall',
                       'reviewText').withColumnRenamed('overall', 'label')
    test_processed = model_data_processing.transform(test)

    # Evaluate the model on test data
    prediction_test = model.transform(test_processed)
    rmse_test = evaluator.evaluate(prediction_test)

    # Print Result
    result = "MODEL WITH TF_IDF - best no. features = " \
          + str(model.bestModel.stages[0].getNumFeatures()) + ":\n"
    result = result + "-Train RMSE: " + str(rmse_train) + "\n"
    result = result + "-Test RMSE: " + str(rmse_test) + "\n"
    print(result)
Example #12
    def lda_optimal(self,
                    preprocess_file=DEFAULT_PREPROCESSING_OUTPUT,
                    cluster_df=CLUSTER_DF,
                    maxiter=MAXITER,
                    output_file_name=DEFAULT_OUTPUT_FILE,
                    max_term_tagging=m):

        filter_number_udf = udf(
            lambda row: [x for x in row if not self.is_digit(x)],
            ArrayType(StringType()))
        temp = sqlContext.read.parquet(preprocess_file)
        temp = temp.withColumn('no_number_vector_removed',
                               filter_number_udf(col('vector_no_stopw')))
        temp1 = temp.select(temp.paper_id,
                            explode(temp.no_number_vector_removed))
        temp2 = temp1.filter(temp1.col != "")
        temp3 = temp2.groupby("paper_id").agg(
            F.collect_list("col").alias("vector_removed"))
        inner_join = temp3.join(temp, ["paper_id"])
        windowSpec = Window.orderBy(F.col("paper_id"))
        df_final = inner_join.withColumn("id", F.row_number().over(windowSpec))
        df_txts = df_final.select("vector_removed", "id", "paper_id", "doi",
                                  "title", "authors", "abstract",
                                  "abstract_summary", "vector_no_stopw")
        df = sqlContext.read.format("com.databricks.spark.csv").option(
            "header",
            "true").option("inferschema",
                           "true").option("mode",
                                          "DROPMALFORMED").load(cluster_df)
        # join the cluster assignments (assumed to be keyed by 'index') to the documents
        df_txts = df.join(df_txts, df["index"] == df_txts["paper_id"])

        # TF
        cv = CountVectorizer(inputCol="vector_removed",
                             outputCol="raw_features",
                             vocabSize=5000,
                             minDF=5.0)
        cvmodel = cv.fit(df_txts)
        result_cv = cvmodel.transform(df_txts)
        # IDF
        idf = IDF(inputCol="raw_features", outputCol="features")
        idfModel = idf.fit(result_cv)
        result_tfidf = idfModel.transform(result_cv)

        from pyspark.sql import SparkSession
        from pyspark.sql.types import StructType, StructField, StringType

        spark = SparkSession.builder.appName(
            'SparkByExamples.com').getOrCreate()

        schema = StructType([
            StructField('cluster_id', StringType(), True),
            StructField('tagging', ArrayType(StringType()), True)
        ])

        topic_modeling = spark.createDataFrame(spark.sparkContext.emptyRDD(),
                                               schema)

        distinct_clusters = [
            row.cluster_id
            for row in result_tfidf.select("cluster_id").distinct().sort(
                "cluster_id").collect()
        ]
        for i in distinct_clusters:
            subset = result_tfidf.filter(result_tfidf.cluster_id == i)
            lda = LDA(k=1, maxIter=100)
            ldaModel = lda.fit(subset)
            output = ldaModel.transform(subset)
            if (i == 0):
                full_df = output
            else:
                full_df = full_df.union(output)
            topics = ldaModel.describeTopics(maxTermsPerTopic=m)
            vocabArray = cvmodel.vocabulary
            ListOfIndexToWords = udf(
                lambda wl: [vocabArray[w] for w in wl],
                ArrayType(StringType()))
            FormatNumbers = udf(lambda nl: ["{:1.4f}".format(x) for x in nl])

            taggings = topics.select(
                ListOfIndexToWords(topics.termIndices).alias('words'))
            # collect the term list for this topic before building the summary row
            tag_words = taggings.first()['words']
            temp = spark.createDataFrame([(i, tag_words)],
                                         ['cluster_id', 'tagging'])
            topic_modeling = topic_modeling.union(temp)

        # output the taggings of each topic
        # Spark DataFrames have no to_csv; convert to pandas before writing
        topic_modeling.toPandas().to_csv(output_file_name)

        return full_df
                                                      ' '))

# Text to tokens
tokenizer = Tokenizer(inputCol="text", outputCol="words")
tokenized = tokenizer.transform(wrangled)

# Remove stop words.
remover = StopWordsRemover(inputCol="words", outputCol="terms")
removed = remover.transform(tokenized)

# Apply the hashing trick
hasher = HashingTF(inputCol="terms", outputCol="hash", numFeatures=1024)
hashed = hasher.transform(removed)

# Convert hashed symbols to TF-IDF
idf = IDF(inputCol="hash", outputCol="features")
sms = idf.fit(hashed).transform(hashed)

# View the first four records
sms.show(4, truncate=False)

# Split the data into training and testing sets
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# Fit a Logistic Regression model to the training data
logistic = LogisticRegression(regParam=0.2)
logistic = logistic.fit(sms_train)

# Make predictions on the testing data
prediction = logistic.transform(sms_test)
Example #14
################# Tokenize the data
pre_process = udf(
    lambda x: re.sub(r'[^A-Za-z\n ]|(http\S+)|(www.\S+)', '', \
        x.lower().strip()).split(), ArrayType(StringType())
    )
df = df.withColumn("cleaned_data", pre_process(df.message)).dropna()

################# Split the dataframe into training and testing
train, test = df.randomSplit([0.8, 0.2], seed=100)

################# Create an ML Pipeline
# Peforms TF-IDF calculation and Logistic Regression
remover = StopWordsRemover(inputCol="cleaned_data", outputCol="words")
vector_tf = CountVectorizer(inputCol="words", outputCol="tf")
idf = IDF(inputCol="tf", outputCol="features", minDocFreq=3)
label_indexer = StringIndexer(inputCol="sentiment", outputCol="label")
logistic_regression = LogisticRegression(maxIter=100)

pipeline = Pipeline(
    stages=[remover, vector_tf, idf, label_indexer, logistic_regression])

################# Fit the pipeline to the training dataframe
trained_model = pipeline.fit(train)
'''
The StringIndexer maps the sentiment labels: positive (4) becomes 0.0
and negative (0) becomes 1.0.
'''
################# Predicting the test dataframe and calculating accuracy
prediction_df = trained_model.transform(test)
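
# Hedged follow-up: overall accuracy of the test predictions (the label and prediction
# columns come from the pipeline above).
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
accuracy = MulticlassClassificationEvaluator(metricName='accuracy').evaluate(prediction_df)
print('Test accuracy: {:.4f}'.format(accuracy))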
Example #15
def main():
    spark = SparkSession.builder.appName('AmazonReviewsSparkProcessor').getOrCreate()
    
    # Convert command line args into a map of args
    args_iter = iter(sys.argv[1:])
    args = dict(zip(args_iter, args_iter))
    
    # Retrieve the args and replace 's3://' with 's3a://' (used by Spark)
    s3_input_data = args['s3_input_data'].replace('s3://', 's3a://')
    print(s3_input_data)
    
    s3_output_data = args['s3_output_data'].replace('s3://', 's3a://')
    print(s3_output_data)
    
    schema = StructType([
        StructField('is_positive_sentiment', IntegerType(), True),
        StructField('marketplace', StringType(), True),
        StructField('customer_id', StringType(), True),
        StructField('review_id', StringType(), True),
        StructField('product_id', StringType(), True),
        StructField('product_parent', StringType(), True),
        StructField('product_title', StringType(), True),
        StructField('product_category', StringType(), True),
        StructField('star_rating', IntegerType(), True),
        StructField('helpful_votes', IntegerType(), True),
        StructField('total_votes', IntegerType(), True),
        StructField('vine', StringType(), True),
        StructField('verified_purchase', StringType(), True),
        StructField('review_headline', StringType(), True),
        StructField('review_body', StringType(), True),
        StructField('review_date', StringType(), True)
    ])
    
    df_csv = spark.read.csv(path=s3_input_data,
                            schema=schema,
                            header=True,
                            quote=None)
    df_csv.show()

    # This dataset should already be clean, but always good to double-check
    print('Showing null review_body rows...')
    df_csv.where(col('review_body').isNull()).show()

    df_csv_cleaned = df_csv.na.drop(subset=['review_body'])
    df_csv_cleaned.where(col('review_body').isNull()).show()
   
    tokenizer = Tokenizer(inputCol='review_body', outputCol='words')
    wordsData = tokenizer.transform(df_csv_cleaned)
    
    hashingTF = HashingTF(inputCol='words', outputCol='raw_features', numFeatures=1000)
    featurizedData = hashingTF.transform(wordsData)
    
    # Applying HashingTF needs only a single pass over the data, but applying IDF needs two:
    # 1) compute the IDF vector
    # 2) scale the term frequencies by IDF
    # Therefore, we cache the result of the HashingTF transformation above to speed up the second pass
    featurizedData.cache()

    # spark.mllib's IDF implementation provides an option for ignoring terms
    # which occur in less than a minimum number of documents.
    # In such cases, the IDF for these terms is set to 0.
    # This feature can be used by passing the minDocFreq value to the IDF constructor.
    idf = IDF(inputCol='raw_features', outputCol='features') #, minDocFreq=2)
    idfModel = idf.fit(featurizedData)
    features_df = idfModel.transform(featurizedData)
    features_df.select('is_positive_sentiment', 'features').show()

    # TODO:  Use SVD instead
    # features_vector_rdd = features_df.select('features').rdd.map( lambda row: Vectors.fromML(row.getAs[MLVector]('features') )
    # features_vector_rdd.cache()
    # mat = RowMatrix(features_vector_rdd)
    # k = 300
    # svd = mat.computeSVD(k, computeU=True)
    # TODO:  Reconstruct

    num_features=300
    pca = PCA(k=num_features, inputCol='features', outputCol='pca_features')
    pca_model = pca.fit(features_df)
    pca_features_df = pca_model.transform(features_df).select('is_positive_sentiment', 'pca_features')
    pca_features_df.show(truncate=False)

    standard_scaler = StandardScaler(inputCol='pca_features', outputCol='scaled_pca_features')
    standard_scaler_model = standard_scaler.fit(pca_features_df)
    standard_scaler_features_df = standard_scaler_model.transform(pca_features_df).select('is_positive_sentiment', 'scaled_pca_features')
    standard_scaler_features_df.show(truncate=False)

    expanded_features_df = (standard_scaler_features_df.withColumn('f', to_array(col('scaled_pca_features')))
        .select(['is_positive_sentiment'] + [col('f')[i] for i in range(num_features)]))
    expanded_features_df.show()

    # Removed the overwrite mode to test for this issue:
    #    https://stackoverflow.com/questions/51050591/spark-throws-java-io-ioexception-failed-to-rename-when-saving-part-xxxxx-gz
    expanded_features_df.write.csv(path=s3_output_data,
                       header=None,
                       quote=None) #,
#                       mode='overwrite')

    print('Wrote to output file:  {}'.format(s3_output_data))
Example #16
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import confusion_matrix
import pandas as pd
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, HashingTF, IDF
from pyspark.ml.classification import NaiveBayes
from pyspark.sql.functions import udf, explode, size

sns.set()
spark = SparkSession.builder.appName("TU-1").getOrCreate()

train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')

p_train=pd.DataFrame({'data':train.data,'target':train.target,'filenames':train.filenames})
p_test=pd.DataFrame({'data':test.data,'target':test.target,'filenames':test.filenames})

s_train = spark.createDataFrame(p_train)
s_test = spark.createDataFrame(p_test)

tokenizer = RegexTokenizer(inputCol='data', outputCol='words', pattern='\\W')
termFreq = HashingTF(inputCol='words', outputCol='freq')
idf = IDF(inputCol='freq', outputCol='tfidf')
nb = NaiveBayes(featuresCol="tfidf", labelCol="target")
pipeline = Pipeline(stages=[tokenizer, termFreq, idf, nb])
model = pipeline.fit(s_train)
data = model.transform(s_test)
p_data = data.sample(False,0.5).limit(500).toPandas()

mat = confusion_matrix(p_data.target,p_data.prediction)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=train.target_names, yticklabels=train.target_names)
  .setOutputCol("filtered")

from pyspark.ml.feature import CountVectorizer

# we will remove words that appear in 5 docs or less
cv = CountVectorizer(minTF=1., minDF=5., vocabSize=2**17)\
  .setInputCol("filtered")\
  .setOutputCol("tf")

# we now create a pipelined transformer
cv_pipeline = Pipeline(stages=[tokenizer, sw_filter, cv]).fit(review)
cv_pipeline.transform(review).show(5)

from pyspark.ml.feature import IDF
idf = IDF().\
    setInputCol('tf').\
    setOutputCol('tfidf')

idf_pipeline = Pipeline(stages=[cv_pipeline, idf]).fit(review)

tfidf_df = idf_pipeline.transform(review)

tfidf_df.show(10)
#training_df, validation_df, testing_df = review.randomSplit([0.6, 0.3, 0.1], seed=0)

#training_df, validation_df, testing_df = review.randomSplit([0.6, 0.3, 0.1], seed=0)
#[training_df.count(), validation_df.count(), testing_df.count()]

import pandas as pd

training_df, validation_df, testing_df = review.randomSplit([0.6, 0.3, 0.1],
                                                            seed=0)
smsDf = sqlContext.createDataFrame(smsXformed, ["label", "message"])
smsDf.cache()
smsDf.select("label", "message").show()

#Split training and testing
(trainingData, testData) = smsDf.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()
testData.collect()

#Setup pipeline
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import IDF

tokenizer = Tokenizer(inputCol="message", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), \
        outputCol="tempfeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
nbClassifier = NaiveBayes()

pipeline = Pipeline(stages=[tokenizer, hashingTF, \
                idf, nbClassifier])

nbModel = pipeline.fit(trainingData)

prediction = nbModel.transform(testData)
prediction.groupBy("label", "prediction").count().show()
Example #19
tfIdfIn = tokenized\
  .where("array_contains(DescOut, 'red')")\
  .select("DescOut")\
  .limit(10)
tfIdfIn.show(10, False)

# COMMAND ----------

from pyspark.ml.feature import HashingTF, IDF
tf = HashingTF()\
  .setInputCol("DescOut")\
  .setOutputCol("TFOut")\
  .setNumFeatures(10000)
idf = IDF()\
  .setInputCol("TFOut")\
  .setOutputCol("IDFOut")\
  .setMinDocFreq(2)

# COMMAND ----------

idf.fit(tf.transform(tfIdfIn)).transform(tf.transform(tfIdfIn)).show(10, False)

# COMMAND ----------

from pyspark.ml.feature import Word2Vec
# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame(
    [("Hi I heard about Spark".split(" "), ),
     ("I wish Java could use case classes".split(" "), ),
     ("Logistic regression models are neat".split(" "), )], ["text"])
# Learn a mapping from words to Vectors.
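# Hedged completion of the truncated step, following the standard Spark Word2Vec example.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)
model.transform(documentDF).show(truncate=False)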
Example #20
                       ,StructField("id", StringType(), True)\
                       ,StructField("date", StringType(), True)\
                       ,StructField("flag", StringType(), True)\
                       ,StructField("user", StringType(), True)\
                       ,StructField("body", StringType(), True)])

df = spark.createDataFrame(data, schema=mySchema)
df.show(5)

# Create training, validation, and test sets
(train_set, val_set, test_set) = df.randomSplit([0.98, 0.01, 0.01], seed=2000)

# Prepare TF-IDF + Logistic Regression Model
tokenizer = Tokenizer(inputCol="body", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features",
          minDocFreq=5)  #minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol="target", outputCol="label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(val_set)
train_df.show(5)

# Train Model
lr = LogisticRegression(maxIter=20)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)

# Evaluate Model
from pyspark.ml.evaluation import BinaryClassificationEvaluator
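
# Hedged completion of the truncated evaluation step: area under ROC on the validation
# predictions produced above.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                          labelCol='label')
print('Validation AUC: %g' % evaluator.evaluate(predictions))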
Example #21
plt.figure(figsize=(10, 7))
sn.heatmap(df_cm, annot=True)
plt.savefig('myfig_1.png')

print("\n")
print("=" * 40)
print("Running Logistic Regression using TF-IDF Features. Please wait.")
start = time.time()

# Maps a sequence of terms to their term frequencies using the hashing trick.
hashingTF = HashingTF(inputCol="filtered",
                      outputCol="rawFeatures",
                      numFeatures=10000)
# Compute the Inverse Document Frequency (IDF) given a collection of documents.
# minDocFreq: minimum number of documents in which a term should appear for filtering
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

pipeline = Pipeline(
    stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
lr = LogisticRegression(maxIter=20, regParam=0.1, elasticNetParam=0.2)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
end = time.time()
print("Accuracy:\t\t" +
      str(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})))
print("Weighted Precision:\t" + str(
Example #22
print("countvectorize")
df = CountVectorizer(inputCol="words",
                     outputCol="countVector",
                     vocabSize=2000,
                     minDF=8.0)\
                     .fit(df)\
                     .transform(df)\
                     .select("countVector","overall")

df.show(truncate=False)

###

from pyspark.ml.feature import IDF
print("tfidf")
df = IDF(inputCol="countVector",
         outputCol="tfidf").fit(df).transform(df).select("tfidf", "overall")
df.show()

###

from pyspark.ml.feature import PCA
print("pca")
df = PCA(k=300, inputCol="tfidf",
         outputCol="pca").fit(df).transform(df).select("pca", "overall")
df.show()
#df.show(truncate=False)

###

from pyspark.ml.regression import RandomForestRegressor
    return temp_dict


## Converting RDD to DataFrame

df = processed_papers.map(lambda record: Row(**row_conversion(record))).toDF()
#df.printSchema()

## Featurizing processed text into TF-IDF vectors
cv = sparkCountVectorizer(inputCol='body_text', outputCol='tf_vector')
cv_model = cv.fit(df)
tf_df = cv_model.transform(
    df)  ## New column tf_vector with respective term-frequency vectors

## Standardizing TF vectors into TF-IDF vectors
idf = IDF(inputCol='tf_vector', outputCol='tfidf_vector')
idf_model = idf.fit(tf_df)
tfidf_df = idf_model.transform(
    tf_df)  ## New column tfidf_vector with respective TF-IDF vectors


## Helper function to convert sparse vector to dense vector
def sparse_to_dense(v):
    v = DenseVector(v)
    dense_vector = list([float(x) for x in v])
    return dense_vector


## Converting back to RDD
papers_rdd = tfidf_df.select('paper_id', 'tfidf_vector').rdd.map(
    lambda t: (t['paper_id'], sparse_to_dense(t['tfidf_vector'])))
Example #24
def main():
    spark = SparkSession.builder \
        .appName("Spark CV-job ad matching") \
        .config("spark.some.config.option", "some-value") \
        .master("local[*]") \
        .getOrCreate()

    NUM_FEATURES = 2**8

    df_jobs = spark.read.json("alljobs4rdd/alljobs.jsonl").filter("description is not NULL").cache()
    df_jobs.registerTempTable("jobs")
    df_cvs = spark.read.json("allcvs4rdd/allcvs.jsonl").cache()
    df_cvs.registerTempTable("cvs")
    df_categories = spark.read.json("allcategories4rdd/allcategories.jsonl").cache()
    df_categories.registerTempTable("categories")

    joined = spark.sql("SELECT description AS text, jobId AS id, 'job' AS type FROM jobs UNION ALL \
               SELECT description AS text, cvid AS id, 'cv' AS type FROM cvs UNION ALL \
               SELECT skillText AS text, id AS id, 'categories' AS type FROM categories")

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tokenized = tokenizer.transform(joined)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    removed = remover.transform(tokenized)

    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=NUM_FEATURES)
    featurizedData = hashingTF.transform(removed)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    rescaledData.registerTempTable("resultTable")
    jobs = spark.sql("SELECT features, id AS jobId FROM resultTable WHERE type = 'job'")
    cvs = spark.sql("SELECT features AS featuresCV, id AS cvid FROM resultTable WHERE type = 'cv'")
    categories = spark.sql("SELECT features AS featuresCAT, cat.id, cat.skillName AS skillName, category FROM resultTable AS rt\
    LEFT JOIN categories AS cat ON rt.id = cat.id WHERE type = 'categories'")

    #Calculate job-cv similarity START
    crossJoined = jobs.select("jobId", "features").crossJoin(cvs.select("cvid", "featuresCV"))
    calculatedDF = crossJoined.rdd.map(lambda x: (x.jobId, x.cvid, calculate_distance(x.features, x.featuresCV)))\
    .toDF(["jobid", "cvid", "distance"])
    ordered = calculatedDF.orderBy(asc("jobid")).coalesce(2)
    ordered.write.csv('Calculated/tfidf/job-cv')
    #Calculate job-cv similarity END

    #Calculate cv-category similarity START
    crossJoined_cat_cv = cvs.select("cvid", "featuresCV").crossJoin(categories.select("id", "skillName", "featuresCAT", "category"))
    calculatedDF_cat_cv = crossJoined_cat_cv.rdd\
    .map(lambda x: (x.cvid, x.id, x.skillName, x.category, calculate_distance(x.featuresCV, x.featuresCAT)))\
    .toDF(["cvid", "catid", "skillName", "category", "distance"])
    ordered_cat_cv = calculatedDF_cat_cv.orderBy(asc("cvid"), asc("distance")).coalesce(2)
    ordered_cat_cv.write.csv('Calculated/tfidf/cv-category')
    #Calculate cv-category similarity END

    #Job-category START
    crossJoined_job_cat = jobs.select("jobId", "features").crossJoin(categories.select("id", "skillName", "featuresCAT", "category"))
    calculatedDF_job_cat = crossJoined_job_cat.rdd\
    .map(lambda x: (x.jobId, x.id, x.skillName, x.category, calculate_distance(x.features, x.featuresCAT)))\
    .toDF(["jobid", "catid", "skillName", "category", "distance"])
    ordered_job_cat = calculatedDF_job_cat.orderBy( asc("distance")).coalesce(2)
    ordered_job_cat.write.csv('Calculated/tfidf/job-category')
Example #25
swr = StopWordsRemover(inputCol='text_token', outputCol='text_sw_removed')
reviews_swr = swr.transform(reviews_token)
reviews_swr.show(3)

# In[9]:
# Word Term Frequency
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol="text_sw_removed", outputCol="tf")
cv_model = cv.fit(reviews_swr)
reviews_cv = cv_model.transform(reviews_swr)
reviews_cv.show(3)

# In[10]:
# TF-IDF
from pyspark.ml.feature import IDF
idf = IDF(inputCol="tf", outputCol="features")
idf_model = idf.fit(reviews_cv)
reviews_tfidf = idf_model.transform(reviews_cv)
reviews_tfidf.show(3)

# In[11]:
# Predict Rating Score (Repeat What we did in Lecture 10)
gradings = reviews_tfidf.select('funny', 'cool', 'useful', 'stars').toPandas()
sns.distplot(gradings['funny'])
sns.distplot(gradings['cool'])
sns.distplot(gradings['useful'])
sns.distplot(gradings['stars'])

from pyspark.ml.feature import StringIndexer
stringIdx = StringIndexer(inputCol="stars", outputCol="label")
final = stringIdx.fit(reviews_tfidf).transform(reviews_tfidf)
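
# Possible continuation (a sketch, not the original lecture code): fit a simple
# classifier on the TF-IDF features and the indexed star labels. No train/test split
# here; this only illustrates the API.
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=20)
lr_model = lr.fit(final)
lr_model.transform(final).select('stars', 'label', 'prediction').show(5)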
Example #26
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceData = spark.createDataFrame(
    [(0.0, "Hi I heard about Spark"),
     (0.0, "I wish Java could use case classes"),
     (1.0, "Logistic regression models are neat")], ["label", "sentence"])

sentenceData.show()

tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
words_data = tokenizer.transform(sentenceData)
words_data.show(truncate=False)

hashing_tf = HashingTF(inputCol='words', outputCol='rawFeatures')
featurized_data = hashing_tf.transform(words_data)
idf = IDF(inputCol='rawFeatures', outputCol='features')
idf_model = idf.fit(featurized_data)
rescaled_data = idf_model.transform(featurized_data)
rescaled_data.select('label', 'features').show(truncate=False)

from pyspark.ml.feature import CountVectorizer

df = spark.createDataFrame([(0, "a b c".split(" ")),
                            (1, "a b b c a".split(" "))], ["id", "words"])

df.show()

cv = CountVectorizer(inputCol='words',
                     outputCol='features',
                     vocabSize=3,
                     minDF=2.0)
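
# Hedged completion, mirroring the standard CountVectorizer example: fit the vectorizer,
# inspect the learned vocabulary, and transform the toy DataFrame.
cv_model = cv.fit(df)
print(cv_model.vocabulary)
cv_model.transform(df).show(truncate=False)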
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol="filtered",
    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4)

if algo == "gbm":
    ## Create GBM model
    algoStage = H2OGBM(ratio=0.8,
                       seed=1,
                       featuresCols=[idf.getOutputCol()],
                       predictionCol="label")
elif algo == "dl":
    ## Create H2ODeepLearning model
    algoStage = H2ODeepLearning(epochs=10,
                                seed=1,
                                l1=0.001,
                                l2=0.0,
                                hidden=[200, 200],
                                featuresCols=[idf.getOutputCol()],
Example #28
# Make predictions on the testing data
predictions = pipeline.transform(flights_test)

--------------------------------------------------
# Exercise_3 
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])

--------------------------------------------------
# Exercise_4 
# Create an empty parameter grid
params = ParamGridBuilder().build()

# Create objects for building and evaluating a regression model
regression = LinearRegression(labelCol='duration')
evaluator = RegressionEvaluator(labelCol='duration')

# Create a cross validator
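# Hedged completion of the truncated step (numFolds is an assumption): wire the pieces
# above into a cross validator.
cv = CrossValidator(estimator=regression,
                    estimatorParamMaps=params,
                    evaluator=evaluator,
                    numFolds=5)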
Example #29
def main(sc, sqlContext):

    #start = timer()

    #print '---Pegando usuario, posts, tokens e categorias do MongoDB---'
    #start_i = timer()
    user = findUserById(iduser)
    posts = findPosts(user)

    tokens, category, categoryAndSubcategory = getTokensAndCategories()
    postsRDD = (sc.parallelize(posts).map(lambda s: (s[
        0], word_tokenize(s[1].lower()), s[2], s[3])).map(lambda p: (p[
            0], [x for x in p[1] if x in tokens], p[2], p[3])).cache())

    #print '####levou %d segundos' % (timer() - start_i)

    #print '---Pegando produtos do MongoDB---'
    #start_i = timer()

    #print '####levou %d segundos' % (timer() - start_i)

    #print '---Criando corpusRDD---'
    #start_i = timer()
    stpwrds = stopwords.words('portuguese')
    corpusRDD = (postsRDD.map(lambda s: (s[0], [
        PorterStemmer().stem(x) for x in s[1] if x not in stpwrds
    ], s[2], s[3])).filter(lambda x: len(x[1]) >= 20 or
                           (x[2] == u'Post' and len(x[1]) > 0)).cache())
    #print '####levou %d segundos' % (timer() - start_i)

    #print '---Calculando TF-IDF---'
    #start_i = timer()
    wordsData = corpusRDD.map(
        lambda s: Row(label=int(s[0]), words=s[1], type=s[2]))
    wordsDataDF = sqlContext.createDataFrame(wordsData).unionAll(
        sqlContext.read.parquet(
            "/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"))

    numTokens = len(tokens)
    hashingTF = HashingTF(inputCol="words",
                          outputCol="rawFeatures",
                          numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)

    idfModel = idf.fit(featurizedData)
    tfIDF = idfModel.transform(featurizedData).cache()

    postTFIDF = (
        tfIDF.filter(tfIDF.type == u'Post')
        #.map(lambda s: Row(label=s[0], type=s[1], words=s[2], rawFeatures=s[3], features=s[4], sentiment=SVM.predict(s[4])))
        .cache())

    #postTFIDF = postTFIDF.filter(lambda p: p.sentiment == 1)
    #print '####levou %d segundos' % (timer() - start_i)

    #print '---Carregando modelo---'
    #start_i = timer()
    NB = NaiveBayesModel.load(
        sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    SVM = SVMModel.load(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")
    #print '####levou %d segundos' % (timer() - start_i)

    #print '---Usando o modelo---'
    #start_i = timer()
    predictions = (postTFIDF.map(lambda p: (NB.predict(p.features), p[
        0], SVM.predict(p.features))).filter(lambda p: p[2] == 1).map(
            lambda p: (p[0], p[1])).groupByKey().mapValues(list).collect())

    #print '####levou %d segundos' % (timer() - start_i)
    #print '---Calculando similaridades---'
    #start_i = timer()
    suggestions = []

    for prediction in predictions:
        category_to_use = category[int(prediction[0])]
        #print ' Calculando similaridades para a categoria: {}'.format(category_to_use)
        tf = tfIDF.filter(tfIDF.type == category_to_use).cache()
        for post in prediction[1]:
            postVector = postTFIDF.filter(
                postTFIDF.label == post).map(lambda x: x.features).collect()[0]
            sim = (tf.map(lambda x: (
                post, x.label, cossine(x.features, postVector))).filter(
                    lambda x: x[2] >= threshold).collect())
            if len(sim) > 0:
                suggestions.append(sim)

    #print '####levou %d segundos' % (timer() - start_i)

    if len(suggestions) > 0:
        #print '---Inserindo recomendacoes no MongoDB---'
        #start_i = timer()
        insertSuggestions(suggestions, iduser, posts)
print "Text is cleaned"

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])

print "Random split is done"

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review",
    outputCol="wordsNoSw",
    stopwords=set(nltk.corpus.stopwords.words('english')))
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(),
                       outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = LogisticRegression(featuresCol=idf.getOutputCol(),
                        labelCol=string_indexer.getOutputCol(),
                        maxIter=30,
                        regParam=0.01)

pipeline = Pipeline(
    stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed',
                                              metricName='precision')

# grid=(ParamGridBuilder()
#      .baseOn([evaluator.metricName,'precision'])