Example 1
def vectorizeCV(DF):
    
    vectorizer = CountVectorizer()
    #cv = CountVectorizer(minDF=.0001, inputCol="raw", outputCol="features", binary=True)
    cv = CountVectorizer(minDF=1, inputCol="raw", outputCol="features", binary=True)
    
    model = cv.fit(DF)
    result = model.transform(DF)
    
    return result, model
def pre_processing(df):
    # fit a CountVectorizerModel from the corpus.
    cv = CountVectorizer(inputCol="words",
                         outputCol="features",
                         vocabSize=3,
                         minDF=2.0)

    model = cv.fit(df)

    result = model.transform(df)
    result.show(truncate=False)
Example 3
def functions_for_deal_with_texts_3(spark, resources_folder):
    df = spark.createDataFrame([(0, "a b c".split(" ")),
                                (1, "a b b c a".split(" "))], ["id", "words"])
    df.show()
    cv = CountVectorizer(inputCol='words',
                         outputCol='features',
                         vocabSize=3,
                         minDF=2.0)
    model = cv.fit(df)
    result = model.transform(df)
    result.show(truncate=False)
Example 4
def convertToVec(df, sc, ss, outputName, inputCol='tokens'):
    cv=CountVectorizer(inputCol=inputCol, outputCol='vectors',minTF=1.0)
    vecModel=cv.fit(df)
    print('\n\n\n Get Vocab... \n\n\n')
    inv_voc=vecModel.vocabulary 
    f = codecs.open(outputName+'_vocab.txt', encoding='utf-8', mode='w')
    for item in inv_voc:
        f.write(u'{0}\n'.format(item))
    f.close()
    vectors= vecModel.transform(df).select('id','subreddit','vectors')
    return vectors
Example 5
def sparsify(ngrams_df, model):
    if model is None:
        # TASK 6a: Binary CountVectorizer
        cv = CountVectorizer(minDF=10,
                             binary=True,
                             inputCol="split_ngrams",
                             outputCol="sparse_vector")
        model = cv.fit(ngrams_df)

    sparsified = model.transform(ngrams_df)
    return model, sparsified
Example 6
def main():
    for tn in tablenames:
        data = spark.read.format("org.apache.spark.sql.cassandra")\
                    .options(table=tn, keyspace=keyspace).load().limit(1000)

        data = data.sort('imdb_score', ascending=False)

        desc = data.rdd.map(lambda x: x['description']).filter(
            lambda x: x is not None)

        StopWords = nltk.corpus.stopwords.words('english')
        StopWords.extend([" ...                See full summary"])

        tokenized = desc.map( lambda y: y.strip().lower()).map( lambda x: re.split(" ", x))\
            .map( lambda word: [x for x in word if x.isalpha()]).map( lambda word: [x for x in word if len(x) > 3] )\
            .map( lambda word: [x for x in word if x not in StopWords]).zipWithIndex()

        df_txts = spark.createDataFrame(tokenized, ["words", 'index'])
        countVec = CountVectorizer(inputCol="words",
                                   outputCol="raw_features",
                                   vocabSize=5000,
                                   minDF=10.0)
        CountVectMod = countVec.fit(df_txts)
        result = CountVectMod.transform(df_txts)
        idf = IDF(inputCol="raw_features", outputCol="features")
        idfModel = idf.fit(result)
        resultTFIdf = idfModel.transform(result)

        totalTopics = 10
        totalItr = 100
        LDAModel = MLlibLDA.train(resultTFIdf.select('index','features').rdd.mapValues(MLlibVectors.fromML).map(list),\
                        k=totalTopics, maxIterations=totalItr)

        maxwordsTopic = 5
        topicIndices = sc.parallelize(
            LDAModel.describeTopics(maxTermsPerTopic=5))
        VCarr = CountVectMod.vocabulary

        def finalTopic(topic):
            terms = topic[0]
            result = []
            for i in range(maxwordsTopic):
                term = VCarr[terms[i]]
                result.append(term)
            return result

        topics_final = topicIndices.map(
            lambda topic: finalTopic(topic)).collect()
        print(topics_final)
        for topic in range(len(topics_final)):
            print("Topic" + str(topic) + ":")
            for term in topics_final[topic]:
                print(term)
            print('\n')
    def calTFIDF(self, dataset, colName):
        cv = CountVectorizer(inputCol=colName, outputCol="rawFeatures")
        cvmodel = cv.fit(dataset)
        featurizedData = cvmodel.transform(dataset)

        vocab = cvmodel.vocabulary
        vocab_broadcast = sparkTest.sparkContext.broadcast(vocab)

        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(featurizedData)
        rescaledData = idfModel.transform(featurizedData)  # TFIDF
        return self.clusteredData(rescaledData, cvmodel)
Example 8
def task6a_create_feature_vector(labeled_comments, countVecModel=None):
    # TASK 6A
    # Code for task 6A...
    if countVecModel is None:
        cv = CountVectorizer(inputCol="ngrams",
                             outputCol="features",
                             minDF=10.0,
                             binary=True)
        my_countVecModel = cv.fit(labeled_comments)
    else:
        my_countVecModel = countVecModel
    result = my_countVecModel.transform(labeled_comments)
    return result, my_countVecModel
Example 9
def create_dictionary(rdd, dict_length):
    from pyspark.ml.feature import CountVectorizer
    df = rdd.toDF(['text', 'rating'])
    filled_df = df.na.fill(0)
    cv = CountVectorizer(inputCol="text",
                         outputCol="vectors",
                         vocabSize=dict_length)
    model_cv = cv.fit(filled_df)
    dictionary = {
        k: v + DICTIONARY_OFFSET
        for v, k in enumerate(model_cv.vocabulary)
    }
    return dictionary
Example 10
    def test_count_vectorizer_with_binary(self):
        dataset = self.spark.createDataFrame([
            (0, "a a a b b c".split(' '), SparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0}),),
            (1, "a a".split(' '), SparseVector(3, {0: 1.0}),),
            (2, "a b".split(' '), SparseVector(3, {0: 1.0, 1: 1.0}),),
            (3, "c".split(' '), SparseVector(3, {2: 1.0}),)], ["id", "words", "expected"])
        cv = CountVectorizer(binary=True, inputCol="words", outputCol="features")
        model = cv.fit(dataset)

        transformedList = model.transform(dataset).select("features", "expected").collect()

        for r in transformedList:
            feature, expected = r
            self.assertEqual(feature, expected)
Example 11
def train(labeled_df):
    ''' train to get pos and neg models '''
    cv = CountVectorizer(inputCol="ngrams_combined",
                         binary=True,
                         outputCol="features",
                         minDF=10.0)
    cvModel = cv.fit(labeled_df)
    labeled_df = cvModel.transform(labeled_df)
    cvModel.save("cvModel")
    poslr = LogisticRegression(labelCol="poslabel",
                               featuresCol="features",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="neglabel",
                               featuresCol="features",
                               maxIter=10)
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator(labelCol="poslabel")
    negEvaluator = BinaryClassificationEvaluator(labelCol="neglabel")
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    pos = labeled_df
    neg = labeled_df
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)

    # Once we train the models, we don't want to do it again. We can save the models and load them again later.
    posModel.save("pos.model")
    negModel.save("neg.model")
    return cvModel, posModel, negModel
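The comments above note that the single-value grid ([1.0]) is only a placeholder and that a list of candidate values can be searched instead; a minimal hedged sketch of a wider grid for the positive classifier, reusing names from the example (the regParam and elasticNetParam values are purely illustrative):

# Hedged sketch: a wider, illustrative grid for poslr; values are examples,
# not taken from the source.
widerPosParamGrid = ParamGridBuilder() \
    .addGrid(poslr.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(poslr.elasticNetParam, [0.0, 0.5]) \
    .build()  # 3 x 2 = 6 candidate models evaluated per fold

posCrossvalWide = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=widerPosParamGrid,
                                 numFolds=5)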
def count_vectorizer_usecase():
    spark = getSparkSession()
    df = spark.createDataFrame([(0, "a b".split(" ")),
                                (1, "a b b c a".split(" "))], ["id", "words"])
    """
        vocabSize=>指定字典的大小 
        minDF=>指定最少的文档数目
    """
    cv = CountVectorizer(inputCol="words", outputCol="features")

    model = cv.fit(df)

    result = model.transform(df)
    result.show(truncate=False)
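The docstring above describes vocabSize and minDF; a minimal hedged sketch on the same toy DataFrame showing their effect (df is the two-row corpus built above, names are illustrative):

# Hedged sketch: cap the vocabulary and require each term to occur in at
# least two documents. In this corpus "c" appears in only one row, so with
# minDF=2.0 it is dropped; vocabSize=3 caps the dictionary at three terms.
cv_limited = CountVectorizer(inputCol="words", outputCol="features",
                             vocabSize=3, minDF=2.0)
model_limited = cv_limited.fit(df)
print(model_limited.vocabulary)   # only "a" and "b" survive for this corpus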
Example 13
    def fit(self):
        sqlContext = SparkSession.builder.getOrCreate()
        if self.test:
            df = sqlContext.sql(
                "select * from cmp_tmp_user_identification where dt='2014-01'")
        else:
            df = sqlContext.sql("select * from cmp_tmp_user_identification")

        if self.tweet and self.retweet:
            df = df.withColumn('content', F.concat('text', 'retweeted'))
        elif self.tweet:
            df = df.filter("retweeted==' '")
            df = df.withColumn('content', F.col('text'))
        elif self.retweet:
            df = df.filter('length(retweeted)>1')
            df = df.withColumn('content', F.col('retweeted'))

        df = df.withColumn('content', textCut(clean_text('content')))
        ##stopwords
        remover = StopWordsRemover(inputCol="content",
                                   outputCol="words",
                                   stopWords=self.stopwords)
        df = remover.transform(df)
        ## drop rows with empty token lists
        df = df.filter('size(words)>0')
        self.sentence_length_distribution = df.selectExpr(
            'size(words) as wz').groupBy('wz').count().toPandas().set_index(
                'wz').sort_index()
        ###vec
        cv = CountVectorizer(inputCol='words',
                             outputCol='vectors',
                             minDF=self.minDF,
                             minTF=self.minTF)
        model_cv = cv.fit(df)
        word2bag = model_cv.vocabulary
        self.baglen = len(word2bag)
        self.dictionary = dict(
            zip(word2bag, ['W' + str(i) for i in range(1, self.baglen + 1)]))
        sc = SparkContext.getOrCreate()
        diction = sc.broadcast(self.dictionary)

        ## join tokens into space-separated text (English-like format) for the GCN input
        df = df.withColumn('words_space', toSpaceSplit('words'))
        result_df = df.selectExpr('uid,label,identity,words_space'.split(','))
        ##aggregate to user level
        result_df = result_df.groupBy('uid', 'label', 'identity').agg(
            F.collect_list('words_space').alias('uid_words'))
        result_df = result_df.withColumn('uid_words', concat_uid('uid_words'))
        return result_df
    def vectorize_data(self):
        """
        Convert each list of tokens into vectors of token counts

        :return: vectors of token counts
        """
        columns = self.df.schema.names
        for column_name in columns:
            if "_words" in column_name:
                count = CountVectorizer(inputCol=column_name,
                                        outputCol=column_name +
                                        "_raw_features")
                model = count.fit(self.df)
                self.df = model.transform(self.df)
                self.df.drop(column_name).collect()
Example 15
def vectorizeCV(fullDF, sampleDF, minDocFrec):

    vectorizer = CountVectorizer()
    cv = CountVectorizer(minDF=minDocFrec,
                         inputCol="raw",
                         outputCol="features",
                         binary=True)

    if sampleDF == None:
        model = cv.fit(fullDF)
    else:
        model = cv.fit(sampleDF)
    result = model.transform(fullDF)

    return result, model
 def preprocess(self,df):
     # convert input dataframe to document. 
     document_assembler = DocumentAssembler() \
         .setInputCol("headline_text") \
         .setOutputCol("document") \
         .setCleanupMode("shrink")
     # Split sentence to tokens(array)
     tokenizer = Tokenizer() \
         .setInputCols(["document"]) \
         .setOutputCol("token")
     # clean
     normalizer = Normalizer() \
         .setInputCols(["token"]) \
         .setOutputCol("normalized")
     # remove stopwords
     stopwords_cleaner = StopWordsCleaner() \
         .setInputCols("normalized") \
         .setOutputCol("cleanTokens") \
         .setCaseSensitive(False)
     # stem the words to bring them to the root form.
     stemmer = Stemmer() \
         .setInputCols(["cleanTokens"]) \
         .setOutputCol("stem")
     # bring back the expected structure viz. array of tokens.
     finisher = Finisher() \
         .setInputCols(["stem"]) \
         .setOutputCols(["tokens"]) \
         .setOutputAsArray(True) \
         .setCleanAnnotations(False)
     # build preprocess pipeline
     preprocess_pipeline = Pipeline(
         stages=[document_assembler, 
                 tokenizer,
                 normalizer,
                 stopwords_cleaner, 
                 stemmer, 
                 finisher])
     # train the pipeline
     preprocess = preprocess_pipeline.fit(df)
     # apply the pipeline to transform dataframe.
     processed_df  = preprocess.transform(df)
     # select the columns that we need
     tokens_df = processed_df.select('publish_date','tokens').limit(10000)
     cv = CountVectorizer(inputCol="tokens", outputCol="features", vocabSize=500, minDF=3.0)
     # train the model
     cv_model = cv.fit(tokens_df)
     # transform the data. Output column name will be features.
     vectorized_tokens = cv_model.transform(tokens_df)
Example 17
def main(context):
    """Main function takes a Spark SQL context."""
    """Task1"""
    #commentsDF = sqlContext.read.json("comments-minimal.json.bz2")
    #submissionsDF = sqlContext.read.json("submissions.json.bz2")
    #commentsDF.write.parquet("comments.parquet")
    #submissionsDF.write.parquet("submissions.parquet")
    labeledDF = sqlContext.read.format("csv").options(
        header='true', inferschema='true').load("labeled_data.csv")
    commentsDF = sqlContext.read.parquet("comments.parquet")
    submissionsDF = sqlContext.read.parquet("submissions.parquet")
    """Task2"""
    #data = labeled_data.join(comments, comments("id")===labeled_data("Input_id"), "inner").select("id","body","labeldem","labelgop","labeldjt")
    commentsDF.createOrReplaceTempView("comments")
    labeledDF.createOrReplaceTempView("labeled_data")
    dataDF = sqlContext.sql(
        "SELECT id, body, labeldem, labelgop, labeldjt FROM comments INNER JOIN labeled_data ON comments.id = labeled_data.Input_id"
    )
    '''drop the temp view to save memory (RAM)'''
    """Task4"""
    dataDF.createOrReplaceTempView("data")
    sqlContext.udf.register("sanitize_udf", sanitize)
    dataDF = sqlContext.sql("SELECT *, sanitize_udf(body) AS ngrams FROM data")
    """Task5"""
    dataDF.createOrReplaceTempView("data")
    sqlContext.udf.register("select_udf", select)
    data = sqlContext.sql(
        "SELECT id, body, labeldem, labelgop, labeldjt, select_udf(ngrams) AS selected_ngrams FROM data"
    )
    #data = sqlContext.sql("SELECT id, body, labeldem, labelgop, labeldjt, vectorize_udf(ngrams) AS vectorized_ngrams FROM data")
    """Task6A"""
    vectorized_data = data.withColumn(
        "selected_ngrams",
        split(col("selected_ngrams"), " ").cast(ArrayType(StringType())))
    cv = CountVectorizer(inputCol="selected_ngrams",
                         outputCol="vector",
                         minDF=5.0)
    cv_model = cv.fit(vectorized_data)
    vectorized = cv_model.transform(vectorized_data)
    #vectorized.show(1, truncate=False)
    """Task6B"""
    vectorized.createOrReplaceTempView("Vectorized")
    sqlContext.udf.register("check_pos", check_pos)
    sqlContext.udf.register("check_neg", check_neg)
    new_vectorized = sqlContext.sql(
        "SELECT id, body, labeldem, labelgop, labeldjt, selected_ngrams, vector, check_pos(labeldjt) AS pos_label, check_neg(labeldjt) AS neg_label FROM Vectorized"
    )
    new_vectorized.show(3, False)
Example 18
def main(*args):
    if len(args) != 2:
        print("Please provide one input and one output directories!")
        sys.exit(1)

    input_fn, output_fn = args[0],args[1]
    conf = SparkConf()
    conf.setAppName("grant")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # Load the abstract content in the test folder into spark, 
    # clean text, tokenize the corpus, and stem the words
    abstract = sc.textFile(input_fn)
    df_abs = (abstract.map(lambda doc: text_cleaning(doc))
                      .filter(lambda doc: len(doc) > 0)
                      .filter(lambda line: not line.startswith('app'))
                      .map(lambda doc: doc.split(' '))
                      .map(lambda word: [x for x in word if len(x)>0])
                      .map(lambda word: stem(word))
                      .map(lambda doc: (int(doc[0]), doc[1:]))
                      .filter(lambda doc: len(doc[1])>0)
                      .toDF(['Id','words']))
    # build the pipeline and lda model with online optimizer
    stop_words = StopWordsRemover(inputCol='words',
                             outputCol='clean')
    stop_words.setStopWords(stop_words.loadDefaultStopWords('english'))
    countv = CountVectorizer(inputCol=stop_words.getOutputCol(), 
                             outputCol="tokens")
    idf = IDF(inputCol=countv.getOutputCol(),outputCol="features")
    lda = LDA(maxIter=10,k=10,optimizer='online')
    pipeline = Pipeline(stages=[stop_words, countv, idf, lda])
    lda_model = pipeline.fit(df_abs)
    labels = lda_model.transform(df_abs)
    
    # identify the label as the topic with the max probability
    # save the label to file
    topic_labels = (labels.select('Id','topicDistribution')
                          .rdd
                          .map(lambda x: (x[0],np.argmax(x[1])))
                          .saveAsTextFile(os.path.join(output_fn,'labels')))
    # Get the topics
    wordnum = 5 # choose the number of topic words
    vocabulary = lda_model.stages[1].vocabulary
    voc_bv = sc.broadcast(vocabulary)
    topic_df = (lda_model.stages[3].describeTopics(wordnum)
                     .rdd
                     .map(lambda x: (x[0],[voc_bv.value[Id] for Id in x[1]],x[2]))
                     .saveAsTextFile(os.path.join(output_fn,'words')))
Example 19
def LDAThis(sc, RDD, minFreq, numTopics, maxIter, wordsPerTopic):
    '''
Arguments:
     sc: A SparkContext Object
     RDD: An RDD with rows as tokenized sentences
     minFreq: Minimum document frequency for CountVectorizer
     numTopics: Number of Topics
     maxIter: Max number of iterations for LDA train
     wordsPerTopic: Number of words to show per topic
Requirements:
     sqlContext = SQLContext(sc) <- must be defined outside function
     '''
    StopWords = stopwords.words("english")
    sqlContext = SQLContext(sc)
    # Structure Data
    idRDD = RDD.map(
        lambda words: [x for x in words if x.isalpha() and x not in StopWords
                       ]).filter(lambda x: len(x) > 2).zipWithIndex()
    idDF = sqlContext.createDataFrame(idRDD, ["tokens", 'index'])
    # Term Frequency
    CVecModel = CountVectorizer(inputCol="tokens",
                                outputCol="rawFeatures",
                                vocabSize=5000,
                                minDF=minFreq).fit(idDF)
    resultCVec = CVecModel.transform(idDF)
    vocabArray = CVecModel.vocabulary
    #IDF
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(resultCVec)
    resultTFIDF = idfModel.transform(resultCVec)
    # LDA
    resultLDA = LDA.train(resultTFIDF.select(
        'index', 'features').rdd.mapValues(Vectors.fromML).map(list),
                          k=numTopics,
                          maxIterations=maxIter)
    topicIndices = sc.parallelize(
        resultLDA.describeTopics(maxTermsPerTopic=wordsPerTopic))
    topicsFinal = topicIndices.map(lambda topic: render_topics(
        topic, wordsPerTopic, vocabArray)).collect()

    # Show Topics
    for topic in range(len(topicsFinal)):
        print("Topic" + str(topic) + ":")
        for term in topicsFinal[topic]:
            print(term)
        print('\n')
    return resultLDA
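The docstring above lists the arguments and the external requirements; a hypothetical call site (the RDD name and the parameter values are illustrative only):

# Hedged sketch of a call; sc is an existing SparkContext and tokenizedRDD is
# assumed to be an RDD whose rows are lists of lowercase tokens, e.g.:
# tokenizedRDD = sc.textFile("docs.txt").map(lambda line: line.lower().split())
resultLDA = LDAThis(sc, tokenizedRDD,
                    minFreq=5,        # minDF passed to CountVectorizer
                    numTopics=10,     # k for LDA
                    maxIter=50,       # LDA training iterations
                    wordsPerTopic=5)  # terms printed per topic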
Example 20
def preprocess(inputCol=["text", "label"], n=4):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    remover = [StopWordsRemover(inputCol="words", outputCol="filtered")]
    ngrams = [
        NGram(n=i, inputCol="filtered", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    cv = [
        CountVectorizer(vocabSize=2**14,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i)) for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_tf".format(i),
            outputCol="{0}_tfidf".format(i),
            minDocFreq=2) for i in range(1, n + 1)
    ]

    assembler = [
        VectorAssembler(
            inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
            outputCol="rawFeatures")
    ]
    label_stringIdx = [StringIndexer(inputCol="label", outputCol="labels")]
    selector = [
        ChiSqSelector(numTopFeatures=2**14,
                      featuresCol='rawFeatures',
                      outputCol="features")
    ]
    lr = [LogisticRegression(maxIter=1000)]
    return Pipeline(stages=tokenizer + remover + ngrams + cv + idf +
                    assembler + label_stringIdx + selector + lr)
Example 21
def base_features_gen_pipeline(input_descript_col="descript",
                               input_category_col="category",
                               output_feature_col="features",
                               output_label_col="label"):
    '''
    Token -> Vectors -> Label -> selector/transformer -> pipeline 
    '''

    #tokenizing the reviews in the input
    word_tokenizer = Tokenizer(inputCol="descript", outputCol="words")

    #Count Vectorizing using Bag of Words model
    count_vectors = CountVectorizer(inputCol="words", outputCol="features")

    #Labelling data for supervised learning
    label_maker = StringIndexer(inputCol="category", outputCol="label")

    #Transformer
    selector = Selector(outputCols=['id', 'features', 'label'])

    #constructing the data
    pipeline = Pipeline(
        stages=[word_tokenizer, count_vectors, label_maker, selector])

    return pipeline
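A hedged usage sketch of the pipeline returned above, assuming a DataFrame train_df with id, descript, and category columns, and that the Selector transformer is defined (as in a later example):

# Hypothetical usage; column names follow the defaults used in the function.
pipeline = base_features_gen_pipeline()
pipeline_model = pipeline.fit(train_df)
prepared_df = pipeline_model.transform(train_df)
prepared_df.select("id", "features", "label").show(5, truncate=False)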
Example 22
def run_ml_pipeline(nlpPipelineDF, num_topics, max_iterations, vocabSize,
                    minDF, maxDF):
    """Define a Spark LDA topic modelling pipeline"""
    cv = CountVectorizer(
        inputCol="allTokens",
        outputCol="features",
        vocabSize=vocabSize,
        minDF=minDF,
        maxDF=maxDF,
        minTF=1.0,
    )
    idf = IDF(inputCol="features", outputCol="idf")
    lda = LDA(
        k=num_topics,
        maxIter=max_iterations,
        optimizer="online",
        seed=1,
        learningOffset=100.0,  # larger values downweight early iterations more heavily
        learningDecay=0.51,  # set within (0.5, 1.0] to guarantee asymptotic convergence
    )

    mlPipeline = Pipeline(stages=[cv, idf, lda])
    mlModel = mlPipeline.fit(nlpPipelineDF)
    ldaModel = mlModel.stages[2]
    return mlModel, ldaModel
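A hypothetical invocation of the pipeline above; the values are illustrative and assume nlpPipelineDF already carries an allTokens array column:

# Hedged sketch of a call site.
mlModel, ldaModel = run_ml_pipeline(nlpPipelineDF,
                                    num_topics=20,
                                    max_iterations=50,
                                    vocabSize=5000,
                                    minDF=5.0,
                                    maxDF=0.8)
ldaModel.describeTopics(5).show(truncate=False)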
Example 23
def fit_tfidf_pipeline(content_df):
    tokenizer = RegexTokenizer(). \
        setGaps(False). \
        setPattern('\\p{L}+'). \
        setInputCol('content'). \
        setOutputCol('words')

    sw = StopWordsRemover() \
        .setStopWords(stop_words) \
        .setCaseSensitive(False) \
        .setInputCol("words") \
        .setOutputCol("filtered")

    cv = CountVectorizer(). \
        setInputCol('filtered'). \
        setOutputCol('tf'). \
        setMinTF(1). \
        setMinDF(10). \
        setVocabSize(2 ** 17)

    # fit dataframe_df
    cv_transformer = Pipeline(stages=[tokenizer, sw, cv]).fit(content_df)

    idf = IDF(minDocFreq=10). \
        setInputCol('tf'). \
        setOutputCol('tfidf')

    tfidf_transformer = Pipeline(stages=[cv_transformer, idf]).fit(content_df)

    return tfidf_transformer
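The returned PipelineModel nests the fitted tokenizer/stop-word/CountVectorizer pipeline as its first stage, so the vocabulary stays reachable; a hedged usage sketch assuming content_df exists:

# Hypothetical usage: apply the fitted transformer and pull out the vocabulary.
tfidf_transformer = fit_tfidf_pipeline(content_df)
tfidf_df = tfidf_transformer.transform(content_df)    # adds 'tf' and 'tfidf' columns
cv_model = tfidf_transformer.stages[0].stages[-1]     # fitted CountVectorizerModel
print(len(cv_model.vocabulary))                       # vocabulary size actually kept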
Example 24
def UsefulnessPredictionLDA(trainingdata, model):
    # Data Preprocessing
    tokenizer = Tokenizer(inputCol="review_text", outputCol="tokens_word")

    remover = StopWordsRemover(inputCol="tokens_word",
                               outputCol="filtered_tokens_word")
    cv = CountVectorizer(inputCol="filtered_tokens_word",
                         outputCol="raw_features",
                         minDF=2.0)
    idf = IDF(inputCol="raw_features", outputCol="features")

    # Extract LDA topic feature
    lda = LDA(k=30, maxIter=10)
    if model == 'RandomForest':
        model = RandomForestRegressor(featuresCol="topicDistribution")
    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, lda, model])
    evaluator_rmse = RegressionEvaluator(labelCol="label",
                                         predictionCol="prediction",
                                         metricName="rmse")
    paramGrid = ParamGridBuilder() \
        .addGrid(cv.vocabSize, [150, 200, 250]) \
        .build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator_rmse,
                              numFolds=4)  # use 3+ folds in practice
    cvModel = crossval.fit(trainingdata)
    # Explain params for the selected model
    print(cvModel.explainParams())
    return cvModel
Example 25
def main():
    subreddit_group = spark.read.parquet(input_file).repartition(2000)
    # subreddit_group.show()

    #hashing = HashingTF(inputCol="comments", outputCol="features")
    count_vectorizer = CountVectorizer(inputCol="comments",
                                       outputCol="features")

    lda = LDA(k=10, maxIter=10, optimizer='online')

    pipeline = Pipeline(stages=[count_vectorizer, lda])
    model = pipeline.fit(subreddit_group)

    predictions = model.transform(subreddit_group).selectExpr(
        'id', 'topicDistribution')

    change_to_str = F.udf(to_text)

    topics_df = predictions.select(
        predictions['id'],
        change_to_str(
            predictions['topicDistribution']).alias('topicDistribution'))

    #topics_df.show(20, False)
    topics_df.write.option('sep', ',').save(output,
                                            format='csv',
                                            mode='overwrite')
Example 26
def main():
    spark = SparkSession.builder.appName('nlp').getOrCreate()
    data = spark.read.csv("./data/smsspamcollection/SMSSpamCollection",
                          inferSchema=True, sep='\t')
    data = data.withColumnRenamed('_c0', 'class').withColumnRenamed('_c1',
                                                                    'text')
    data.show()
    data = data.withColumn('length', length(data['text']))
    data.show()
    data.groupby('class').mean().show()
    tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
    stopremove = StopWordsRemover(inputCol='token_text',
                                  outputCol='stop_tokens')
    count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
    idf = IDF(inputCol="c_vec", outputCol="tf_idf")
    ham_spam_to_num = StringIndexer(inputCol='class', outputCol='label')
    clean_up = VectorAssembler(inputCols=['tf_idf', 'length'],
                               outputCol='features')
    nb = NaiveBayes()
    data_prep_pipe = Pipeline(stages=[ham_spam_to_num, tokenizer, stopremove,
                                      count_vec, idf, clean_up])
    cleaner = data_prep_pipe.fit(data)
    clean_data = cleaner.transform(data)
    clean_data = clean_data.select(['label', 'features'])
    clean_data.show()
    (training, testing) = clean_data.randomSplit([0.7, 0.3])
    spam_predictor = nb.fit(training)
    data.printSchema()
    test_results = spam_predictor.transform(testing)
    test_results.show()
    acc_eval = MulticlassClassificationEvaluator()
    acc = acc_eval.evaluate(test_results)
    print("Accuracy of model at predicting spam was: {}".format(acc))
Example 27
def build_pipeline():
    tokenizer = [Tokenizer(inputCol='tweet', outputCol='words')]
    ngrams = [
        NGram(n=i, inputCol='words', outputCol='{0}_grams'.format(i))
        for i in range(1, 4)
    ]
    cv = [
        CountVectorizer(vocabSize=5460,
                        inputCol='{0}_grams'.format(i),
                        outputCol='{0}_tf'.format(i)) for i in range(1, 4)
    ]
    idf = [
        IDF(inputCol='{0}_tf'.format(i),
            outputCol='{0}_tfidf'.format(i),
            minDocFreq=5) for i in range(1, 4)
    ]
    assembler = [
        VectorAssembler(inputCols=['{0}_tfidf'.format(i) for i in range(1, 4)],
                        outputCol='features')
    ]
    label_stringIdx = [StringIndexer(inputCol='sentiment', outputCol='label')]
    lr = [LogisticRegression(maxIter=100)]
    pipeline = Pipeline(stages=tokenizer + ngrams + cv + idf + assembler +
                        label_stringIdx + lr)
    return pipeline
Example 28
def base_features_gen_pipeline(input_descript_col="descript",
                               input_category_col="category",
                               output_feature_col="features",
                               output_label_col="label"):
    #Build the pipeline
    # white space expression tokenizer
    word_tokenizer = Tokenizer(inputCol="descript", outputCol="words")

    # bag of words count
    count_vectors = CountVectorizer(inputCol="words", outputCol="features")

    # label indexer
    label_maker = StringIndexer(inputCol="category", outputCol="label")

    class Selector(Transformer):
        def __init__(self, outputCols=['id', 'features', 'label']):
            self.outputCols = outputCols

        def _transform(self, df: DataFrame) -> DataFrame:
            return df.select(*self.outputCols)

    selector = Selector(outputCols=['id', 'features', 'label'])

    # build the pipeline
    pipeline = Pipeline(
        stages=[word_tokenizer, count_vectors, label_maker, selector])

    return pipeline
Example 29
def build_ngrams(n=3):
    tokenizer = [Tokenizer(inputCol="text", outputCol="tokens")]
    stopwordsRemover = [
        StopWordsRemover(inputCol='tokens', outputCol='tokens_filtered')
    ]
    ngrams = [
        NGram(n=i, inputCol="tokens", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]
    cv = [
        CountVectorizer(vocabSize=5460,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_cv".format(i)) for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_cv".format(i),
            outputCol="{0}_idf".format(i),
            minDocFreq=5) for i in range(1, n + 1)
    ]
    assembler = [
        VectorAssembler(
            inputCols=["{0}_idf".format(i) for i in range(1, n + 1)],
            outputCol="features")
    ]

    stringIndexer = [StringIndexer(inputCol="class", outputCol="label")]
    lr = [LogisticRegression(maxIter=100)]

    return Pipeline(stages=tokenizer + ngrams + cv + idf + assembler +
                    stringIndexer + lr)
Example 30
def train_idf_model():
    rests = biz.filter(if_rest_udf(biz.categories))

    rest_rev = rev.join(
        rests.select('business_id',
                     'stars').withColumnRenamed('stars', 'rating'),
        'business_id')
    bad_reviews = rest_rev.filter('stars < 3')

    #sample for train

    bad_sample = bad_reviews.sample(False, 0.127, seed=91)
    sample_token = data_tokenizer(bad_sample)
    splits = sample_token.randomSplit([0.8, 0.1, 0.1], seed=91)

    train = splits[0]
    add_cl = splits[1]
    test = splits[2]

    cv = CountVectorizer(minDF=5,
                         vocabSize=5000,
                         inputCol='token',
                         outputCol='vectors')
    idf = IDF(minDocFreq=7, inputCol="vectors", outputCol="features")
    km2 = KMeans(k=18, featuresCol='features', maxIter=30)
    pipe_idf = Pipeline(stages=[cv, idf, km2])

    pipe_idf_model = pipe_idf.fit(train)
    return pipe_idf_model
Example 31
def ngramFeatureExtractors(n, inputCol=["text", "target"]):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    count_vectorizer = [
        CountVectorizer(vocabSize=5460,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i)) for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_tf".format(i),
            outputCol="{0}_tfidf".format(i),
            minDocFreq=5) for i in range(1, n + 1)
    ]

    assembler = [
        VectorAssembler(
            inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
            outputCol="features")
    ]
    label_stringIdx = [StringIndexer(inputCol="target", outputCol="label")]
    lr = [LogisticRegression(maxIter=100)]
    return Pipeline(stages=tokenizer + ngrams + count_vectorizer + idf +
                    assembler + label_stringIdx + lr)
Example 32
File: ml.py Project: ribonj/lsir
def count(df, column):
    """
    Count the number of occurences of terms in documents.
    """
    # fit a CountVectorizerModel from the corpus.
    # vocabSize: top N words orderedby term frequency across the corpus
    # minDF: minimum number of documents a term must appear in to be 
    #   included in the vocabulary
    # e.g. vocabSize=10, minDF=2.0
    cv = CountVectorizer(inputCol=column, 
                         outputCol='_'+column)
    
    model = cv.fit(df)
    voc = model.vocabulary
    df = model.transform(df)
    
    df = replace(df, column, '_'+column)
    return (df, voc)
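The comments mention vocabSize and minDF (e.g. vocabSize=10, minDF=2.0) but the call above leaves both at their defaults; a hedged variant showing how they could be passed (the helper name and values are illustrative, and replace() is the same helper used above):

# Hypothetical variant of count() with an explicit vocabulary cap and a
# minimum document frequency, as suggested by the comments above.
def count_limited(df, column, vocab_size=10, min_df=2.0):
    cv = CountVectorizer(inputCol=column,
                         outputCol='_' + column,
                         vocabSize=vocab_size,  # keep only the top-N frequent terms
                         minDF=min_df)          # drop terms seen in fewer documents
    model = cv.fit(df)
    df = replace(model.transform(df), column, '_' + column)
    return (df, model.vocabulary)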
Example 33
def featurizeData(raw, gap, vocabFile, featFile):
    feats = raw.dropDuplicates(['cluster', 'series', 'date'])\
            .withColumn('day', datediff(col('date'), lit('1970-01-01')))\
            .na.drop(subset=['day'])\
            .rdd.groupBy(lambda r: r.cluster)\
            .flatMap(lambda c: clusterFeatures(c, gap))\
            .toDF()

    feats.cache()
    cv = CountVectorizer(inputCol='raw', outputCol='features', minDF=4.0)
    interner = cv.fit(feats)      # alternate possibility: grab features only from label==1 edges
    full = interner.transform(feats)
    # combiner = VectorAssembler(inputCols=realCols + ['categorial'], outputCol='features')
    # # I don't think a Pipeline will work here since we need to get the interner.vocabulary
    # full = combiner.transform(interner.transform(feats)).drop('categorial')

    full.write.parquet(featFile)
    np.savetxt(vocabFile, np.array(interner.vocabulary), fmt='%s')
    feats.unpersist()
Example 34
def get_top_words(dataset, signatures):
    # TODO: Use stemmers for the languages supported by http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft Translate API.
    sentenceData = dataset.filter(dataset['user_comments'].isNotNull() & (dataset['useragent_locale'].isNull() | (functions.instr(dataset['useragent_locale'], 'en') == 1)))

    if sentenceData.rdd.isEmpty():
        return dict()

    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)

    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(lambda p: (p['signature'], list(set(p['words'])))).reduceByKey(lambda x, y: x + y).toDF(['signature', 'words'])

    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()

    # Clean comment words by removing punctuation and stemming.
    def clean_word(w):
        return re.sub('\,|\.|\;|\:|\;|\?|\!|\[|\]|\}|\{|\/|\\\\', '', stem(w.lower()))

    wordsData = wordsData.rdd.map(lambda p: (p['signature'], [clean_word(w) for w in p['words']])).toDF(['signature', 'words'])

    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)

    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)

    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    bests_per_doc = rescaledData.filter(rescaledData.signature.isin(signatures)).rdd.map(lambda p: (p['signature'], sorted(zip(p['tfidf_features'].indices, p['tfidf_features'].values), key=lambda i: i[1], reverse=True)[:10])).collect()

    return dict([(signature, [model.vocabulary[best] for best, val in bests]) for signature, bests in bests_per_doc])
Example 35
def main():
	p = sys.argv[1]
	logFile = "data/" + p + "_cleaned.txt"
	sc = SparkContext("local", "simpleApp")
	sqlContext = SQLContext(sc)
	data = sc.textFile(logFile).zipWithIndex().map(lambda (words,idd): Row(idd= idd, words = words.split(" "))).cache()
	docDF = sqlContext.createDataFrame(data)
	Vector = CountVectorizer(inputCol="words", outputCol="vectors")
	model = Vector.fit(docDF)
	result = model.transform(docDF)
	corpus_size = result.count()

	corpus = result.select("idd", "vectors").map(lambda (x,y): [x,y]).cache()

	# Cluster the documents into three topics using LDA
	ldaModel = LDA.train(corpus, k=3,maxIterations=100,optimizer='online')
	topics = ldaModel.topicsMatrix()
	wordNumbers = 10
	topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic = wordNumbers))
	vocabArray = model.vocabulary
	topics_final = topicIndices.map(lambda topic: topic_render(topic,wordNumbers,vocabArray)).collect()

	path = "data/" + p + "_results.txt"
	json = open(path, 'wb')
	json.close()

	for topic in topics_final:
		for term in topic:
			line = term[0] + " "

			try:
				string_for_output = line.encode('utf8', 'replace')
				if string_for_output != " ":
					os.system("python3 basic/codes/p3p.py " +  string_for_output + "  >> " + path)
			except: pass

		os.system("python3 basic/codes/p3p.py " +  "delmch" + "  >> " + path)
Example 36
    def test_count_vectorizer_with_maxDF(self):
        dataset = self.spark.createDataFrame([
            (0, "a b c d".split(' '), SparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0}),),
            (1, "a b c".split(' '), SparseVector(3, {0: 1.0, 1: 1.0}),),
            (2, "a b".split(' '), SparseVector(3, {0: 1.0}),),
            (3, "a".split(' '), SparseVector(3,  {}),)], ["id", "words", "expected"])
        cv = CountVectorizer(inputCol="words", outputCol="features")
        model1 = cv.setMaxDF(3).fit(dataset)
        self.assertEqual(model1.vocabulary, ['b', 'c', 'd'])

        transformedList1 = model1.transform(dataset).select("features", "expected").collect()

        for r in transformedList1:
            feature, expected = r
            self.assertEqual(feature, expected)

        model2 = cv.setMaxDF(0.75).fit(dataset)
        self.assertEqual(model2.vocabulary, ['b', 'c', 'd'])

        transformedList2 = model2.transform(dataset).select("features", "expected").collect()

        for r in transformedList2:
            feature, expected = r
            self.assertEqual(feature, expected)
Example 37
from pyspark.sql import SQLContext, Row
from pyspark.ml.feature import CountVectorizer
from pyspark.mllib.clustering import LDA, LDAModel

sqlContext = SQLContext(sc)
path = ... # path of the txt file

data = sc.textFile(path).zipWithIndex().map(lambda pair: Row(idd=pair[1], words=pair[0].split(" ")))
docDF = sqlContext.createDataFrame(data)

Vector = CountVectorizer(inputCol="words", outputCol="vectors")
model = Vector.fit(docDF)
result = model.transform(docDF)

corpus_size = result.count()  # total number of documents
corpus = result.select("idd", "vectors").rdd.map(lambda row: [row[0], row[1]]).cache()

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3,maxIterations=100,optimizer='online')
topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary

wordNumbers = 10  # number of words per topic
topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic = wordNumbers))

def topic_render(topic):  # map a topic's term indices back to the actual words
    terms = topic[0]
    result = []
    for i in range(wordNumbers):
        term = vocabArray[terms[i]]
        result.append(term)
    return result
def train_cv_model(modelDataframe):
    cv = CountVectorizer(inputCol="udf_results", outputCol="features", binary=True, minDF=5.0)
    model = cv.fit(modelDataframe)
    model.write().overwrite().save("models/cvModel")
Example 39
    argparser.add_argument('-s', '--clusterSize', type=int, default=1)
    argparser.add_argument('indir', help='Input directory')
    argparser.add_argument('outdir', help='Output directory')
    args = argparser.parse_args()

    spark = SparkSession.builder.appName('Cluster Features').getOrCreate()

    df = spark.read.load(args.indir)

    raw = df.filter(col('size') >= args.clusterSize) \
            .select('cluster', 'size', regexp_replace('text', u'\xad\s*', '').alias('text'))
    raw.cache()

    tok = RegexTokenizer(inputCol='text', outputCol='terms', gaps=False, pattern='\w+') \
          .transform(raw)
    counts = CountVectorizer(inputCol='terms', outputCol='counts', minDF=2.0) \
             .fit(tok).transform(tok)
    
    mergeCounts = udf(lambda va, size: threshold_sparse(scale_sparse(reduce(add_sparse, va), 1.0/size), args.minCount),
                      VectorUDT())

    res = counts.groupBy('cluster', 'size') \
                .agg(mergeCounts(collect_list('counts'), 'size').alias('counts'))

    # lda = LDA(k=2, featuresCol='counts', seed=1, optimizer='em')
    # model = lda.fit(res)

    # model.describeTopics().write.json(args.outdir)

    res.write.json(args.outdir)

    spark.stop()
Example 40
	def trainModel(self):
		
		logger.info("Training the model...")		

		query = '''select page_id, max(page_title) as page_title from cooladata where date_range(all) and page_id is not null group by page_id;'''

		def SQLtoURL(query):
    
    			data = query.replace('\n', ' ').replace('\t',' ').replace('   ',' ').replace('  ',' ')
    			return data


		def QueryXXXXX(query, file = None):
   
    			session = Session()
    			response = session.post(data = {'tq': query,}, url = 'https://app.XXXXXX.com/api/v2/projects/115659/cql/', headers = {'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},)
    			return response.content

		table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)),'utf-8'))['table']
		title_list = [x['c'] for x in table['rows']]
		table_cols = [d['label'] for d in table['cols']]  
		def convert_row(row):
    			rowlist = [d['v'] for d in row]
    			return rowlist

		rd = self.sc.parallelize(title_list).map(convert_row)
		titleData = self.spark.createDataFrame(rd, table_cols)
		titleData = titleData.dropna()
		
		hebrew_stopwords = stop_words()
		def rmv(words):
    			for punc in punctuation:
        			words = words.replace(punc,"")
    			for hword in hebrew_stopwords:
        			words = words.replace(hword, " ")
    			return words

		self.spark.udf.register("rmv", rmv, StringType())
		titleData.registerTempTable("wordstable")
		cleanedSentenceData = self.spark.sql("select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable")
		tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words")
		wordsData = tokenizer.transform(cleanedSentenceData)

		cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF = 2.0)
		cvModel = cv.fit(wordsData)
		featurizedData = cvModel.transform(wordsData)

		idf = IDF(inputCol="rawFeatures", outputCol="features")
		idfModel = idf.fit(featurizedData)
		rescaledData = idfModel.transform(featurizedData)

		lda = LDA(k=100)
		ldaModel = lda.fit(rescaledData)
		postFactorizedData = ldaModel.transform(rescaledData)

		norm = Normalizer(inputCol = "topicDistribution", outputCol="normTopicDist")
		scaledFactorizedNormalizedData = norm.transform(postFactorizedData)
		
		self.model = scaledFactorizedNormalizedData
		
		logger.info("model is built!")
    # alltags=tags_users.map(lambda x:Counter(x.tags)).reduce(lambda a,b:a+b)
    # print(alltags.most_common(10))
        #.filter(lambda x:len(x.tags)>100) # filtering to get smaller dataset

    # print(tags_users.count())
    # print(tags_users.first())

    ## Filtered for testing

    tags_users_df=sqlContext.createDataFrame(tags_users)
    print(tags_users_df.take(2))
    #
    #
    # print('Indexing strings')
    cVec = CountVectorizer(inputCol='tags', outputCol="tag_features",minDF=10.)
    model=cVec.fit(tags_users_df)
    td=model.transform(tags_users_df)

    with open('/home/erlenda/data/konsum/countvec_vocabulary.pkl',mode='wb') as ff:
        pkl.dump(model.vocabulary,ff)



    normalizer=Normalizer(p=1.,inputCol='tag_features',outputCol='tags_normalized')
    tdNorm=normalizer.transform(td)
    print(tdNorm.take(5))

    tdNorm.write.save('/home/erlenda/data/konsum/tag_profiler_parquet')

    samples=tdNorm.filter(tdNorm.posts_with_tags>10).take(10)
print(model.weights)
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
summary.probability.show()


# COMMAND ----------

from pyspark.ml.feature import Tokenizer, CountVectorizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.drop("features"))
cv = CountVectorizer()\
  .setInputCol("DescOut")\
  .setOutputCol("features")\
  .setVocabSize(500)\
  .setMinTF(0)\
  .setMinDF(0)\
  .setBinary(True)
cvFitted = cv.fit(tokenized)
prepped = cvFitted.transform(tokenized)


# COMMAND ----------

from pyspark.ml.clustering import LDA
lda = LDA().setK(10).setMaxIter(5)
print(lda.explainParams())
model = lda.fit(prepped)

# COMMAND ----------

from pyspark.ml.feature import NGram
unigram = NGram().setInputCol("DescOut").setN(1)
bigram = NGram().setInputCol("DescOut").setN(2)
unigram.transform(tokenized.select("DescOut")).show(False)
bigram.transform(tokenized.select("DescOut")).show(False)


# COMMAND ----------

from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer()\
  .setInputCol("DescOut")\
  .setOutputCol("countVec")\
  .setVocabSize(500)\
  .setMinTF(1)\
  .setMinDF(2)
fittedCV = cv.fit(tokenized)
fittedCV.transform(tokenized).show(False)


# COMMAND ----------

tfIdfIn = tokenized\
  .where("array_contains(DescOut, 'red')")\
  .select("DescOut")\
  .limit(10)
tfIdfIn.show(10, False)

Example 44
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import CountVectorizer
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("CountVectorizerExample")\
        .getOrCreate()

    # $example on$
    # Input data: Each row is a bag of words with an ID.
    df = spark.createDataFrame([
        (0, "a b c".split(" ")),
        (1, "a b b c a".split(" "))
    ], ["id", "words"])

    # fit a CountVectorizerModel from the corpus.
    cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=3, minDF=2.0)

    model = cv.fit(df)

    result = model.transform(df)
    result.show(truncate=False)
    # $example off$

    spark.stop()
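For this two-row corpus the fitted vocabulary is {a, b, c} (a and b are tied on frequency, so their index order may vary), and result.show(truncate=False) prints something close to:

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+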
#tokenizer = Tokenizer(inputCol="description", outputCol="words")
#wordsData = tokenizer.transform(text)

################################################################################################
#
#   Generate TFIDF
#
################################################################################################

# Term Frequency Vectorization  - Option 1 (Using hashingTF): 
#hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
#featurizedData = hashingTF.transform(clean_text)

# Term Frequency Vectorization  - Option 2 (CountVectorizer)    : 
cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", vocabSize = 1000)
cvmodel = cv.fit(clean_text)
featurizedData = cvmodel.transform(clean_text)

vocab = cvmodel.vocabulary
vocab_broadcast = sc.broadcast(vocab)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

################################################################################################
#
#   LDA Clustering - Find Data-driven Topics
#
################################################################################################
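The snippet stops at the LDA header; a minimal hedged sketch of the step that typically follows, reusing rescaledData and vocab_broadcast from above (k and maxIter are illustrative values):

# Hedged sketch: fit an ML LDA model on the TF-IDF features and map each
# topic's term indices back to words via the broadcast vocabulary.
from pyspark.ml.clustering import LDA

lda = LDA(k=10, maxIter=20, featuresCol="features")
ldaModel = lda.fit(rescaledData)

def indices_to_words(indices):
    return [vocab_broadcast.value[i] for i in indices]

topics = ldaModel.describeTopics(5).rdd \
    .map(lambda row: (row.topic, indices_to_words(row.termIndices))) \
    .collect()
for topic_id, words in topics:
    print(topic_id, words)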