def tfidf_lda(df):
    '''
    TF-IDF + LDA
    :param df: DataFrame with a tokenized "content" column
    :return: DataFrame of documents with their topic distributions
    '''
    # Term frequencies via the hashing trick
    hashingTF = HashingTF(inputCol="content", outputCol="features")
    df_TF = hashingTF.transform(df)
    print('df_TF')
    df_TF.show(truncate=False)

    # IDF
    idf = IDF(inputCol="features", outputCol="idf")
    model_idf = idf.fit(df_TF)
    df_idf = model_idf.transform(df_TF)
    print('df_idf')
    df_idf.cache()
    df_idf.show(truncate=False)

    # LDA
    lda = LDA(k=20, seed=1, optimizer="em")
    model_lda = lda.fit(df_idf)
    model_lda.describeTopics(maxTermsPerTopic=20)
    df_lda = model_lda.transform(df_idf)
    df_lda.select("content", "topicDistribution").show(truncate=False)
    return sparkEntrance.spark.createDataFrame(
        df_lda.select("content", "topicDistribution").rdd,
        ['content', 'topicDistribution'])
def _fit(self, papers):
    """
    Build an LDA representation for each paper in the input data set.

    Based on papers in the papers corpus, a set of all terms is extracted. For each of them
    a unique, sequential id is generated. Then, depending on all terms and their frequency in
    a paper, a sparse tf vector is built. A model that maps a tf vector to each paper based on
    its paper id is used. LDA is trained on the tf representation of all papers and used for
    producing the LDA representation.

    :param papers: input data set, an instance of :py:class:`pyspark.sql.DataFrame`
    :returns: a built model which can be used for transformation of a data set
    """
    Logger.log("Train/Transform TF vectorizer.")
    tfVectorizer = TFVectorizer(self.papers_corpus, paperId_col=self.paperId_col,
                                tf_map_col=self.tf_map_col, output_col="tf_vector")
    tfVectorizerModel = tfVectorizer.fit(papers)
    # paper_id | tf_vector
    papers_tf_vectors = tfVectorizerModel.transform(papers).select(self.paperId_col, "tf_vector")
    papers_tf_vectors.cache()

    Logger.log("Train LDA. Topics:" + str(self.k_topics))
    # Trains an LDA model. The number of topics to infer must be > 1.
    lda = LDA(featuresCol="tf_vector", k=self.k_topics)
    model = lda.fit(papers_tf_vectors)

    Logger.log("Transform LDA over paper corpus.")
    # format -> paper_id | lda_vector
    papers_lda_vectors = model.transform(papers_tf_vectors) \
        .withColumnRenamed("topicDistribution", self.output_col).drop("tf_vector")

    Logger.log("Return LDA model.")
    papers_tf_vectors.unpersist()
    return LDAModel(papers_lda_vectors, self.paperId_col, self.output_col)
def lda(features, num_clusters):
    """Does clustering on the features dataset using LDA topic clustering.

    Params:
    - features (pyspark.sql.DataFrame): The data frame containing the features to be used for clustering
    - num_clusters (int): The number of clusters to be used

    Returns:
    - clustered (pyspark.sql.DataFrame): The data frame, with the predicted clusters in a 'cluster' column
    """
    lda = LDA(k=num_clusters, featuresCol='features', topicDistributionCol='topics')
    lda_model = lda.fit(features)
    clustered = lda_model.transform(features)
    # The dominant topic (argmax over the topic distribution) becomes the cluster label.
    clustered = clustered.rdd.map(
        lambda row: Row(cluster=int(argmax(row['topics'])), **row.asDict()))
    clustered = clustered.map(
        lambda row: Row(closeness=float(row['topics'][row['cluster']]), **row.asDict())).toDF()
    clustered = clustered.drop('topics')
    clustered.show()

    print("=====Clustering Results=====")
    print("LDA log perplexity = ", lda_model.logPerplexity(features))
    cluster_sizes = list()
    for i in range(num_clusters):
        cluster_size = clustered.rdd.filter(
            lambda row: row['cluster'] == i).count()
        cluster_sizes.append(cluster_size)
    print("Cluster sizes = ", cluster_sizes)
    return clustered
def test_persistence(self):
    # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
    df = self.spark.createDataFrame([
        [1, Vectors.dense([0.0, 1.0])],
        [2, Vectors.sparse(2, {0: 1.0})],
    ], ["id", "features"])
    # Fit model
    lda = LDA(k=2, seed=1, optimizer="em")
    distributedModel = lda.fit(df)
    self.assertTrue(distributedModel.isDistributed())
    localModel = distributedModel.toLocal()
    self.assertFalse(localModel.isDistributed())
    # Define paths
    path = tempfile.mkdtemp()
    lda_path = path + "/lda"
    dist_model_path = path + "/distLDAModel"
    local_model_path = path + "/localLDAModel"
    # Test LDA
    lda.save(lda_path)
    lda2 = LDA.load(lda_path)
    self._compare(lda, lda2)
    # Test DistributedLDAModel
    distributedModel.save(dist_model_path)
    distributedModel2 = DistributedLDAModel.load(dist_model_path)
    self._compare(distributedModel, distributedModel2)
    # Test LocalLDAModel
    localModel.save(local_model_path)
    localModel2 = LocalLDAModel.load(local_model_path)
    self._compare(localModel, localModel2)
    # Clean up
    try:
        rmtree(path)
    except OSError:
        pass
def lda_train(self, file):
    json_rdd, count = self.load_train_titleFeature_rdd(file)
    vocabulary_set = json_rdd.map(lambda line: get_title_words(line)) \
        .flatMap(lambda word: word).distinct().collect()
    vocab_size = self.sc.broadcast(max(vocabulary_set) + 1)
    print('vocabulary size: ' + str(vocab_size.value))

    sparseVec_rdd = json_rdd.map(lambda line: cast_dict_str2int(line.get('title_features'))) \
        .map(lambda value: SparseVector(vocab_size.value, value))
    zip_rdd = sparseVec_rdd.zipWithIndex()
    lda_train_rdd = zip_rdd.map(lambda x: [x[1], x[0]]).cache()

    K = 4
    max_iter = 10
    seed = 1024
    lda_train_df = self.sqlContext.createDataFrame(lda_train_rdd.collect(), ["id", "features"])
    lda = LDA(k=K, maxIter=max_iter, seed=seed)
    lda_model = lda.fit(lda_train_df)
    print('LDA model vocabSize : ' + str(lda_model.vocabSize()))
    print(lda_model.isDistributed())
    lda_model.describeTopics().show()

    # os.system("hadoop fs -rmr {}".format(self.lda_model_path))
    # os.system("hadoop fs -rmr {}".format(self.lda_path))
    lda_model.write().overwrite().save(self.lda_model_path)
    self.sc.stop()
def lda(df, column):
    df = preprocess(df, column)    # text to list of terms
    (df, voc) = count(df, column)  # add a feature column containing term counts

    # Trains the LDA model.
    # The input to LDA must be a dataframe containing a "features" column
    # (e.g. 10 topics and 100 iterations: k=10, maxIter=100)
    # lda = None
    lda = LDA(featuresCol=column, topicDistributionCol='_' + column, k=5, maxIter=20)
    model = lda.fit(df)

    '''
    # compute likelihood and perplexity metrics
    ll = model.logLikelihood(df)
    lp = model.logPerplexity(df)
    print("The lower bound on the log likelihood: " + str(ll))
    print("The upper bound on perplexity: " + str(lp))
    #'''

    # Describe topics (using the 3 first terms)
    topics = model.describeTopics(3)
    # print("The topics described by their top-weighted terms:")
    # topics.show(truncate=False)

    # Shows the result
    df = model.transform(df)
    # df.show(truncate=False)
    df = replace(df, column, '_' + column)
    return (df, topics.collect(), voc)
def train(df, hiperparameter):
    '''
    LDA training, returning an LDA model.
    input:  - DataFrame
            - config (hyperparameter configuration dict)
    return: LDA model
    '''
    lda = LDA(featuresCol=hiperparameter['featuresCol'],
              maxIter=hiperparameter['maxIter'],
              seed=hiperparameter['seed'],
              checkpointInterval=hiperparameter['checkpointInterval'],
              k=hiperparameter['k'],
              optimizer=hiperparameter['optimizer'],
              learningOffset=hiperparameter['learningOffset'],
              learningDecay=hiperparameter['learningDecay'],
              subsamplingRate=hiperparameter['subsamplingRate'],
              optimizeDocConcentration=hiperparameter['optimizeDocConcentration'],
              # docConcentration=hiperparameter['docConcentration'],
              topicConcentration=hiperparameter['topicConcentration'],
              topicDistributionCol=hiperparameter['topicDistributionCol'],
              keepLastCheckpoint=hiperparameter['keepLastCheckpoint'])
    model = lda.fit(df)
    return model
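# --- Illustrative usage (not from the original source) ---
# A minimal sketch of the hyperparameter dict that train(df, hiperparameter) above expects:
# every key mirrors one the function looks up, while the values, the SparkSession and the
# toy DataFrame below are assumptions made purely for illustration.
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark_example = SparkSession.builder.getOrCreate()
toy_df = spark_example.createDataFrame(
    [(0, Vectors.dense([1.0, 2.0, 0.0])), (1, Vectors.dense([0.0, 1.0, 3.0]))],
    ["id", "features"])
example_hiperparameter = {
    'featuresCol': 'features', 'maxIter': 10, 'seed': 1, 'checkpointInterval': 10,
    'k': 2, 'optimizer': 'online', 'learningOffset': 1024.0, 'learningDecay': 0.51,
    'subsamplingRate': 1.0, 'optimizeDocConcentration': True, 'topicConcentration': 1.1,
    'topicDistributionCol': 'topicDistribution', 'keepLastCheckpoint': True,
}
toy_lda_model = train(toy_df, example_hiperparameter)
toy_lda_model.describeTopics(3).show(truncate=False)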
def feature_engineering(class_balancedDf):
    # N-Gram
    ngram = NGram(n=2, inputCol="lemmatized", outputCol="ngrams")
    ngramDataFrame = ngram.transform(class_balancedDf)

    # Hashing TF
    hashingTF = HashingTF(inputCol="ngrams", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(ngramDataFrame)

    # IDF
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    # K-Means
    kmeans = KMeans().setK(6).setSeed(1)
    kmodel = kmeans.fit(rescaledData).transform(rescaledData)

    # LDA
    lda = LDA(k=10, maxIter=10)
    ldamodel = lda.fit(kmodel).transform(kmodel)

    # changing label column to int
    data = ldamodel.withColumn(
        "label", ldamodel.label.cast("Integer")).drop("prediction")
    return data
def perform_lda(documents, n_topics, n_words, beta, tokens_col):
    '''
    Performs LDA on a list of documents (i.e. lists of tokens).
    Assumes that documents is a DataFrame with a column of unique ids (uid).
    '''
    cv = CountVectorizer(inputCol=tokens_col, outputCol="raw_features")
    cvmodel = cv.fit(documents)
    result_cv = cvmodel.transform(documents)

    # We apply tf-idf (term frequency - inverse document frequency) so that threads with
    # a lot of words do not pollute the topics.
    idf = IDF(inputCol="raw_features", outputCol="features")
    idfModel = idf.fit(result_cv)
    result_tfidf = idfModel.transform(result_cv)

    # keeping created for time series purpose.
    corpus = result_tfidf.select("uid", "features")  # , "date")

    lda = LDA(k=n_topics, topicConcentration=beta)
    model = lda.fit(corpus)

    # retrieving topics, and the vocabulary constructed by the CountVectorizer
    topics = model.describeTopics(maxTermsPerTopic=n_words)
    vocab = cvmodel.vocabulary

    # getting topic distribution per document.
    # topic_distribution = model.transform(corpus)[['topicDistribution', 'date']]

    # The topics are just numerical indices; we need to convert them to words and associate
    # them with their weights.
    topics_with_weights = topics.rdd.map(
        lambda r: (r[0],
                   [(vocab[t], w) for t, w in zip(r[1], r[2])],
                   ' '.join([vocab[t] for t in r[1]]))
    ).toDF().selectExpr("_1 as topic_number", "_2 as topic_weight", "_3 as topic")
    return topics_with_weights  # , topic_distribution
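# --- Illustrative usage (not from the original source) ---
# A small, self-contained sketch of how perform_lda() above might be called: the function
# expects a DataFrame with a unique-id column named "uid" plus a tokens column. The toy
# documents, column name and parameter values here are assumptions for illustration.
from pyspark.sql import SparkSession

spark_example = SparkSession.builder.getOrCreate()
toy_docs = spark_example.createDataFrame(
    [(1, ["spark", "lda", "topic", "model"]), (2, ["topic", "model", "spark", "pipeline"])],
    ["uid", "tokens"])
toy_topics = perform_lda(toy_docs, n_topics=2, n_words=3, beta=1.1, tokens_col="tokens")
toy_topics.show(truncate=False)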
def do_lda_with_count_vectorizer(self, k, rescaled_data, vocab):
    lda = LDA(k=k, seed=1, maxIter=100, optimizer="em", featuresCol="features",
              topicConcentration=5)
    lda_model = lda.fit(rescaled_data)
    transformed_df = lda_model.transform(rescaled_data).select("url", "topicDistribution")
    topics_description = lda_model.describeTopics().rdd \
        .map(lambda row: row['termIndices']) \
        .map(lambda idx_list: [vocab[idx] for idx in idx_list]) \
        .collect()
    return transformed_df, topics_description
def LDA_pipefit(data_ip, ipcol):
    text_col = ipcol
    from sparknlp.base import DocumentAssembler
    documentAssembler = DocumentAssembler().setInputCol(text_col).setOutputCol('document')
    from sparknlp.annotator import Tokenizer
    tokenizer = Tokenizer().setInputCols(['document']).setOutputCol('tokenized')
    from sparknlp.annotator import Normalizer
    normalizer = Normalizer().setInputCols(['tokenized']).setOutputCol('normalized').setLowercase(True)
    from sparknlp.annotator import LemmatizerModel
    lemmatizer = LemmatizerModel.pretrained().setInputCols(['normalized']).setOutputCol('lemmatized')
    from sparknlp.annotator import StopWordsCleaner
    stopwords_cleaner = StopWordsCleaner().setInputCols(['lemmatized']).setOutputCol('unigrams').setStopWords(eng_stopwords)
    from sparknlp.annotator import NGramGenerator
    ngrammer = NGramGenerator().setInputCols(['lemmatized']).setOutputCol('ngrams').setN(3).setEnableCumulative(True).setDelimiter('_')
    from sparknlp.annotator import PerceptronModel
    pos_tagger = PerceptronModel.pretrained('pos_anc').setInputCols(['document', 'lemmatized']).setOutputCol('pos')
    from sparknlp.base import Finisher
    finisher = Finisher().setInputCols(['unigrams', 'ngrams', 'pos'])

    from pyspark.ml import Pipeline
    pipeline = Pipeline().setStages([documentAssembler, tokenizer, normalizer, lemmatizer,
                                     stopwords_cleaner, pos_tagger, ngrammer, finisher])
    review_text_clean = ipcol
    processed_tweets = pipeline.fit(data_ip).transform(data_ip)

    from pyspark.sql.functions import concat
    processed_tweets = processed_tweets.withColumn(
        'final', concat(F.col('finished_unigrams'), F.col('finished_ngrams')))

    from pyspark.ml.feature import CountVectorizer
    tfizer = CountVectorizer(inputCol='final', outputCol='tf_features')
    tf_model = tfizer.fit(processed_tweets)
    tf_result = tf_model.transform(processed_tweets)

    from pyspark.ml.feature import IDF
    idfizer = IDF(inputCol='tf_features', outputCol='tf_idf_features')
    idf_model = idfizer.fit(tf_result)
    tfidf_result = idf_model.transform(tf_result)

    from pyspark.ml.clustering import LDA
    num_topics = 3
    max_iter = 10
    lda = LDA(k=num_topics, maxIter=max_iter, featuresCol='tf_idf_features')
    lda_model = lda.fit(tfidf_result)

    from pyspark.sql import types as T
    vocab = tf_model.vocabulary

    def get_words(token_list):
        return [vocab[token_id] for token_id in token_list]

    udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

    num_top_words = 15
    topics = lda_model.describeTopics(num_top_words).withColumn(
        'topicWords', udf_to_words(F.col('termIndices')))
    topics_p = topics.toPandas()
    return topics_p
def main():
    spark.sql("CLEAR CACHE")
    business = spark.read.parquet("yelp-etl/business_etl").repartition(8)
    business.createOrReplaceTempView("business")
    review = spark.read.parquet("yelp-etl/review_etl").repartition(16)  # .cache()
    review.createOrReplaceTempView("review")

    ## Location based reviews
    # spark.sql("SELECT b.state, COUNT(*) AS bus_rev_count FROM business b INNER JOIN review r ON b.business_id = r.business_id GROUP BY b.state ORDER BY bus_rev_count DESC").show()

    ## Choosing reviews from Ontario (state = "ON")
    on_bus_rev = spark.sql("SELECT r.review_id, b.business_id, r.text, r.label FROM business b INNER JOIN review r ON b.business_id = r.business_id WHERE b.state = 'ON' AND r.label = 0")

    ## Remove punctuations and spaces
    punct_remover = functions.udf(lambda x: remove_punct(x))
    review_df = on_bus_rev.select('review_id', 'business_id', punct_remover('text')) \
        .withColumnRenamed('<lambda>(text)', 'text')

    ## Tokenize
    tok = Tokenizer(inputCol="text", outputCol="words")

    ## Remove stop words
    stopwordList = [
        '', 'i', 'get', 'got', 'also', 'really', 'would', 'one', 'good', 'like', 'great', 'tri',
        'love', 'two', 'three', 'took', 'awesome', 'me', 'bad', 'horrible', 'disgusting',
        'terrible', 'fabulous', 'amazing', 'terrific', 'worst', 'best', 'fine', 'excellent',
        'acceptable', 'my', 'exceptional', 'satisfactory', 'satisfying', 'super', 'awful',
        'atrocious', 'unacceptable', 'poor', 'sad', 'gross', 'authentic', 'myself', 'cheap',
        'expensive', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself',
        'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it',
        'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
        'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were',
        'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
        'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
        'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during',
        'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on',
        'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when',
        'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
        'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
        's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're',
        've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn',
        'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn']
    stopword_rm = StopWordsRemover(inputCol="words", outputCol="words_nsw", stopWords=stopwordList)

    pipestages = [tok, stopword_rm]
    pipeline = Pipeline(stages=pipestages)
    model = pipeline.fit(review_df)
    tokenized_df = model.transform(review_df)

    ## Lemmatizing
    lemmatize_udf = functions.udf(lambda x: lemmatize(x), types.ArrayType(types.StringType()))
    lemmatized_df = tokenized_df.withColumn("lemmatized", lemmatize_udf("words_nsw")) \
        .select("review_id", "business_id", "lemmatized")

    # ## Stemming
    # stemmer_udf = functions.udf(lambda x: stem(x), types.ArrayType(types.StringType()))
    # stemmed_df = lemmatized_df.withColumn("stemmed", stemmer_udf("lemmatized")).drop(lemmatized_df["lemmatized"])

    ## Count Vectorizer
    cv = CountVectorizer(inputCol="lemmatized", outputCol="vectors")
    cv_model = cv.fit(lemmatized_df)
    cv_df = cv_model.transform(lemmatized_df).drop(lemmatized_df["lemmatized"])
    cv_model.save("topic_modelling/cvmodel_neg")

    idf = IDF(inputCol="vectors", outputCol="tfidf")
    idf_model = idf.fit(cv_df)
    result = idf_model.transform(cv_df)
    result = result.select("review_id", "business_id", "tfidf")

    lda = LDA(featuresCol='tfidf', k=5, seed=42, maxIter=50)
    model = lda.fit(result)
    model.write().overwrite().save("topic_modelling/ldamodel_neg")
    transformed = model.transform(result)
    transformed.write.parquet("topic_modelling/review_topics_neg", mode="overwrite")

    spark.stop()
def clusteredData(self, dataset, cvModel):
    lda = LDA(k=20, seed=123, optimizer="em", featuresCol="features")
    ldamodel = lda.fit(dataset)
    # model.isDistributed()
    # model.vocabSize()
    ldaTopics = ldamodel.describeTopics()
    self.getTheMapping(ldaTopics, cvModel)
def lda_train(result_tfidf):
    from pyspark.ml.linalg import Vectors, SparseVector
    from pyspark.ml.clustering import LDA

    lda = LDA(k=10, seed=1, optimizer="em")
    lda.setMaxIter(100)

    model = lda.fit(result_tfidf[['index', 'features']])
    # RDD-based alternative (pyspark.mllib API):
    # model = LDA.train(result_tfidf[['index', 'features']].rdd.map(list), k=num_topics, maxIterations=max_iterations)
    return model
def main():
    spark = SparkSession \
        .builder \
        .appName("Reddit Site:Get Data") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    file = "file:////l2/corpora/reddit/submissions/RS_2015-12.bz2"
    output = file[-14:-3]
    sc = spark.sparkContext

    print('\n\n\n starting read and filter')
    df = filterPosts(file, sc, spark)
    df = convertToVec(df, sc, spark, output, inputCol='tokens')

    num_topics = 10
    print('\n\n\n LDA... \n\n\n')
    newLDA = False
    if newLDA:
        lda = LDA(featuresCol='vectors', k=num_topics, maxIter=50)
        lda_model = lda.fit(df.select('id', 'vectors'))
        lda_model.save(output + '_ldamodel')
    else:
        lda_model = LocalLDAModel.load(output + '_ldamodel')

    print('\n\n\n Describe Topics... \n\n\n')
    topic_indices = lda_model.describeTopics(maxTermsPerTopic=50)
    topic_indices.write.json(output + '_topics.json', mode='overwrite')

    print('\n\n\n reduce to subs\n\n\n')
    # subDF = df.select('subreddit', 'vectors').groupBy(df.subreddit).sum('vectors')
    subDF = df.select('subreddit', 'vectors').rdd.mapValues(lambda v: v.toArray()) \
        .reduceByKey(lambda x, y: x + y) \
        .mapValues(lambda x: DenseVector(x)) \
        .toDF(["subreddit", "vectors"])
    '''
    print('\n\n\n LDA... \n\n\n')
    lda = LDA(featuresCol='vectors', k=num_topics, maxIter=50)
    lda_model = lda.fit(subDF.select('subreddit', 'vectors'))

    print('\n\n\n Describe Topics... \n\n\n')
    topic_indices = lda_model.describeTopics(maxTermsPerTopic=50)
    topic_indices.write.json(output + '_topics.json', mode='overwrite')
    '''
    print('\n\n\n Transform DataSet \n\n\n')
    subDF = lda_model.transform(subDF).drop('vectors')
    # topicDF = lda_model.transform(vecDF)
    subDF.write.json(output + '_transformed.json', mode='overwrite')
def lda_train():
    # Loads data.
    dataset = spark.read.format("libsvm").load("train.libsvm", numFeatures=4758484)

    # Trains a LDA model.
    lda = LDA(k=20, maxIter=200)
    model = lda.fit(dataset)

    ll = model.logLikelihood(dataset)
    lp = model.logPerplexity(dataset)
    print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
    print("The upper bound on perplexity: " + str(lp))

    # Describe topics.
    topics = model.describeTopics(3)
    print("The topics described by their top-weighted terms:")
    topics.show(truncate=False)

    topics = model.describeTopics()
    topics_array = topics.select('termWeights').collect()
    topics_array = np.array([i[0] for i in topics_array])

    # Shows the result
    transformed = model.transform(dataset)
    transformed.show(truncate=False)
    user_vector = transformed.select('topicDistribution').collect()

    with open('idx.pickle', 'rb') as f:
        idx_item, item_idx, idx_user, user_idx, label, train = pickle.load(f)

    user_test = list(user_idx.keys())
    submit = []
    user_test_idx = []
    for uid in user_test:
        user_test_idx.append(user_idx.get(uid))
    for i in user_test_idx:
        item_rec = [idx_user[i]]
        user_vec = user_vector[i][0].toArray()
        sim = user_vec.dot(topics_array) / (np.linalg.norm(user_vec) * np.linalg.norm(topics_array))
        sim = np.argsort(-sim).tolist()
        item_rec.extend(idx_item[j] for j in sim)
        submit.append(item_rec)
    df = pd.DataFrame(submit)
    df.to_csv('submit.csv', header=None, index=None)

    # Save
    model.save('lda.model')

    # Stop
    spark.stop()
def lda_model_score(df, num_topics):
    """
    LDA pipeline: train LDA, extract topics, predict a topic for each data point.

    input:  df -> dataframe of reviews, num_topics -> int
    output: lda -> spark LDA with initialized parameters,
            model -> trained spark LDA,
            topics -> identified clustered topics,
            transformed -> reviews with predicted topics
    """
    lda = LDA(k=num_topics, optimizer="em")  # call spark LDA, initialized by number of topics
    model = lda.fit(df)  # fit model
    # "describe" the topics by the topic vocabulary determined by LDA
    topics = model.describeTopics(maxTermsPerTopic=30).collect()
    transformed = model.transform(df)  # generate predictions
    return lda, model, topics, transformed
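# --- Illustrative usage (not from the original source) ---
# A minimal sketch of calling lda_model_score() above; it assumes df already carries a
# "features" vector column (e.g. CountVectorizer or TF-IDF output). The toy corpus and
# variable names are assumptions for illustration only.
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark_example = SparkSession.builder.getOrCreate()
toy_reviews = spark_example.createDataFrame(
    [(0, Vectors.dense([1.0, 0.0, 2.0])), (1, Vectors.dense([0.0, 3.0, 1.0]))],
    ["id", "features"])
lda_est, lda_fitted, topic_rows, scored = lda_model_score(toy_reviews, num_topics=2)
scored.select("topicDistribution").show(truncate=False)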
def run_ml_pipeline(nlpPipelineDF, num_topics, max_iterations, vocabSize, minDF, maxDF):
    """Define a Spark LDA topic modelling pipeline"""
    cv = CountVectorizer(
        inputCol="allTokens",
        outputCol="features",
        vocabSize=vocabSize,
        minDF=minDF,
        maxDF=maxDF,
        minTF=1.0,
    )
    idf = IDF(inputCol="features", outputCol="idf")
    lda = LDA(
        k=num_topics,
        maxIter=max_iterations,
        optimizer="online",
        seed=1,
        learningOffset=100.0,  # If high, early iterations are downweighted during training
        learningDecay=0.51,    # Set between [0.5, 1) to guarantee asymptotic convergence
    )
    mlPipeline = Pipeline(stages=[cv, idf, lda])
    mlModel = mlPipeline.fit(nlpPipelineDF)
    ldaModel = mlModel.stages[2]
    return mlModel, ldaModel
def UsefulnessPredictionLDA(trainingdata, model):
    # Data Preprocessing
    tokenizer = Tokenizer(inputCol="review_text", outputCol="tokens_word")
    remover = StopWordsRemover(inputCol="tokens_word", outputCol="filtered_tokens_word")
    cv = CountVectorizer(inputCol="filtered_tokens_word", outputCol="raw_features", minDF=2.0)
    idf = IDF(inputCol="raw_features", outputCol="features")

    # Extract LDA topic feature
    lda = LDA(k=30, maxIter=10)
    if model == 'RandomForest':
        model = RandomForestRegressor(featuresCol="topicDistribution")
    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, lda, model])

    evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
    paramGrid = ParamGridBuilder() \
        .addGrid(cv.vocabSize, [150, 200, 250]) \
        .build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator_rmse,
                              numFolds=4)  # use 3+ folds in practice
    cvModel = crossval.fit(trainingdata)
    # Explain params for the selected model
    print(cvModel.explainParams())
    return cvModel
def UsefulnessPredictionLDAWithoutCV(trainingdata, model):
    # Data Preprocessing
    tokenizer = Tokenizer(inputCol="review_text", outputCol="tokens_word")
    remover = StopWordsRemover(inputCol="tokens_word", outputCol="filtered_tokens_word")
    cv = CountVectorizer(inputCol="filtered_tokens_word", outputCol="raw_features",
                         minDF=2.0, vocabSize=250)
    idf = IDF(inputCol="raw_features", outputCol="features")

    # Extract LDA topic feature
    lda = LDA(k=30, maxIter=10)
    if model == 'RandomForest':
        model = RandomForestRegressor(featuresCol="topicDistribution")
    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, lda, model])

    evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
    cvModel = pipeline.fit(trainingdata)
    # Explain params for the selected model
    print(cvModel.explainParams())
    return cvModel
def main():
    subreddit_group = spark.read.parquet(input_file).repartition(2000)
    # subreddit_group.show()

    # hashing = HashingTF(inputCol="comments", outputCol="features")
    count_vectorizer = CountVectorizer(inputCol="comments", outputCol="features")
    lda = LDA(k=10, maxIter=10, optimizer='online')
    pipeline = Pipeline(stages=[count_vectorizer, lda])
    model = pipeline.fit(subreddit_group)
    predictions = model.transform(subreddit_group).selectExpr('id', 'topicDistribution')

    change_to_str = F.udf(to_text)
    topics_df = predictions.select(
        predictions['id'],
        change_to_str(predictions['topicDistribution']).alias('topicDistribution'))
    # topics_df.show(20, False)
    topics_df.write.option('sep', ',').save(output, format='csv', mode='overwrite')
def training_model(train, k=10, maxiter=120, features_name="features",
                   optimizer_type="online", seed=123):
    lda = LDA(k=k, seed=seed, optimizer=optimizer_type, featuresCol=features_name,
              subsamplingRate=0.1, learningDecay=0.5, optimizeDocConcentration=True,
              maxIter=maxiter)
    ldamodel = lda.fit(train)
    predictionTrain = ldamodel.transform(train)
    return (ldamodel, predictionTrain)
def main():
    comments = spark.read.json(input_comments, schema=comments_schema).repartition(100)
    comm = comments.select(comments['subreddit'].alias('id'), comments['body']).limit(50)
    preprocess = F.udf(clean_data, returnType=types.ArrayType(types.StringType()))
    comm_split = comm.select(comm['id'], F.split(comm['body'], ' ').alias('comments'))
    sub_group = comm_split.groupBy(comm_split['id']).agg(F.collect_list('comments').alias('comments')) \
        .select(F.col('id'), F.col('comments'))
    comm_lemm = sub_group.select(
        sub_group['id'],
        preprocess(sub_group['comments']).alias('comments')).cache()

    # hashing_model = HashingTF(inputCol="comments", outputCol="features")
    # result = hashing_model.transform(comm_lemm)
    cv = CountVectorizer(inputCol="comments", outputCol="features")
    count_vectorizer_model = cv.fit(comm_lemm)
    result = count_vectorizer_model.transform(comm_lemm)
    result.show(truncate=False)
    # vocabArray = count_vectorizer_model.vocabulary
    # print(vocabArray)

    corpus = result.select(result['id'], result['features']).cache()
    lda = LDA(k=5, optimizer='online')
    lda_model = lda.fit(corpus)
    transformed = lda_model.transform(corpus)
    transformed.show(truncate=False)

    topic_text = F.udf(to_text)
    topics_df = transformed.select(
        transformed['id'],
        topic_text(transformed['topicDistribution']).alias('topicDistribution'))
    # topics_df.show(truncate=False)
    topics_df.write.option('sep', ',').save(output_file, format='csv', mode='overwrite')
def cv_idf_lda(df):
    '''
    CountVectorizer, IDF, LDA
    :param df: DataFrame with a tokenized "content" column
    :return: None (displays each topic with its top words and weights)
    '''
    # CountVectorizer
    cv = CountVectorizer(inputCol="content", outputCol="features")
    model_cv = cv.fit(df)
    vocabulary = model_cv.vocabulary
    df_cv = model_cv.transform(df)
    df_cv.cache()

    # IDF
    idf = IDF(inputCol="features", outputCol="cv")
    model_idf = idf.fit(df_cv)
    df_idf = model_idf.transform(df_cv)
    df_idf.cache()

    def getwords(row):
        '''
        Map term indices to their corresponding words via the vocabulary.
        :param row: (topic, termIndices, termWeights, termIndices)
        :return: [topic, termIndices, termWeights, words]
        '''
        words = list()
        for index in row[3]:
            words.append(vocabulary[index])
        return [row[0], row[1], row[2], words]

    # LDA
    lda = LDA(k=10, seed=1, optimizer="em")
    model_lda = lda.fit(df_idf)
    print(model_lda.describeTopics(maxTermsPerTopic=10))
    # model_lda.save('file:///home/pxz/model_lda/lda15')
    # print(lda[df_idf])
    # print(type(lda[df_idf]))

    # Top terms per topic: maxTermsPerTopic
    df_des = model_lda.describeTopics(maxTermsPerTopic=15)
    rdd_des = df_des.select("topic", "termIndices", "termWeights", df_des.termIndices).rdd.map(getwords)
    df_des = sparkEntrance.spark.createDataFrame(rdd_des, ['topic', 'termIndices', 'termWeights', 'words'])
    # return df_des
    df_des.select('topic', 'words', 'termWeights').show(truncate=False)
def lda_model(data):
    lda = LDA(
        k=LDA_CLUSTERS,
        # seed=123,
        # optimizer="em",
        featuresCol="vectors")  # TODO: grid-search the best parameters
    model = lda.fit(data)
    topics = model.describeTopics(maxTermsPerTopic=15)
    log.info("Learned topics (as distributions over vocab of " + str(model.vocabSize()) + " words):")

    wordNumbers = 10
    topicIndices = model.describeTopics(maxTermsPerTopic=wordNumbers)
    topicIndices.show()
    # Does not work as shown in the docs; seems to be a work in progress in the current Python API.
    show_lda_weights(model, topics)
def trainModel(docMatrix, savemodel, k, iterations=10, parallelization=16):
    data = mmread(docMatrix)
    rowRange = sc.parallelize(range(data.shape[0]), parallelization)
    dataSpark = spark.createDataFrame(
        rowRange.map(lambda i: Row(
            label=i, features=sparkToScipySparse(data.getrow(i)))))
    lda = LDA(k=k, maxIter=iterations)
    model = lda.fit(dataSpark)
    model.save(savemodel)

    topicMatrix = model.topicsMatrix().toArray()
    topicMatrix = topicMatrix.T
    topicMatrix = topicMatrix / topicMatrix.sum(axis=0)

    print('TODO: give wordXtopic.mtx a path')
    mmwrite('wordXtopic.mtx', topicMatrix)

    print('TODO: give docXtopic.mtx a path')
    docXTopics = model.transform(dataSpark)
    dxT = docXTopics.collect()
    dxT_v2 = np.array([dxtI['topicDistribution'] for dxtI in dxT])
    mmwrite('docXtopic.mtx', dxT_v2)
def set_lda_model(params: Dict[str, Any]):
    lda = LDA(
        k=params['topics'],
        maxIter=params['iter'],
        optimizer="online",
        seed=1,
        learningOffset=100.0,  # If high, early iterations are downweighted during training
        learningDecay=0.51,    # Set between [0.5, 1) to guarantee asymptotic convergence
    )
    return lda
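# --- Illustrative usage (not from the original source) ---
# A minimal sketch of set_lda_model() above: 'topics' and 'iter' are the only keys the
# helper reads; the param values and the toy DataFrame are assumptions for illustration.
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark_example = SparkSession.builder.getOrCreate()
toy_features = spark_example.createDataFrame(
    [(0, Vectors.dense([2.0, 0.0, 1.0])), (1, Vectors.dense([0.0, 1.0, 2.0]))],
    ["id", "features"])
lda_estimator = set_lda_model({'topics': 2, 'iter': 10})
lda_fitted = lda_estimator.fit(toy_features)
lda_fitted.describeTopics(3).show(truncate=False)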
def content_recom(self, file1, file2, tfidf_model, tfidf_lda_model, sentiment_file,
                  all_business_parquet, key_words, num_results=20):
    from pyspark import SparkContext
    from pyspark.sql import SparkSession
    sparkconf_builder = spark_celery_app.sparkconf_builder
    spark_conf = sparkconf_builder()
    sc = SparkContext.getOrCreate(conf=spark_conf)
    spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()

    data = spark.read.json(file1)
    df_business = spark.read.parquet(file2)
    df = data.select('business_id', 'text')
    review_rdd = df.rdd.map(tuple).reduceByKey(operator.add)
    review_df = spark.createDataFrame(review_rdd).withColumnRenamed(
        '_1', 'business_id').withColumnRenamed('_2', 'text')

    tfidf_model = PipelineModel.load(tfidf_model)
    result_tfidf = tfidf_model.transform(review_df)
    yelp = result_tfidf

    lda = LDA(k=15, maxIter=100)
    model = LocalLDAModel.load(tfidf_lda_model)
    # lda output column topicDistribution
    lda_df = model.transform(yelp)
    lda_vec = lda_df.select(
        'business_id', 'topicDistribution').rdd.map(lambda x: (x[0], x[1])).collect()
    result = get_keywords_recoms(key_words, num_results, tfidf_model, model, lda_vec)

    df_sentiment = spark.read.json(sentiment_file)
    df_content_rest = df_sentiment.join(
        result, 'business_id', 'inner').orderBy("sentiment_score", ascending=False).limit(num_results)
    all_busi_df = spark.read.parquet(all_business_parquet)
    df_rest_result = all_busi_df.join(df_content_rest, 'business_id', 'right').select(
        'business_id', 'sentiment_score', 'name', 'categories', 'score', 'latitude', 'longitude')
    df_rest_result.show()
    collected_df_rest_result = df_rest_result.collect()
    return collected_df_rest_result
def main():
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
    sys.stderr = codecs.getwriter('utf8')(sys.stderr)

    spark = SparkSession.builder.appName("LDA Batch Model").getOrCreate()
    sc = spark.sparkContext

    print(AWS_ACCESS_KEY_ID)
    print(AWS_SECRET_ACCESS_KEY)

    sc._jsc.hadoopConfiguration().set(
        "fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    sc._jsc.hadoopConfiguration().set('fs.s3a.access.key', AWS_ACCESS_KEY_ID)
    sc._jsc.hadoopConfiguration().set('fs.s3a.secret.key', AWS_SECRET_ACCESS_KEY)

    custom_stop_words = utils.load_stop_words(sc)
    texts_df = utils.load_texts(spark)

    pipeline = ml_utils.set_pipeline(custom_stop_words)
    model = pipeline.fit(texts_df)
    result = model.transform(texts_df)

    # Cluster the documents into NUMBER_OF_TOPICS topics using LDA
    lda = LDA(k=NUMBER_OF_TOPICS, maxIter=5, featuresCol="vectors")
    lda_model = lda.fit(result)

    # Describe topics
    topics = lda_model.describeTopics(3)
    print("The topics described by their top-weighted terms:")
    topics.show(truncate=False)

    # Shows the result
    transformed = lda_model.transform(result)
    transformed.show(truncate=False)

    # Save and load model
    lda_model.save("s3a://current-models/LDAModel")
    sc.stop()
def modelData(self, corp):
    print("Data modeling")
    # Cluster the data into n topics using LDA
    ldaModel = None
    if (self.persistSteps and not self.recompute
            and os.path.isdir(self.stepsPath + "ldaModel")):
        print("Model exists, loading")
        ldaModel = LocalLDAModel.load(self.stepsPath + "ldaModel")
    else:
        print("Creating Model")
        lda = LDA(k=self.kTopics, maxIter=100, optimizer='online')
        ldaModel = lda.fit(corp)
        if (self.persistSteps):
            print("Saving model")
            if (os.path.isdir(self.stepsPath + "ldaModel")):
                shutil.rmtree(self.stepsPath + "ldaModel")
            ldaModel.save(self.stepsPath + "ldaModel")

    print("Extracting Topics")
    self.topicIndices = ldaModel.describeTopics(maxTermsPerTopic=5)

    if (self.persistSteps and not self.recompute
            and os.path.isdir(self.stepsPath + "predictions")):
        print("Predictions exist, loading")
        self.predictions = self.spark.read.load(self.stepsPath + "predictions")
    else:
        print("Predicting Data")
        self.predictions = ldaModel.transform(corp)
        self.topics = ldaModel.topicsMatrix()
        if (self.persistSteps):
            print("Saving predictions data")
            if (os.path.isdir(self.stepsPath + "predictions")):
                shutil.rmtree(self.stepsPath + "predictions")
            self.predictions.select(
                "label", "features", "topicDistribution").write.save(self.stepsPath + "predictions")
def main():
    spark = SparkSession.builder \
        .appName("Spark CV-job ad matching") \
        .config("spark.some.config.option", "some-value") \
        .master("local[*]") \
        .getOrCreate()

    VOCAB_SIZE = 100
    MIN_DF = 1.0
    TOPIC_NUM = 50

    df_jobs = spark.read.json("alljobs4rdd/alljobs.jsonl").filter("description is not NULL")
    tokenizer = Tokenizer(inputCol="description", outputCol="words")
    tokenized = tokenizer.transform(df_jobs)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    removed = remover.transform(tokenized)
    processed = removed.rdd.map(lambda row: (
        row.jobId, lemmatize(strip_punctuation(row.filtered)))).toDF(["jobid", "processed"])

    countVectorizer = CountVectorizer(inputCol="processed", outputCol="rawFeatures",
                                      vocabSize=VOCAB_SIZE, minDF=MIN_DF, binary=False)
    cv_model = countVectorizer.fit(processed)
    featurizedData = cv_model.transform(processed)

    lda = LDA(k=TOPIC_NUM, seed=4314, optimizer="em")
    lda.setFeaturesCol("rawFeatures")
    model = lda.fit(featurizedData)
    vocab = cv_model.vocabulary
    model.describeTopics().rdd.map(lambda row: (row.topic, [vocab[x] for x in row.termIndices])) \
        .toDF(["Topic", "words"]) \
        .coalesce(1).rdd.saveAsTextFile('lda-topics-lemmatized')
from pyspark.ml.feature import Tokenizer, CountVectorizer

tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.drop("features"))
cv = CountVectorizer()\
    .setInputCol("DescOut")\
    .setOutputCol("features")\
    .setVocabSize(500)\
    .setMinTF(0)\
    .setMinDF(0)\
    .setBinary(True)
cvFitted = cv.fit(tokenized)
prepped = cvFitted.transform(tokenized)


# COMMAND ----------

from pyspark.ml.clustering import LDA
lda = LDA().setK(10).setMaxIter(5)
print(lda.explainParams())
model = lda.fit(prepped)


# COMMAND ----------

model.describeTopics(3).show()
cvFitted.vocabulary


# COMMAND ----------
Run with:
  bin/spark-submit examples/src/main/python/ml/lda_example.py
"""

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("LDAExample") \
        .getOrCreate()

    # $example on$
    # Loads items.
    dataset = spark.read.format("libsvm").load("items/mllib/sample_lda_libsvm_data.txt")

    # Trains a LDA model.
    lda = LDA(k=10, maxIter=10)
    model = lda.fit(dataset)

    ll = model.logLikelihood(dataset)
    lp = model.logPerplexity(dataset)
    print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
    print("The upper bound on perplexity: " + str(lp))

    # Describe topics.
    topics = model.describeTopics(3)
    print("The topics described by their top-weighted terms:")
    topics.show(truncate=False)

    # Shows the result
    transformed = model.transform(dataset)
    transformed.show(truncate=False)
featurizedData = cvmodel.transform(clean_text)

vocab = cvmodel.vocabulary
vocab_broadcast = sc.broadcast(vocab)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

################################################################################################
#
#   LDA Clustering - Find Data-driven Topics
#
################################################################################################

lda = LDA(k=25, seed=123, optimizer="em", featuresCol="features")
ldamodel = lda.fit(rescaledData)

# model.isDistributed()
# model.vocabSize()

ldatopics = ldamodel.describeTopics()
ldatopics.show(25)

def map_termID_to_Word(termIndices):
    words = []
    for termID in termIndices:
        words.append(vocab_broadcast.value[termID])
    return words
def trainModel(self):
    logger.info("Training the model...")
    query = '''select page_id, max(page_title) as page_title
               from cooladata
               where date_range(all) and page_id is not null
               group by page_id;'''

    def SQLtoURL(query):
        data = query.replace('\n', ' ').replace('\t', ' ').replace('  ', ' ').replace('  ', ' ')
        return data

    def QueryXXXXX(query, file=None):
        session = Session()
        response = session.post(
            data={'tq': query},
            url='https://app.XXXXXX.com/api/v2/projects/115659/cql/',
            headers={'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},)
        return response.content

    table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)), 'utf-8'))['table']
    title_list = [x['c'] for x in table['rows']]
    table_cols = [d['label'] for d in table['cols']]

    def convert_row(row):
        rowlist = [d['v'] for d in row]
        return rowlist

    rd = self.sc.parallelize(title_list).map(convert_row)
    titleData = self.spark.createDataFrame(rd, table_cols)
    titleData = titleData.dropna()

    hebrew_stopwords = stop_words()

    def rmv(words):
        for punc in punctuation:
            words = words.replace(punc, "")
        for hword in hebrew_stopwords:
            words = words.replace(hword, " ")
        return words

    self.spark.udf.register("rmv", rmv, StringType())

    titleData.registerTempTable("wordstable")
    cleanedSentenceData = self.spark.sql(
        "select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable")

    tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words")
    wordsData = tokenizer.transform(cleanedSentenceData)

    cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF=2.0)
    cvModel = cv.fit(wordsData)
    featurizedData = cvModel.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    lda = LDA(k=100)
    ldaModel = lda.fit(rescaledData)
    postFactorizedData = ldaModel.transform(rescaledData)

    norm = Normalizer(inputCol="topicDistribution", outputCol="normTopicDist")
    scaledFactorizedNormalizedData = norm.transform(postFactorizedData)

    self.model = scaledFactorizedNormalizedData
    logger.info("model is built!")