from pyspark.sql import Row
from pyspark.ml.feature import CountVectorizer
from pyspark.mllib.clustering import LDA
from pyspark.mllib.linalg import Vectors
import unidecode


def LDA_Treatment(text):
    """Run LDA topic extraction on a single document and return its top terms."""
    finalTopics = []
    txt = wordTokenize(text)
    # Build a one-document RDD of Rows: (idd, list of tokens).
    data = sc.parallelize([txt]).zipWithIndex().map(
        lambda val: Row(idd=val[1], _words=val[0].split(" ")))
    docDF = spark.createDataFrame(data)
    # Turn the token lists into term-count vectors.
    Vector = CountVectorizer(inputCol="_words", outputCol="vectors")
    model = Vector.fit(docDF)
    result = model.transform(docDF)
    # LDA.train (mllib) expects an RDD of [doc id, mllib Vector].
    corpus = result.select("idd", "vectors").rdd.map(
        lambda val: [val[0], Vectors.fromML(val[1])]).cache()
    ldaModel = LDA.train(corpus, k=nbTopics, maxIterations=1000, optimizer='online')
    topics = ldaModel.topicsMatrix()
    vocabArray = model.vocabulary
    topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))
    topics_final = topicIndices.map(lambda topic: topic_render(topic, vocabArray)).collect()
    # Flatten the per-topic term lists into a single ASCII-normalised list.
    for topic in range(len(topics_final)):
        for term in topics_final[topic]:
            term = unidecode.unidecode(term)
            finalTopics.append(term)
    return finalTopics
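# LDA_Treatment relies on a topic_render helper and on sc, spark, nbTopics and wordNumbers
# being defined elsewhere in the project. A minimal sketch of topic_render, assuming each
# element returned by describeTopics() is a (term indices, term weights) pair, could be:
def topic_render(topic, vocabArray):
    term_indices = topic[0]
    return [vocabArray[i] for i in term_indices]

# Example call, assuming the globals above are in place:
# topics = LDA_Treatment("some raw document text")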
# new_tweet = ''
# for tok in tokens:
#     if tok not in stop_words and tok[0] != '@' and tok[0] != '#' and tok[:4] != 'http':
#         new_tweet += ' ' + tok
# new_tweet = new_tweet.strip()
# tweets.append(new_tweet)
# f.close()
# fd = codecs.open('cleaned_example.txt', 'w', encoding='utf-8')
# for tweet in tweets:
#     fd.write(tweet + '\n')

rdd = sc.textFile('opinion.txt').zipWithIndex().map(
    lambda val: Row(idd=val[1], words=val[0].split(" ")))
docDF = spark.createDataFrame(rdd)

Vector = CountVectorizer(inputCol="words", outputCol="vectors")
model = Vector.fit(docDF)
result = model.transform(docDF)

# LDA.train (mllib) expects an RDD of [doc id, mllib Vector].
corpus = result.select("idd", "vectors").rdd.map(
    lambda row: [row[0], Vectors.fromML(row[1])]).cache()

# Cluster the documents into five topics using LDA
ldaModel = LDA.train(corpus, k=5, maxIterations=100, optimizer='online')

topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary

wordNumbers = 5  # number of words per topic
topicIndices = sc.parallelize(
    ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))
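# A possible follow-up (not in the original snippet): map the term indices returned by
# describeTopics back to vocabulary words and print each topic with its term weights.
topics_rendered = topicIndices.map(
    lambda topic: [(vocabArray[i], w) for i, w in zip(topic[0], topic[1])]).collect()
for i, terms in enumerate(topics_rendered):
    print("Topic {}: {}".format(i, terms))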
from pyspark.sql import SQLContext, Row
from pyspark.ml.feature import CountVectorizer
from pyspark.sql.types import StringType
from pyspark.sql.functions import array, monotonically_increasing_id
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

# http://spark.apache.org/docs/latest/mllib-clustering.html#latent-dirichlet-allocation-lda

# Load and parse the data
# data = to_list[0:1000]
corpus_df = spark.createDataFrame(data, StringType())
corpus_df = corpus_df.withColumn("index", monotonically_increasing_id())
corpus_df = corpus_df.withColumn("arrayColumn", array("value"))
# data = sc.textFile(path).zipWithIndex().map(lambda val: Row(idd=val[1], words=val[0].split(" ")))
# docDF = spark.createDataFrame(data)

Vector = CountVectorizer(inputCol="arrayColumn", outputCol="vectors")
model = Vector.fit(corpus_df)
result = model.transform(corpus_df)

# LDA.train (mllib) expects an RDD of [doc id, mllib Vector], not a DataFrame.
corpus = result.select("index", "vectors").rdd.map(
    lambda row: [row[0], Vectors.fromML(row[1])]).cache()

# Cluster the documents into ten topics using LDA
ldaModel = LDA.train(corpus, k=10)
# ldaModel.save(sc, "target/org/apache/spark/PythonLatentDirichletAllocationExample/LDAModel")
# sameModel = LDAModel \
#     .load(sc, "target/org/apache/spark/PythonLatentDirichletAllocationExample/LDAModel")

# COMMAND ----------

num_topics = 10
max_iterations = 100
lda_model = LDA.train(corpus, k=num_topics, maxIterations=max_iterations)
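# Alternative sketch (not in the original script): the DataFrame-based pyspark.ml LDA can be
# fit directly on the CountVectorizer output, avoiding the mllib Vector conversion. The
# column name "vectors" and the 'online' optimizer mirror the calls above.
from pyspark.ml.clustering import LDA as MLLDA

ml_lda = MLLDA(k=num_topics, maxIter=max_iterations, featuresCol="vectors", optimizer="online")
ml_model = ml_lda.fit(result)
ml_model.describeTopics(5).show(truncate=False)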
# Load and tokenise the documents (path is assumed to be defined elsewhere).
parsedData = sc.textFile(path).zipWithIndex().map(
    lambda val: Row(idd=val[1], words=val[0].split(' ')))
docDF = sqlContext.createDataFrame(parsedData)
# docDF.show()

# Stopword remover
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
remover.transform(docDF).show(truncate=False)

Vector = CountVectorizer(inputCol="filtered", outputCol="vectors")
pipeline = Pipeline(stages=[remover])
model_prep = pipeline.fit(docDF)
result = model_prep.transform(docDF)
model = Vector.fit(result)
result = model.transform(result)
result.show()

# LDA.train (mllib) expects an RDD of [doc id, mllib Vector].
corpus = result.select("idd", "vectors").rdd.map(
    lambda row: [row[0], Vectors.fromML(row[1])]).cache()

# Cluster the documents into six topics using LDA
ldaModel = LDA.train(corpus, k=6, maxIterations=100, optimizer='online')

topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary
topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic=20))
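# Design note (a sketch, not part of the original script): the StopWordsRemover and
# CountVectorizer above are fitted separately even though a Pipeline is created with only
# one stage. Putting both stages into a single Pipeline keeps the preprocessing in one object.
full_pipeline = Pipeline(stages=[remover, Vector])
prep_model = full_pipeline.fit(docDF)
prepared = prep_model.transform(docDF)
# The fitted CountVectorizerModel (and its vocabulary) is the second pipeline stage.
vocab = prep_model.stages[1].vocabulary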