# Bag-of-words corpus: one (token_id, count) list per tokenized document.
# NOTE(review): `dictionary`/`texts` come from an earlier cell — gensim-style API.
corpus = [dictionary.doc2bow(text) for text in texts]

# COMMAND ----------

# Keep only the first 100 documents for this run.
data = to_list[:100]

# COMMAND ----------

# Display the sample (notebook cell output).
data

# COMMAND ----------

# Fit a 10-topic LDA model on `dataset` (defined in an earlier cell).
lda = LDA(k=10, maxIter=10)
model = lda.fit(dataset)

# Report the model-fit diagnostics.
ll = model.logLikelihood(dataset)
lp = model.logPerplexity(dataset)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

# Show the top 3 weighted terms of each topic.
topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

# Per-document topic distributions.
transformed = model.transform(dataset)
transformed.show(truncate=False)
df_comments = sqlContext.createDataFrame(comments, ["list_of_words", 'index']) # TF cv = CountVectorizer(inputCol="list_of_words", outputCol="raw_features", vocabSize=50000, minDF=10.0) cvmodel = cv.fit(df_comments) result_cv = cvmodel.transform(df_comments) # IDF idf = IDF(inputCol="raw_features", outputCol="features") idfModel = idf.fit(result_cv) result_tfidf = idfModel.transform(result_cv) lda = LDA(k=3, maxIter=50) model = lda.fit(result_tfidf[['index', 'features']]) transformed = model.transform(result_tfidf) # transformed.show(truncate=False) model.describeTopics(8).show() # ll = model.logLikelihood(result_tfidf[['index','features']]) # lp = model.logPerplexity(result_tfidf[['index','features']]) vocabulary = {} j = 0 for i in cvmodel.vocabulary: vocabulary[j] = i.encode("utf-8") j += 1
# Feature vectors only; cap the row count for this LDA run.
# you can choose number or comments you want to run LDA on
p = rescaledData.select('features').limit(650000)
# p.count()
# p.show(3)

import threading
import logging

# Log progress to a file so the long fit can be monitored.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO,
                    filename='running.log',
                    filemode='w')

# Calculating LDA: fit a 20-topic model and report the wall-clock time.
start = time()
lda = LDA(k=20, maxIter=500)
model = lda.fit(p)
print('used LDA: {:.2f}s'.format(time() - start))
# model.isDistributed()

# start = time()
# ll = model.logLikelihood(p)
# lp = model.logPerplexity(p)
# print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
# print("The upper bound on perplexity: " + str(lp))
# print ('used: {:.2f}s'.format(time()-start))

start = time()

# Top 15 weighted terms per topic.
topics = model.describeTopics(15)