# LDA topic-modeling pipeline over tweets stored in a Spark SQL table.
# BUG FIX: the original line breaks of this script were lost, so everything
# after the first `#` on the line had become one giant comment and only the
# first statement ever executed. The statements below are restored to their
# own lines so the whole pipeline runs again.
#
# NOTE(review): `sqlContext` and `twpr` are assumed to be defined/imported
# earlier in the file (outside this view) — confirm before running standalone.

# select the raw tweet texts from the registered "tweets" table
tweet_texts = sqlContext.sql("SELECT text FROM tweets")

# run the processing .run() in the processing.py for the texts
# as output, we have pre-processed texts ready for gensim dictionary and gensim building
texts = twpr.run(tweet_texts)

# build Gensim dictionary and corpus with helper methods in processing.py
dictionary = twpr.buildDictionaryFromTexts(texts)
corpus = twpr.buildCorpusFromDictionaryAndTexts(texts, dictionary)

# set LDA topic count parameter
num_topics = 25

# in order to map LDA output and actual tweets for further analysis, select tweet IDs and texts
tweet_ids = sqlContext.sql("SELECT id_str as id, text FROM tweets")

# now we have all necessary pre-processed data for LDA analysis;
# use the pre-processed inputs to do the LDA analysis
distros = twpr.doLDA(corpus, dictionary, num_topics, tweet_ids)

# distros is an Apache Spark RDD: we can either .take(5) or .collect() all
distros_all = distros.collect()

# now we have the LDA topic probability distributions in memory;
# also fit an HDP model on the same corpus for comparison
hdp = twpr.doHDP(corpus, dictionary)

# to make sense of the LDA output, we need to somehow look at the data —
# thus, we'll write the topics into CSV, weighted with TF-IDF frequencies
# (distros_all[0:-1] drops the last element; presumably a sentinel/footer row — TODO confirm)
topics = twpr.TFIDFsFromTopicDistributions(distros_all[0:-1], sqlContext, corpus, dictionary)
twpr.writeTFIDFsToCSV(topics)
# processing done
# LDA topic-modeling pipeline over tweets fetched from MongoDB (via twpr.User).
# BUG FIX: the original line breaks of this script were lost, so everything
# after the first inline `#` comment had become one giant comment and only the
# gensim import ever executed. The statements below are restored to their own
# lines so the whole pipeline runs again.
#
# NOTE(review): `twpr` is assumed to be imported earlier in the file (outside
# this view) — confirm before running standalone.

from gensim import corpora, models, similarities  # needed for text clustering
import sklearn
import nltk
import csv

# nltk.download() # ensure all the necessary corpora are present for lemmatization

# Use MongoDB to fetch the users (filtering by ratio_per_tweet is currently disabled)
users = twpr.User.objects()  # .order_by('ratio_per_tweet').limit(100)

# from those users, get the tweet texts
tweet_texts = twpr.User.getTextsFromUsers(users)

# and then pre-process those texts
texts = twpr.runWithoutMap(tweet_texts)

# build dictionary and corpus
dictionary = twpr.buildDictionaryFromTexts(texts)
corpus = twpr.buildCorpusFromDictionaryAndTexts(texts, dictionary)

# set LDA topic count parameter
num_topics = 25

# and do the LDA modeling
# (update_every=0 selects batch training; passes=20 iterates the corpus 20 times)
lda = models.ldamodel.LdaModel(corpus=corpus,
                               id2word=dictionary,
                               num_topics=num_topics,
                               update_every=0,
                               passes=20)

# get the topic distributions for each tweet and persist them as CSV
tweets_for_lda = twpr.User.getTweetsFromUsers(users)
distros = twpr.distrosForTweetsFromLDAModel(lda, dictionary, tweets_for_lda)
twpr.writeMongoDistrosIntoCSV(distros, num_topics, 'new_distros.csv')

# write the topic keywords with the TF-IDF frequencies as weights
topics = twpr.TFIDFsFromMongoDBTopicDistributions(distros, corpus, dictionary)
twpr.writeTFIDFsToCSV(topics, 'new_tfidf.csv')