from pymongo import DESCENDING

# Collection handles, constants (e.g. VALUE, COUNT, ENTITIES) and helpers such as
# get_time(), clean() and get_topic_for_entity() are assumed to be defined or
# imported elsewhere in this module.
def execute():
    print 'Started at ' + get_time() + '... ',
    start_timing()

    # Rebuild the per-topic tweet database from scratch.
    client.drop_database(TOPIC_TWEETS_DB_NAME)

    # Take the most frequently mentioned entities, ordered by mention count.
    results = entity_results_coll.find(limit=NUMBER_OF_TOP_ENTITIES, no_cursor_timeout=True) \
        .sort([(VALUE + '.' + COUNT, DESCENDING)])

    for result in results:
        tweets = []
        text = []
        lower_entity = result[LOWER_ENTITY]
        entities = result[VALUE][PSEUDONYMS]
        entity_pseudos[lower_entity] = entities

        # Collect every tweet that mentions any pseudonym of this entity and
        # remember the pseudonym with the most tweets as the canonical form.
        max_tweets = 0
        for entity in entities:
            c = 0
            for tweet in raw_collection.find({ENTITIES: entity}):
                c += 1
                tweets.append(tweet)
                text.append(tweet[TWEET])
            if c > max_tweets:
                actual_entity[lower_entity] = entity
                max_tweets = c

        # Clean the tweet text and assign the entity to its dominant topic.
        text = clean(text)
        topic_id = get_topic_for_entity(text, tweets)
        entity_topic[lower_entity] = topic_id

    save_to_collection()
    save_model_data()
    print 'Finished'
    stop_timing()
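The dominant-topic lookup itself is not shown in this listing. A minimal sketch of how get_topic_for_entity could work with gensim is given below; the single-argument signature, the helper name get_topic_for_entity_sketch and LDA_MODEL_PATH are assumptions for illustration, not the project's actual code.

from gensim import corpora, models

def get_topic_for_entity_sketch(tokenized_docs):
    # Hypothetical illustration: load the persisted dictionary and a trained
    # LDA model (LDA_MODEL_PATH is an assumed name), merge the entity's
    # tokenised tweets into one bag-of-words document and return the most
    # probable topic id.
    dictionary = corpora.Dictionary.load(DICTIONARY_PATH)
    lda = models.LdaModel.load(LDA_MODEL_PATH)
    bow = dictionary.doc2bow([token for doc in tokenized_docs for token in doc])
    topic_id, _ = max(lda.get_document_topics(bow), key=lambda pair: pair[1])
    return topic_id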
from gensim import corpora

def execute():
    start_timing()
    print 'Starting Pre-processing for LDA...',

    # Fetch the raw documents and tokenise/clean them.
    documents = get_documents()
    tokenized_documents = clean(documents)

    # Build the term dictionary, remove gaps in the id mapping, and persist it.
    dictionary = corpora.Dictionary(tokenized_documents)
    dictionary.compactify()
    dictionary.save(DICTIONARY_PATH)

    # Convert each document to a bag-of-words vector and serialise the corpus
    # in Matrix Market format for later LDA training.
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]
    corpora.MmCorpus.serialize(CORPUS_PATH, corpus)

    print 'Finished'
    stop_timing()
    client.close()
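The serialised dictionary and Matrix Market corpus are the inputs an LDA training step would consume. A minimal sketch of that downstream use with gensim follows; NUM_TOPICS, LDA_MODEL_PATH and the function name train_lda_sketch are assumed names for illustration only.

from gensim import corpora, models

def train_lda_sketch():
    # Hypothetical illustration: reload the artifacts produced above and fit
    # an LDA model on them. NUM_TOPICS and LDA_MODEL_PATH are assumed names.
    dictionary = corpora.Dictionary.load(DICTIONARY_PATH)
    corpus = corpora.MmCorpus(CORPUS_PATH)
    lda = models.LdaModel(corpus=corpus, id2word=dictionary,
                          num_topics=NUM_TOPICS, passes=10)
    lda.save(LDA_MODEL_PATH)
    return lda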