def execute():
    print 'Started at ' + get_time() + '... ',
    start_timing()
    # start from a clean topic-tweets database
    client.drop_database(TOPIC_TWEETS_DB_NAME)
    # take the most frequent entities, ordered by their tweet count
    results = entity_results_coll.find(limit=NUMBER_OF_TOP_ENTITIES, no_cursor_timeout=True) \
        .sort([(VALUE + '.' + COUNT, DESCENDING)])
    for result in results:
        tweets = []
        text = []
        lower_entity = result[LOWER_ENTITY]
        entities = result[VALUE][PSEUDONYMS]
        entity_pseudos[lower_entity] = entities
        max_tweets = 0
        # collect the tweets for every pseudonym of the entity
        for entity in entities:
            c = 0
            for tweet in raw_collection.find({ENTITIES: entity}):
                c += 1
                tweets.append(tweet)
                text.append(tweet[TWEET])
            # remember the pseudonym with the most tweets as the canonical name
            if c > max_tweets:
                actual_entity[lower_entity] = entity
                max_tweets = c
        # clean the collected tweet text and assign the dominant topic to the entity
        text = clean(text)
        topic_id = get_topic_for_entity(text, tweets)
        entity_topic[lower_entity] = topic_id
    save_to_collection()
    save_model_data()
    print 'Finished'
    stop_timing()
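
# Note: get_topic_for_entity is a project helper whose body is not shown here. The
# sketch below is one plausible implementation, assuming the gensim model and
# DICTIONARY saved by the LDA training step and that clean() returns a token list;
# the name get_topic_for_entity_sketch and the reuse of LDA_PATH are assumptions.
from gensim import models

lda = models.LdaModel.load(LDA_PATH)


def get_topic_for_entity_sketch(tokens, tweets):
    bow = DICTIONARY.doc2bow(tokens)             # map tokens to bag-of-words ids
    topic_probs = lda.get_document_topics(bow)   # list of (topic_id, probability) pairs
    # return the most probable topic; tweets is unused in this sketch
    return max(topic_probs, key=lambda pair: pair[1])[0]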
def execute():
    print 'Started at ' + get_time() + '... ',
    start_timing()
    hot_topics = [4, 5, 7, 8, 17]
    for topic in hot_topics:
        create_wordcloud(topic)
    print 'Finished'
    stop_timing()
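
# Note: create_wordcloud is defined elsewhere in the project. The sketch below shows
# one possible implementation, assuming the wordcloud package (a recent release whose
# generate_from_frequencies accepts a dict) and the gensim model saved at LDA_PATH;
# the output file name is a hypothetical choice, not the project's actual path.
from gensim import models
from wordcloud import WordCloud

lda = models.LdaModel.load(LDA_PATH)


def create_wordcloud_sketch(topic_id, topn=30):
    terms = lda.show_topic(topic_id, topn=topn)      # (word, weight) pairs in gensim >= 1.0
    frequencies = {word: weight for word, weight in terms}
    cloud = WordCloud(width=800, height=600, background_color='white')
    cloud.generate_from_frequencies(frequencies)     # size words by their topic weight
    cloud.to_file('topic_%d_wordcloud.png' % topic_id)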
def execute():
    print 'Started LDA at ' + get_time() + '... ',
    start_timing()
    lda = models.LdaModel(CORPUS, id2word=DICTIONARY, num_topics=NUMBER_OF_TOPICS,
                          passes=NUMBER_OF_PASSES, alpha=ALPHA)
    lda.save(LDA_PATH)
    print 'Finished'
    stop_timing()
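
# A minimal usage sketch (not part of the original script): reloading the saved model
# and printing the top ten words of each topic, assuming the same LDA_PATH and
# NUMBER_OF_TOPICS constants as above.
from gensim import models

lda = models.LdaModel.load(LDA_PATH)
for topic_id in range(NUMBER_OF_TOPICS):
    print 'Topic %d: %s' % (topic_id, lda.print_topic(topic_id, topn=10))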
        if tweets_cnt == MAX_TWEETS_IN_FILE:
            tweets_cnt = 0
            change_file()
        return True

    def on_error(self, status):
        print 'Error: ' + str(status)


if __name__ == '__main__':
    file_name = get_filename(FILE_PATH, file_number)
    tweets_file = open(file_name, WRITE)
    print "Started extracting tweets at " + get_time() + "... "
    while True:  # ensures continuous stream extraction
        try:
            # This handles Twitter authentication and the connection to the Twitter Streaming API
            l = StdOutListener()
            auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
            auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
            stream = Stream(auth, l)
            stream.filter(languages=[ENGLISH], track=FILTER_KEYWORDS)
        except:  # TODO
            continue
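
# Note: get_filename and change_file are helpers defined elsewhere in the project and
# are not shown in this fragment. The sketch below illustrates one plausible file
# rotation scheme; the naming pattern, the globals and the reuse of FILE_PATH/WRITE
# are assumptions, not the project's actual code.
def get_filename_sketch(path, number):
    return '%stweets_%d.json' % (path, number)   # e.g. <FILE_PATH>tweets_3.json


def change_file_sketch():
    global tweets_file, file_number
    tweets_file.close()                          # close the chunk that just filled up
    file_number += 1
    tweets_file = open(get_filename_sketch(FILE_PATH, file_number), WRITE)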