def get_tweets_for_topic(topic, number_of_results=10000000, find_synonyms=True):
    """Query the local Solr index for tweets matching a topic.

    The search covers the day that ends at ``topic['date']`` and matches the
    sanitized topic title against the ``content_stemmed`` field.

    Args:
        topic: mapping with a ``'date'`` entry (must support subtracting a
            ``timedelta`` and ``isoformat()``) and a ``'title'`` entry.
        number_of_results: value passed to Solr as ``rows``.
        find_synonyms: forwarded as the second argument of
            ``sw_utils.sanitize_string_for_search``.

    Returns:
        The ``results`` attribute of the Solr response.
    """
    connection = solr.SolrConnection('http://localhost:8983/solr')

    # The window is [topic date - 1 day, topic date].
    window_end = topic['date']
    window_start = window_end - timedelta(days=1)
    start_iso = window_start.isoformat()
    end_iso = window_end.isoformat()

    search_query = sw_utils.sanitize_string_for_search(topic['title'], find_synonyms)

    print("FROM " + start_iso + " TO " + end_iso)
    print(search_query)

    # A 'Z' is appended to each ISO timestamp inside the date range clause.
    solr_query = ''.join([
        'date:[', start_iso, 'Z TO ', end_iso,
        'Z] AND content_stemmed:', search_query,
    ])
    response = connection.query(solr_query, rows=number_of_results)
    return response.results
def process_all_topics():
    """Rank tweets and keywords for every topic and append them to text files.

    For each topic returned by ``parse_topics()`` (numbered from 1):

    1. Fetch the topic's tweets with ``get_tweets_for_topic`` and join their
       text with ``join_tweet_texts``.
    2. Tokenize and POS-tag the text with NLTK, then pass the tagged words
       through ``filter_for_tags`` and ``normalize``.
    3. Build a graph from the unique words, run
       ``generate_pagerank_from_graph`` on it, sort the result and extract
       the most representative words.
    4. Rank the tweets against those words and call
       ``add_new_tweets_to_tweets_list`` with word combinations derived from
       the sanitized topic title.
    5. Append ``rank,date,content`` lines to
       ``ranked_tweets_path + <n> + ranked_tweets_file + '.txt'`` and
       ``rank,word`` lines to
       ``ranked_words_path + <n> + ranked_words_file + '.txt'``.

    Progress and results are also printed to standard output.
    Returns nothing.
    """
    for topic_number, topic in enumerate(parse_topics(), 1):
        tweets = get_tweets_for_topic(topic)
        text = join_tweet_texts(tweets)

        print(strftime("%Y-%m-%d %H:%M:%S", gmtime()) + " Dividiendo texto en palabras...")
        tokens = nltk.word_tokenize(text)
        print(strftime("%Y-%m-%d %H:%M:%S", gmtime()) + " Etiquetando texto...")
        tagged = nltk.pos_tag(tokens)
        tagged = filter_for_tags(tagged)
        tagged = normalize(tagged)

        # Word graph -> PageRank -> most representative words.
        unique_word_set = unique_everseen([entry[0] for entry in tagged])
        graph = generate_digraph_from_unique_word_set(unique_word_set, tagged)
        pagerank = generate_pagerank_from_graph(graph)
        sorted_pagerank = sort_pagerank_items(pagerank)
        most_representative_words = extract_most_representative_ranked_words(sorted_pagerank)

        # Score the tweets.
        ranked_tweets = sort_tweets_by_rank(most_representative_words, tweets)

        # The sanitized title is computed once and reused for the final
        # "Tweets for" message below.
        search_query = sw_utils.sanitize_string_for_search(topic['title'])
        search_query_words = nltk.word_tokenize(search_query)
        words_to_expand = get_words_to_expand(most_representative_words, search_query_words)
        word_combinations = get_word_combinations(search_query_words, words_to_expand)
        # NOTE(review): the return value is ignored, so this presumably
        # extends ranked_tweets in place -- confirm against the helper.
        add_new_tweets_to_tweets_list(ranked_tweets, topic, word_combinations)

        for tweet in ranked_tweets:
            print(tweet[u'date'].isoformat() + str(tweet[u'rank']) + ' - ' + tweet[u'content'])

        start_date = topic['date'] - timedelta(days=1)
        end_date = topic['date']
        print('From date: ' + start_date.isoformat() + ' to ' + end_date.isoformat())

        # Append mode: repeated runs accumulate lines in the same file.
        tweets_file_name = ranked_tweets_path + str(topic_number) + ranked_tweets_file + '.txt'
        with open(tweets_file_name, "a") as f:
            for tweet in ranked_tweets:
                f.write(str(tweet[u'rank']) + ',' + tweet[u'date'].isoformat() + ','
                        + tweet[u'content'].encode('utf-8') + '\n')
                print(tweet[u'date'].isoformat() + str(tweet[u'rank']) + ' - ' + tweet[u'content'])

        print('Tweets for: ' + search_query)

        words_file_name = ranked_words_path + str(topic_number) + ranked_words_file + '.txt'
        with open(words_file_name, "a") as f:
            for word in most_representative_words:
                f.write(str(word[u'rank']) + ',' + word[u'word'].encode('utf-8') + '\n')
                print(str(word[u'rank']) + " " + word[u'word'])
def process_english_tweet(tweet_content, tweet_date, tweet_id): tweet_stemmed = sw_utils.sanitize_string_for_search(tweet_content) #Solamente interesan aquellos tweets de las ultimas 24 horas y por tanto deben tener una fecha if tweet_date != 'NO_TIME': tweet_date = parser.parse(tweet_date) print str(tweet_date) + " " + tweet_stemmed