from datetime import timedelta
from time import gmtime, strftime

import nltk
import solr  # assumed: the solrpy client, which provides SolrConnection

import sw_utils  # project-local helpers


def get_tweets_for_topic(topic, number_of_results=10000000, find_synonyms=True):
    """Query the Solr index for tweets matching the topic title."""
    # print 'Getting tweets for ' + topic['title']
    s = solr.SolrConnection('http://localhost:8983/solr')  # connect to the local index


    start_date = topic['date'] - timedelta(days=1)
    end_date = topic['date']  # +timedelta(days=1)

    search_query = sw_utils.sanitize_string_for_search(topic['title'], find_synonyms)

    print "FROM " + start_date.isoformat() + " TO " + end_date.isoformat()

    print search_query

    response = s.query('date:['+start_date.isoformat()+'Z TO ' +
                       end_date.isoformat() +'Z] AND content_stemmed:' +
                       search_query, rows=number_of_results)

    return response.results
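
# A minimal usage sketch (illustrative only, not from the original repo); it
# assumes parse_topics() yields topic dicts with 'title' and 'date' keys, as
# process_all_topics() below suggests:
def example_fetch_tweets():
    for topic in parse_topics():
        tweets = get_tweets_for_topic(topic, number_of_results=100)
        print topic['title'] + ': ' + str(len(tweets)) + ' tweets'
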
def process_all_topics():
    """For each topic, extract its most representative words and rank its tweets."""
    topics = parse_topics()

    i = 1
    for topic in topics:

        tweets = get_tweets_for_topic(topic)

        text = join_tweet_texts(tweets)

        print strftime("%Y-%m-%d %H:%M:%S", gmtime()) + " Dividiendo texto en palabras..."
        text = nltk.word_tokenize(text)

        print strftime("%Y-%m-%d %H:%M:%S", gmtime()) + " Etiquetando texto..."
        tagged = nltk.pos_tag(text)

        tagged = filter_for_tags(tagged)
        tagged = normalize(tagged)

        # Deduplicate the words (first-seen order preserved) and build the word graph.
        unique_word_set = unique_everseen([x[0] for x in tagged])
        graph = generate_digraph_from_unique_word_set(unique_word_set, tagged)

        # Rank the words with PageRank and keep the most representative ones.
        pagerank_items = generate_pagerank_from_graph(graph)
        sorted_pagerank_items = sort_pagerank_items(pagerank_items)
        most_representative_words = extract_most_representative_ranked_words(sorted_pagerank_items)

        # Score the tweets against the most representative words.
        ranked_tweets = sort_tweets_by_rank(most_representative_words, tweets)

        # Expand the original query with top-ranked words and fetch extra tweets.
        search_query = sw_utils.sanitize_string_for_search(topic['title'])
        search_query_words = nltk.word_tokenize(search_query)

        words_to_expand = get_words_to_expand(most_representative_words, search_query_words)
        word_combinations = get_word_combinations(search_query_words, words_to_expand)
        add_new_tweets_to_tweets_list(ranked_tweets, topic, word_combinations)

        start_date = topic['date'] - timedelta(days=1)
        end_date = topic['date']
        print 'From date: ' + start_date.isoformat() + ' to ' + end_date.isoformat()

        # Append the ranked tweets for this topic to its output file.
        with open(ranked_tweets_path + str(i) + ranked_tweets_file + '.txt', "a") as f:
            for tweet in ranked_tweets:
                f.write(str(tweet[u'rank']) + ',' + tweet[u'date'].isoformat() + ',' + tweet[u'content'].encode('utf-8') + '\n')
                print tweet[u'date'].isoformat() + ' ' + str(tweet[u'rank']) + ' - ' + tweet[u'content']

        print 'Tweets for: ' + search_query

        # Append the top-ranked words for this topic to its output file.
        with open(ranked_words_path + str(i) + ranked_words_file + '.txt', "a") as f:
            for word in most_representative_words:
                f.write(str(word[u'rank']) + ',' + word[u'word'].encode('utf-8') + '\n')
                print str(word[u'rank']) + " " + word[u'word']
        i += 1
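
# The loop above is a TextRank-style keyword extractor; the graph helpers
# (generate_digraph_from_unique_word_set, generate_pagerank_from_graph) are
# defined elsewhere in the repo. Purely as an illustrative sketch of that step
# -- an assumption, not the repo's actual implementation -- word ranking can
# be computed with networkx over a word-adjacency digraph:
import networkx as nx

def sketch_rank_words(words):
    graph = nx.DiGraph()
    graph.add_nodes_from(set(words))
    # Link each word to the word that follows it in the text.
    for first, second in zip(words, words[1:]):
        graph.add_edge(first, second)
    scores = nx.pagerank(graph)  # dict of {word: PageRank score}
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)
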
Example #3
from dateutil import parser  # assumed source of parser.parse


def process_english_tweet(tweet_content, tweet_date, tweet_id):
    tweet_stemmed = sw_utils.sanitize_string_for_search(tweet_content)
    # Only tweets from the last 24 hours are of interest, so they must carry a date.
    if tweet_date != 'NO_TIME':
        tweet_date = parser.parse(tweet_date)
        print str(tweet_date) + " " + tweet_stemmed
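
# A minimal usage sketch with hypothetical sample values ('NO_TIME' is the
# sentinel the function checks when a tweet carries no timestamp):
if __name__ == '__main__':
    process_english_tweet('Sample tweet text', '2013-05-21T10:30:00Z', '1234567890')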