def term_freq_histogram(tweet_file_name):
    '''Print the relative frequency of every term found in a tweet file.

        Each line of the file is passed through extract_txt_from_json_string
        and strip_punct, split on whitespace and lower-cased. A term's
        frequency is its number of occurrences divided by the total number
        of terms counted across the whole file.

    Args:
        tweet_file_name (string): Filename of a text file that is read line
         by line; every line is handed to extract_txt_from_json_string.

    Returns:
        (None) prints one "term frequency" line per distinct term to stdout
    '''
    frequency_dict = {}
    word_count = 0

    with open(tweet_file_name, 'r') as tweets:
        for tweet in tweets:
            tweet = extract_txt_from_json_string(tweet)
            tweet = strip_punct(tweet)

            for word in tweet.split():
                word = word.lower()
                word_count += 1
                frequency_dict[word] = frequency_dict.get(word, 0) + 1

    # Reporting only needs the collected counts, so it runs after the input
    # file has been closed. Division by zero cannot happen: when no word was
    # counted the dictionary is empty and the loop body never executes.
    total_words = float(word_count)
    for each_word, each_count in frequency_dict.items():
        # A single-argument print() with %-formatting emits the same
        # "word value" line under both Python 2 and Python 3.
        print('%s %s' % (each_word, each_count / total_words))
def term_freq_histogram(tweet_file_name):
    '''Print the relative frequency of every term found in a tweet file.

        Each line of the file is passed through extract_txt_from_json_string
        and strip_punct, split on whitespace and lower-cased. A term's
        frequency is its number of occurrences divided by the total number
        of terms counted across the whole file.

    Args:
        tweet_file_name (string): Filename of a text file that is read line
         by line; every line is handed to extract_txt_from_json_string.

    Returns:
        (None) prints one "term frequency" line per distinct term to stdout
    '''
    frequency_dict = {}
    word_count = 0

    with open(tweet_file_name, 'r') as tweets:
        for tweet in tweets:
            tweet = extract_txt_from_json_string(tweet)
            tweet = strip_punct(tweet)

            for word in tweet.split():
                word = word.lower()
                word_count += 1
                frequency_dict[word] = frequency_dict.get(word, 0) + 1

    # Reporting only needs the collected counts, so it runs after the input
    # file has been closed. Division by zero cannot happen: when no word was
    # counted the dictionary is empty and the loop body never executes.
    total_words = float(word_count)
    for each_word, each_count in frequency_dict.items():
        # A single-argument print() with %-formatting emits the same
        # "word value" line under both Python 2 and Python 3.
        print('%s %s' % (each_word, each_count / total_words))
def report_UNK_avg_sentiment(sentiments_file_name, tweet_file_name):
    '''Print a sentiment score for every word of every tweet in a file.

        Words found in the AFINN dictionary are printed with their AFINN
        score. Any other word is assigned the average per-word sentiment of
        the tweet it occurs in (tweet score / number of words in the tweet);
        that value is folded into a running mean over all occurrences of the
        word seen so far in the twitter text file provided, and the updated
        mean is printed.

        Could return dictionary to keep running affect dictionary between
        sessions.

    Args:
        sentiments_file_name (string): Filename of a text file containing
         Space delimited AFINN dictionary of word:sentiment-value pairs

        tweet_file_name (string): Filename of a text file containing a string
         representation of a json object from the Twitter API on each line.

    Returns:
        (None) prints output to stdout
    '''
    # NOTE(review): assumes build_dict returns a plain dict-like mapping of
    # word -> score that supports the `in` operator -- confirm.
    AFINN_dict = build_dict(sentiments_file_name)

    # Words not in AFINN are tracked separately, since the AFINN words are
    # already scaled. Format:
    #  key: word
    #  value: [times seen, mean score over those occurrences]
    new_word_AFINN = {}

    with open(tweet_file_name, 'r') as tweets:
        for tweet in tweets:
            # Extract the text of tweet from json object/string
            tweet = extract_txt_from_json_string(tweet)
            tweet = strip_punct(tweet)
            tweet_words = [word.lower() for word in tweet.split()]

            total_tweet_sentiment = score_tweet(tweet, AFINN_dict)

            if not tweet_words:
                # Nothing to report, and it avoids dividing by zero below.
                continue

            # Same value for every unknown word of this tweet, so compute it
            # once per tweet instead of once per word.
            word_sent = total_tweet_sentiment / float(len(tweet_words))

            for word in tweet_words:
                # An explicit membership test replaces the old
                # `assert AFINN_dict[word]`, which raised an uncaught
                # AssertionError for a known word with a score of 0.
                if word in AFINN_dict:
                    print('%s %s' % (word, AFINN_dict[word]))
                    continue

                stats = new_word_AFINN.setdefault(word, [0, 0.0])

                # update times seen
                stats[0] += 1

                # Incremental mean: mean_n = mean_(n-1) + (x - mean_(n-1)) / n.
                # The previous (mean + x) / n form is only correct for the
                # first two occurrences of a word.
                stats[1] += (word_sent - stats[1]) / stats[0]

                print('%s %s' % (word, stats[1]))
# Example no. 4
# 0
def report_UNK_avg_sentiment(sentiments_file_name, tweet_file_name):
    '''Print a sentiment score for every word of every tweet in a file.

        Words found in the AFINN dictionary are printed with their AFINN
        score. Any other word is assigned the average per-word sentiment of
        the tweet it occurs in (tweet score / number of words in the tweet);
        that value is folded into a running mean over all occurrences of the
        word seen so far in the twitter text file provided, and the updated
        mean is printed.

        Could return dictionary to keep running affect dictionary between
        sessions.

    Args:
        sentiments_file_name (string): Filename of a text file containing
         Space delimited AFINN dictionary of word:sentiment-value pairs

        tweet_file_name (string): Filename of a text file containing a string
         representation of a json object from the Twitter API on each line.

    Returns:
        (None) prints output to stdout
    '''
    # NOTE(review): assumes build_dict returns a plain dict-like mapping of
    # word -> score that supports the `in` operator -- confirm.
    AFINN_dict = build_dict(sentiments_file_name)

    # Words not in AFINN are tracked separately, since the AFINN words are
    # already scaled. Format:
    #  key: word
    #  value: [times seen, mean score over those occurrences]
    new_word_AFINN = {}

    with open(tweet_file_name, 'r') as tweets:
        for tweet in tweets:
            # Extract the text of tweet from json object/string
            tweet = extract_txt_from_json_string(tweet)
            tweet = strip_punct(tweet)
            tweet_words = [word.lower() for word in tweet.split()]

            total_tweet_sentiment = score_tweet(tweet, AFINN_dict)

            if not tweet_words:
                # Nothing to report, and it avoids dividing by zero below.
                continue

            # Same value for every unknown word of this tweet, so compute it
            # once per tweet instead of once per word.
            word_sent = total_tweet_sentiment / float(len(tweet_words))

            for word in tweet_words:
                # An explicit membership test replaces the old
                # `assert AFINN_dict[word]`, which raised an uncaught
                # AssertionError for a known word with a score of 0.
                if word in AFINN_dict:
                    print('%s %s' % (word, AFINN_dict[word]))
                    continue

                stats = new_word_AFINN.setdefault(word, [0, 0.0])

                # update times seen
                stats[0] += 1

                # Incremental mean: mean_n = mean_(n-1) + (x - mean_(n-1)) / n.
                # The previous (mean + x) / n form is only correct for the
                # first two occurrences of a word.
                stats[1] += (word_sent - stats[1]) / stats[0]

                print('%s %s' % (word, stats[1]))