def getEmoticons(tokenised_tweet, emo_dict):
    feature_emoticons = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}
    for word in tokenised_tweet:
        if isEmoticon(word, emo_dict):
            feature_emoticons[FEATURE_COUNT] += 1
            addToCountDict(feature_emoticons[FEATURE_VALUE], word, 1)
    return feature_emoticons

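# addToCountDict is used throughout this module but defined elsewhere. A
# minimal sketch of what its call sites imply (increment a counter keyed by
# `key`, creating it on first sight); treat this as an assumption, not the
# project's actual implementation:
def addToCountDict(count_dict, key, increment):
    # Missing keys start at 0, then the increment is added.
    count_dict[key] = count_dict.get(key, 0) + increment
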
def getUniqueFeaturesForClass(processed_tweets_list, categorywide_unique_features,
                              features_used=FEATURES_SA_DEFAULT):
    '''
    Params:
        processed_tweets_list: [] of tweets of class
        categorywide_unique_features: {} to keep track of unique features of a
            category e.g. apple, google, ...
        features_used: [] of features in use from CS4242_Assg2.constants
    Return:
        class_unique_features: {FEATURE_TYPE_...: {feature: value}}
    '''
    class_unique_features = {}
    for feature in features_used:
        # TODO: whitelist
        if feature == FEATURE_SA_REPLIES or feature == FEATURE_SA_TEMPORAL:
            continue
        class_unique_features[feature] = {}
    for processed_tweet in processed_tweets_list:
        for feature in features_used:
            # TODO: whitelist
            if feature == FEATURE_SA_REPLIES or feature == FEATURE_SA_TEMPORAL:
                continue
            val_dict = processed_tweet[TWEET_FEATURES][feature][FEATURE_VALUE]
            for key, val in val_dict.iteritems():
                addToCountDict(categorywide_unique_features[feature], key, 1)
                addToCountDict(class_unique_features[feature], key, 1)
    return class_unique_features

def getCapitalisedText(tokenised_tweet):
    feature_caps = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}
    for word in tokenised_tweet:
        # split_words = removePunctuationsAndNumbers(word)
        # for w in split_words:
        if isFullCaps(word):
            feature_caps[FEATURE_COUNT] += 1
            addToCountDict(feature_caps[FEATURE_VALUE], word, 1)
    return feature_caps

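# isFullCaps is not defined in this section. A minimal sketch, assuming it
# flags purely alphabetic, all-uppercase tokens (the real helper may also
# require a minimum length so that 'I' and 'A' are not flagged):
def isFullCaps(word):
    return word.isalpha() and word == word.upper()
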
def getHashTags_SA(json_data, tokenised_tweet_no_filter):
    feature_ht = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}
    for ht in json_data['entities']['hashtags']:
        feature_ht[FEATURE_COUNT] += 1
        addToCountDict(feature_ht[FEATURE_VALUE], "#HT_" + ht['text'], 1)
    for ht in getUncapturedHashtags(tokenised_tweet_no_filter):
        feature_ht[FEATURE_COUNT] += 1
        addToCountDict(feature_ht[FEATURE_VALUE], "#HT_" + ht, 1)
    return feature_ht

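# getUncapturedHashtags is defined elsewhere; it appears to pick up hashtags
# that Twitter's entity metadata missed. A rough sketch, assuming it simply
# scans the raw token list (the real helper may deduplicate against
# json_data['entities'] or handle edge cases differently);
# getUncapturedUserMentions in getUserMentions_SA presumably mirrors this
# with '@' instead of '#':
def getUncapturedHashtags(tokens):
    # Return hashtag bodies for tokens like '#wwdc' (skip a bare '#').
    return [t[1:] for t in tokens if t.startswith('#') and len(t) > 1]
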
def getPosFeatures(pos_tagged_tweet):
    '''
    {
        FEATURE_COUNT: number of POS (a, n, v, r) features in tweet
            (POS_OTHERS is skipped)
        FEATURE_VALUE: {pos_tag: number of pos_tag features in tweet}
    }
    '''
    feature_pos = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}
    for word, pos in pos_tagged_tweet.iteritems():
        if pos != POS_OTHERS:
            feature_pos[FEATURE_COUNT] += 1
            addToCountDict(feature_pos[FEATURE_VALUE], "POS_TAG_%s" % pos, 1)
    return feature_pos

def extractTextFeatures(json_data):
    tweet_words = {}
    # Remove unnecessary tokens
    tweet_text = filterIrrelevantTokens(json_data)
    unicode_normalized_tweet = unicodedata.normalize(
        'NFKD', tweet_text).encode('ascii', 'ignore').lower()
    # tweet_wordlist = ['word', 'word2'] etc.
    tweet_wordlist = normalizer.normalizeTweet(unicode_normalized_tweet)
    for word in tweet_wordlist:
        stripped_punct_num_word = removePunctuationsAndNumbers(word)
        for w in stripped_punct_num_word:
            if w not in stopwords.words('english'):
                w2 = stemmer.stem(w)
                addToCountDict(tweet_words, w2, 1)
    return tweet_words

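# removePunctuationsAndNumbers is referenced here and in getText/
# getCapitalisedText but defined elsewhere. A sketch under the assumption
# that it splits a token on punctuation/digit characters and returns the
# alphabetic pieces (e.g. "can't2013" -> ['can', 't']); the real helper may
# behave differently:
import re  # already used at module level (see getTwitterTokenCount)

def removePunctuationsAndNumbers(word):
    # Keep runs of letters only; punctuation and digits act as separators.
    return re.findall(r'[a-zA-Z]+', word)
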
def getUserMentions_SA(json_data, tokenised_tweet_no_filter):
    '''
    Get user ids of users mentioned in the tweet. Values are reduced to
    presence (1) rather than counts.
    '''
    feature_obj = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}
    for um in json_data['entities']['user_mentions']:
        feature_obj[FEATURE_COUNT] += 1
        addToCountDict(feature_obj[FEATURE_VALUE], "@UM_" + um['id_str'], 1)
    # Optionally include the tweeter's own id:
    # addToCountDict(feature_obj[FEATURE_VALUE], "@UM_" + str(json_data['user']['id']), 1)
    for um in getUncapturedUserMentions(tokenised_tweet_no_filter):
        feature_obj[FEATURE_COUNT] += 1
        addToCountDict(feature_obj[FEATURE_VALUE], "@UM_" + um, 1)
    # Use presence instead of frequency for user mentions.
    for um in feature_obj[FEATURE_VALUE]:
        feature_obj[FEATURE_VALUE][um] = 1
    return feature_obj

def getText(tokenised_tweet, negation_flags, use_negation=False):
    feature_text = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}
    for word in tokenised_tweet:
        negated = False
        if word in negation_flags and negation_flags[word]:
            negated = True
        if isEmoticon(word, emo_dict):
            continue
        is_slang = isSlang(word, sd)
        if is_slang:
            translated_slang = translateSlangWord(word, sd)
            tokenised_slang = tokenizer.tokenize(translated_slang)
            if len(tokenised_slang) == 1:
                # single-word translation: use it and treat it as normal text
                split_words = removePunctuationsAndNumbers(tokenised_slang[0])
                is_slang = False
            else:
                # multi-word slang: keep the original token,
                # don't remove punctuation/numbers
                split_words = [word]
        else:
            split_words = removePunctuationsAndNumbers(word)
        for w in split_words:
            w = w.lower()
            if w in stopwords.words('english'):
                # skip stopwords
                continue
            if not is_slang:
                # stem if it's not slang
                w = stemmer.stem(w)
            if isFullCaps(word):
                # preserve case for words in full caps
                w = w.upper()
            if use_negation and negated:
                w = "NOT_%s" % w
            feature_text[FEATURE_COUNT] += 1
            addToCountDict(feature_text[FEATURE_VALUE], w, 1)
    # TODO: should we use presence? results will drop a bit
    # for text in feature_text[FEATURE_VALUE]:
    #     feature_text[FEATURE_VALUE][text] = 1
    return feature_text

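# A small illustration of the NOT_ marking above (token list and flags are
# invented; actual output depends on the tokenizer, slang dictionary,
# stopword list and stemmer in use):
#
#   tokens = ['this', 'is', 'not', 'good']
#   flags = {'good': True}  # 'good' lies in a negator's scope
#   getText(tokens, flags, use_negation=True)
#   # -> FEATURE_VALUE contains {'NOT_good': 1}
#   # ('this', 'is', 'not' are dropped as stopwords)
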
def getPolarityPosCount(pos_tagged_tweet, tweet_word_polarity):
    '''
    {
        FEATURE_COUNT: number of text features of POS (a, n, v, r) with
            polarity in tweet
        FEATURE_VALUE: {polarity type: number of text features of POS
            (a, n, v, r) with that polarity in tweet}
    }
    '''
    feature_pol = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}
    for word, pos in pos_tagged_tweet.iteritems():
        # word_polarity = getPolarityFromSWN(word, pos, swn)  # empty string if word not found
        if pos == POS_OTHERS:
            continue
        word_polarity = getPolarityOfWord(word, tweet_word_polarity)
        if word_polarity == POLARITY_POSITIVE:
            feature_pol[FEATURE_COUNT] += 1
            addToCountDict(feature_pol[FEATURE_VALUE], "POLARITY_PoS_POS", 1)
        elif word_polarity == POLARITY_NEGATIVE:
            feature_pol[FEATURE_COUNT] += 1
            addToCountDict(feature_pol[FEATURE_VALUE], "POLARITY_PoS_NEG", 1)
        elif word_polarity == POLARITY_NEUTRAL:
            feature_pol[FEATURE_COUNT] += 1
            addToCountDict(feature_pol[FEATURE_VALUE], "POLARITY_PoS_NEU", 1)
    return feature_pol

def getEmoticonPolarityCount(tokenised_tweet, emo_dict):
    feature_emoticons = {
        FEATURE_COUNT: 0,
        FEATURE_VALUE: {
            'EMO_V_POS': 0,
            'EMO_POS': 0,
            'EMO_NEU': 0,
            'EMO_NEG': 0,
            'EMO_V_NEG': 0
        }
    }
    for word in tokenised_tweet:
        if word in emo_dict:
            polarity = getEmoticonPolarity(word, emo_dict)
            if polarity == POLARITY_VERY_POSITIVE:
                feature_emoticons[FEATURE_COUNT] += 1
                addToCountDict(feature_emoticons[FEATURE_VALUE], 'EMO_V_POS', 1)
            elif polarity == POLARITY_POSITIVE:
                feature_emoticons[FEATURE_COUNT] += 1
                addToCountDict(feature_emoticons[FEATURE_VALUE], 'EMO_POS', 1)
            elif polarity == POLARITY_NEUTRAL:
                feature_emoticons[FEATURE_COUNT] += 1
                addToCountDict(feature_emoticons[FEATURE_VALUE], 'EMO_NEU', 1)
            elif polarity == POLARITY_NEGATIVE:
                feature_emoticons[FEATURE_COUNT] += 1
                addToCountDict(feature_emoticons[FEATURE_VALUE], 'EMO_NEG', 1)
            elif polarity == POLARITY_VERY_NEGATIVE:
                feature_emoticons[FEATURE_COUNT] += 1
                addToCountDict(feature_emoticons[FEATURE_VALUE], 'EMO_V_NEG', 1)
            # addToCountDict(feature_emoticons[FEATURE_VALUE], word, 1)
    return feature_emoticons

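# getEmoticonPolarity and emo_dict are defined elsewhere. A plausible shape,
# assumed here for illustration only: emo_dict maps emoticon strings to
# polarity constants, and getEmoticonPolarity is a lookup.
#
#   emo_dict = {':D': POLARITY_VERY_POSITIVE, ':)': POLARITY_POSITIVE,
#               ':|': POLARITY_NEUTRAL, ':(': POLARITY_NEGATIVE,
#               ":'(": POLARITY_VERY_NEGATIVE}
#
#   def getEmoticonPolarity(word, emo_dict):
#       return emo_dict[word]
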
def getCapitalisedTextPolarityCount(cap_text, tweet_word_polarity):
    feature_pol = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}
    for word in cap_text[FEATURE_VALUE]:
        word_polarity = getPolarityOfWord(word, tweet_word_polarity)
        if word_polarity == POLARITY_POSITIVE:
            feature_pol[FEATURE_COUNT] += 1
            addToCountDict(feature_pol[FEATURE_VALUE], "POLARITY_CAPS_POS", 1)
        elif word_polarity == POLARITY_NEGATIVE:
            feature_pol[FEATURE_COUNT] += 1
            addToCountDict(feature_pol[FEATURE_VALUE], "POLARITY_CAPS_NEG", 1)
        elif word_polarity == POLARITY_NEUTRAL:
            feature_pol[FEATURE_COUNT] += 1
            addToCountDict(feature_pol[FEATURE_VALUE], "POLARITY_CAPS_NEU", 1)
    return feature_pol

def getTwitterTokenCount(json_data, tokenised_tweet_no_filter):
    feature_obj = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}
    for twitter_token, entity in json_data['entities'].iteritems():
        for value in entity:
            feature_obj[FEATURE_COUNT] += 1
            addToCountDict(feature_obj[FEATURE_VALUE],
                           "TWITTER_TOKEN_" + twitter_token, 1)
    for token in tokenised_tweet_no_filter:
        if re.match(r"^#", token):
            feature_obj[FEATURE_COUNT] += 1
            addToCountDict(feature_obj[FEATURE_VALUE],
                           'TWITTER_TOKEN_hashtags', 1)
        if re.match(r"^@", token):
            feature_obj[FEATURE_COUNT] += 1
            addToCountDict(feature_obj[FEATURE_VALUE],
                           'TWITTER_TOKEN_user_mentions', 1)
        if isValidUrl(token):
            feature_obj[FEATURE_COUNT] += 1
            addToCountDict(feature_obj[FEATURE_VALUE],
                           'TWITTER_TOKEN_urls', 1)
    return feature_obj

def getPolarityTextCount(pos_tagged_tweet, tweet_word_polarity):
    '''
    {
        FEATURE_COUNT: number of text features with polarity
        FEATURE_VALUE: {polarity type: number of text features with that polarity}
    }
    '''
    feature_pol = {FEATURE_COUNT: 0, FEATURE_VALUE: {}}
    for word, pos in pos_tagged_tweet.iteritems():
        word_polarity = getPolarityOfWord(word, tweet_word_polarity)
        if word_polarity == POLARITY_POSITIVE:
            feature_pol[FEATURE_COUNT] += 1
            addToCountDict(feature_pol[FEATURE_VALUE], "POLARITY_TEXT_POS", 1)
        elif word_polarity == POLARITY_NEGATIVE:
            feature_pol[FEATURE_COUNT] += 1
            addToCountDict(feature_pol[FEATURE_VALUE], "POLARITY_TEXT_NEG", 1)
        elif word_polarity == POLARITY_NEUTRAL:
            feature_pol[FEATURE_COUNT] += 1
            addToCountDict(feature_pol[FEATURE_VALUE], "POLARITY_TEXT_NEU", 1)
    return feature_pol

def calculateChi2ValuesSA(unique_feat, feature_set, sample_size_set):
    '''
    Calculate chi2 values of all unique features using the lecture's formula.
    Params:
        unique_feat: [feat] of unique features (currently unused)
        feature_set: {senti_type: features} of features from each sentiment
        sample_size_set: {senti_type: sample_tweet_count} of each sentiment
            (currently unused; sample sizes are derived from feature counts)
    Returns:
        chi2_val_dict: {senti_type: {feat: chi2_val}} of calculated chi2 values
    '''
    sentiment_feature_set = {}
    for target_sentiment in feature_set.iterkeys():
        target_senti_feat_dict = feature_set[target_sentiment]
        # merge the other sentiments' feature dictionaries
        other_senti_feat_dict = {}
        for sentiment in feature_set.iterkeys():
            if sentiment != target_sentiment:
                for item, value in feature_set[sentiment].iteritems():
                    addToCountDict(other_senti_feat_dict, item, value)
        target_sent_feat_set = {
            'target_feat_dict': target_senti_feat_dict,
            'target_sample_size': sum(target_senti_feat_dict.itervalues()),
            'other_feat_dict': other_senti_feat_dict,
            'other_sample_size': sum(other_senti_feat_dict.itervalues())
        }
        sentiment_feature_set[target_sentiment] = target_sent_feat_set

    chi2_val_dict = {}
    for target_sentiment in feature_set.iterkeys():
        target_sent_feat_set = sentiment_feature_set[target_sentiment]
        target_senti_feat_dict = target_sent_feat_set['target_feat_dict']
        target_senti_sample_size = target_sent_feat_set['target_sample_size']
        other_senti_feat_dict = target_sent_feat_set['other_feat_dict']
        other_senti_sample_size = target_sent_feat_set['other_sample_size']
        chi2_word_val = {}
        for f in target_senti_feat_dict.iterkeys():
            # A: num of tweets in target cat that contain the feature
            # B: num of tweets not in target cat that contain the feature
            # C: num of tweets in target cat that don't contain the feature
            # D: num of tweets not in target cat that don't contain the feature
            A = target_senti_feat_dict[f]
            B = other_senti_feat_dict.get(f, 0)
            C = target_senti_sample_size - A
            D = other_senti_sample_size - B
            chi2_word_val[f] = calculateChi2(A, B, C, D)
        chi2_val_dict[target_sentiment] = chi2_word_val
    return chi2_val_dict

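# calculateChi2 is not defined in this section. A minimal sketch of the
# standard 2x2 contingency-table chi-square statistic it presumably computes
# (this is the textbook formula, not necessarily the exact "lecture's
# formula" the docstring refers to):
def calculateChi2(A, B, C, D):
    # chi2 = N * (A*D - B*C)^2 / ((A+B) * (C+D) * (A+C) * (B+D))
    N = A + B + C + D
    denominator = (A + B) * (C + D) * (A + C) * (B + D)
    if denominator == 0:
        return 0.0
    return float(N) * (A * D - B * C) ** 2 / denominator
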
def getKeyInfoForClassifier(filename, categories_list, groundtruth_list,
                            features=FEATURES_DEFAULT):
    '''
    Extracts all features from input files.
    Params:
        filename: input DATA file
        categories_list: list of categories, e.g. ['Apple', 'Google', 'Twitter']
        groundtruth_list: list of groundtruths,
            e.g. [{CATEGORY: category, POLARITY: polarity, TWEET_ID: tweetid}]
        features (optional): [FEATURE_TEXT, FEATURE_HASHTAG, FEATURE_GEOINFO,
            FEATURE_FOLLOWED_CATEGORIES, FEATURE_USER, FEATURE_USER_MENTIONS]
    Returns:
        {
            'category': {
                POSITIVE: {
                    PROCESSED_TWEETS: [{
                        TWEET_FULL: This was a triumph,
                        TWEET_FEATURES: {FEATURE_TEXT: {}, FEATURE_GEOLOCATION: str}
                    }],
                    FEATURES: {FEATURE_TEXT: {}, ...}
                },
                NEGATIVE: {
                    PROCESSED_TWEETS: [{
                        TWEET_FULL: This was a triumph,
                        TWEET_FEATURES: {FEATURE_TEXT: {}, FEATURE_GEOLOCATION: str}
                    }],
                    FEATURES: {FEATURE_TEXT: {}, ..., FEATURE_HASHTAG: []}
                },
                UNIQUE_FEATURES: {FEATURE_TEXT: {}, ...}
            }
        }
    '''
    returnmap = {}
    for category in categories_list:
        # category-wide variables
        processed_tweets_list = []
        unique_features_map = {}
        positive_processed_tweet_list = []
        negative_processed_tweet_list = []
        positive_features_map = {}
        negative_features_map = {}

        # Initialize unique feature maps within category
        for feature in features:
            if feature == FEATURE_TEXT:
                feature_text_unique = {}
                pos_feature_text_unique = {}
                neg_feature_text_unique = {}
            elif feature == FEATURE_HASHTAG:
                feature_hashtag_unique = {}
                pos_feature_hashtag_unique = {}
                neg_feature_hashtag_unique = {}
            elif feature == FEATURE_GEOINFO:
                feature_geoinfo_unique = {}
                pos_feature_geoinfo_unique = {}
                neg_feature_geoinfo_unique = {}
            elif feature == FEATURE_FOLLOWED_CATEGORIES:
                feature_followed_cat_unique = {}
                pos_feature_followed_cat_unique = {}
                neg_feature_followed_cat_unique = {}
            elif feature == FEATURE_USER:
                feature_user_unique = {}
                pos_feature_user_unique = {}
                neg_feature_user_unique = {}
            elif feature == FEATURE_USER_MENTIONS:
                feature_usermentions_unique = {}
                pos_feature_usermentions_unique = {}
                neg_feature_usermentions_unique = {}
            elif feature == FEATURE_CATEGORY:
                feature_category_unique = {}
                pos_feature_category_unique = {}
                neg_feature_category_unique = {}

        # Extract & process features
        with codecs.open(filename, encoding='cp1252') as k:
            for index, line in enumerate(k):
                json_data = json.loads(line, encoding='cp1252')
                tweet_keyinfo = extractFeaturesFromTweet(
                    json_data, categories_list, features, category)
                if groundtruth_list[index][CATEGORY] == category:
                    positive_processed_tweet_list.append(tweet_keyinfo)
                else:
                    negative_processed_tweet_list.append(tweet_keyinfo)
                processed_tweets_list.append(tweet_keyinfo)

        # Count unique features across positive, negative and all tweets
        for tweet_keyinfo in positive_processed_tweet_list:
            for feature in features:
                if feature == FEATURE_TEXT:
                    text_count_dict = tweet_keyinfo[TWEET_FEATURES][FEATURE_TEXT]
                    for key, count in text_count_dict.iteritems():
                        addToCountDict(pos_feature_text_unique, key, 1)
                elif feature == FEATURE_HASHTAG:
                    for hashtag in tweet_keyinfo[TWEET_FEATURES][FEATURE_HASHTAG]:
                        addToCountDict(pos_feature_hashtag_unique, hashtag, 1)
                elif feature == FEATURE_GEOINFO:
                    if tweet_keyinfo[TWEET_FEATURES][FEATURE_GEOINFO] != '':
                        addToCountDict(
                            pos_feature_geoinfo_unique,
                            tweet_keyinfo[TWEET_FEATURES][FEATURE_GEOINFO], 1)
                elif feature == FEATURE_FOLLOWED_CATEGORIES:
                    # TODO: Consider implementing
                    pass
                elif feature == FEATURE_USER:
                    addToCountDict(pos_feature_user_unique,
                                   tweet_keyinfo[TWEET_FEATURES][FEATURE_USER], 1)
                elif feature == FEATURE_USER_MENTIONS:
                    for usermention in tweet_keyinfo[TWEET_FEATURES][FEATURE_USER_MENTIONS]:
                        addToCountDict(pos_feature_usermentions_unique,
                                       usermention, 1)
                elif feature == FEATURE_CATEGORY:
                    for item in tweet_keyinfo[TWEET_FEATURES][FEATURE_CATEGORY]:
                        addToCountDict(pos_feature_category_unique, item, 1)

        # debug files will be written only if settings.DEBUG_CODE = True
        # writeDebugCountDictToFile("%s_pos_feature_text_unique.txt" % category, pos_feature_text_unique)
        # writeDebugCountDictToFile("%s_pos_feature_hashtag_unique.txt" % category, pos_feature_hashtag_unique)
        # writeDebugCountDictToFile("%s_pos_feature_geoinfo_unique.txt" % category, pos_feature_geoinfo_unique)
        # writeDebugCountDictToFile("%s_pos_feature_user_unique.txt" % category, pos_feature_user_unique)
        # writeDebugCountDictToFile("%s_pos_feature_usermentions_unique.txt" % category, pos_feature_usermentions_unique)

        for tweet_keyinfo in negative_processed_tweet_list:
            for feature in features:
                if feature == FEATURE_TEXT:
                    text_count_dict = tweet_keyinfo[TWEET_FEATURES][FEATURE_TEXT]
                    for key, count in text_count_dict.iteritems():
                        addToCountDict(neg_feature_text_unique, key, 1)
                elif feature == FEATURE_HASHTAG:
                    for hashtag in tweet_keyinfo[TWEET_FEATURES][FEATURE_HASHTAG]:
                        addToCountDict(neg_feature_hashtag_unique, hashtag, 1)
                elif feature == FEATURE_GEOINFO:
                    # guard against empty geoinfo, as in the positive loop
                    if tweet_keyinfo[TWEET_FEATURES][FEATURE_GEOINFO] != '':
                        addToCountDict(
                            neg_feature_geoinfo_unique,
                            tweet_keyinfo[TWEET_FEATURES][FEATURE_GEOINFO], 1)
                elif feature == FEATURE_FOLLOWED_CATEGORIES:
                    # TODO: Consider implementing
                    pass
                elif feature == FEATURE_USER:
                    addToCountDict(neg_feature_user_unique,
                                   tweet_keyinfo[TWEET_FEATURES][FEATURE_USER], 1)
                elif feature == FEATURE_USER_MENTIONS:
                    for usermention in tweet_keyinfo[TWEET_FEATURES][FEATURE_USER_MENTIONS]:
                        addToCountDict(neg_feature_usermentions_unique,
                                       usermention, 1)
                elif feature == FEATURE_CATEGORY:
                    for item in tweet_keyinfo[TWEET_FEATURES][FEATURE_CATEGORY]:
                        addToCountDict(neg_feature_category_unique, item, 1)

        for tweet_keyinfo in processed_tweets_list:
            for feature in features:
                if feature == FEATURE_TEXT:
                    text_count_dict = tweet_keyinfo[TWEET_FEATURES][FEATURE_TEXT]
                    for key, count in text_count_dict.iteritems():
                        addToCountDict(feature_text_unique, key, 1)
                elif feature == FEATURE_HASHTAG:
                    for hashtag in tweet_keyinfo[TWEET_FEATURES][FEATURE_HASHTAG]:
                        addToCountDict(feature_hashtag_unique, hashtag, 1)
                elif feature == FEATURE_GEOINFO:
                    # guard against empty geoinfo, as in the positive loop
                    if tweet_keyinfo[TWEET_FEATURES][FEATURE_GEOINFO] != '':
                        addToCountDict(
                            feature_geoinfo_unique,
                            tweet_keyinfo[TWEET_FEATURES][FEATURE_GEOINFO], 1)
                elif feature == FEATURE_FOLLOWED_CATEGORIES:
                    # TODO: Consider implementing
                    pass
                elif feature == FEATURE_USER:
                    addToCountDict(feature_user_unique,
                                   tweet_keyinfo[TWEET_FEATURES][FEATURE_USER], 1)
                elif feature == FEATURE_USER_MENTIONS:
                    for usermention in tweet_keyinfo[TWEET_FEATURES][FEATURE_USER_MENTIONS]:
                        addToCountDict(feature_usermentions_unique,
                                       usermention, 1)
                elif feature == FEATURE_CATEGORY:
                    for item in tweet_keyinfo[TWEET_FEATURES][FEATURE_CATEGORY]:
                        addToCountDict(feature_category_unique, item, 1)

        for feature in features:
            if feature == FEATURE_TEXT:
                unique_features_map[FEATURE_TEXT] = feature_text_unique
                positive_features_map[FEATURE_TEXT] = pos_feature_text_unique
                negative_features_map[FEATURE_TEXT] = neg_feature_text_unique
            elif feature == FEATURE_HASHTAG:
                unique_features_map[FEATURE_HASHTAG] = feature_hashtag_unique
                positive_features_map[FEATURE_HASHTAG] = pos_feature_hashtag_unique
                negative_features_map[FEATURE_HASHTAG] = neg_feature_hashtag_unique
            elif feature == FEATURE_GEOINFO:
                unique_features_map[FEATURE_GEOINFO] = feature_geoinfo_unique
                positive_features_map[FEATURE_GEOINFO] = pos_feature_geoinfo_unique
                negative_features_map[FEATURE_GEOINFO] = neg_feature_geoinfo_unique
            elif feature == FEATURE_FOLLOWED_CATEGORIES:
                unique_features_map[FEATURE_FOLLOWED_CATEGORIES] = feature_followed_cat_unique
                positive_features_map[FEATURE_FOLLOWED_CATEGORIES] = pos_feature_followed_cat_unique
                negative_features_map[FEATURE_FOLLOWED_CATEGORIES] = neg_feature_followed_cat_unique
            elif feature == FEATURE_USER:
                unique_features_map[FEATURE_USER] = feature_user_unique
                positive_features_map[FEATURE_USER] = pos_feature_user_unique
                negative_features_map[FEATURE_USER] = neg_feature_user_unique
            elif feature == FEATURE_USER_MENTIONS:
                unique_features_map[FEATURE_USER_MENTIONS] = feature_usermentions_unique
                positive_features_map[FEATURE_USER_MENTIONS] = pos_feature_usermentions_unique
                negative_features_map[FEATURE_USER_MENTIONS] = neg_feature_usermentions_unique
            elif feature == FEATURE_CATEGORY:
                unique_features_map[FEATURE_CATEGORY] = feature_category_unique
                positive_features_map[FEATURE_CATEGORY] = pos_feature_category_unique
                negative_features_map[FEATURE_CATEGORY] = neg_feature_category_unique

        returnmap[category] = {}
        returnmap[category][POSITIVE] = {
            PROCESSED_TWEETS: positive_processed_tweet_list,
            FEATURES: positive_features_map
        }
        returnmap[category][NEGATIVE] = {
            PROCESSED_TWEETS: negative_processed_tweet_list,
            FEATURES: negative_features_map
        }
        returnmap[category][UNIQUE_FEATURES] = unique_features_map
    return returnmap

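# A hypothetical invocation of getKeyInfoForClassifier (the file names and
# the loadGroundtruth helper are invented for illustration; the constants
# come from the assignment's constants module):
#
#   categories = ['Apple', 'Google', 'Twitter']
#   groundtruth = loadGroundtruth('groundtruth.txt')  # one entry per tweet line
#   keyinfo = getKeyInfoForClassifier('tweets.json', categories, groundtruth)
#   # unique text features seen in tweets labelled 'Apple':
#   apple_pos_text = keyinfo['Apple'][POSITIVE][FEATURES][FEATURE_TEXT]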