def features_for_tweet(self, tweet_repr, sid):
    """
    Model::features_for_tweet()

    Purpose: Generate features for a single tweet (phrase-level: the
             features describe the token span [begin, end] of the tweet).

    @param tweet_repr. A 3-tuple: (begin index, end index, list of tokens).
                       Tokens are utf-8 byte strings; they are decoded to
                       Python 2 `unicode` objects below.
    @param sid.        Tweet ID (not referenced in this method).
    @return A hash table of features
    """
    # data
    begin    = tweet_repr[0]
    end      = tweet_repr[1]
    # Decode raw utf-8 byte tokens into unicode (Python 2).
    sentence = [ unicode(t.decode('utf-8')) for t in tweet_repr[2] ]
    # The span of interest, inclusive of the token at index `end`.
    phrase   = sentence[begin:end+1]

    # Feature Dictionary
    features = {}

    # Normalize all text (tokenizer, stem, etc)
    corrected  = sentence
    normalized = utilities.normalize_phrase_TaskA(corrected,ark_tweet=self.ark_tweet)
    # NOTE(review): flat_normed is computed but never used below.
    flat_normed = [ w for words in normalized for w in words ]

    # Feature: unedited term unigrams (skip empties and stop words)
    for tok in phrase:
        if tok == '': continue
        if tok in utilities.stop_words: continue
        features[('unedited-uni-tok',tok)] = 1

    # Term unigrams
    # `normalized` is indexed like `sentence`: one list of normalized words
    # per original token (this is how normalized[begin:end+1] is used here).
    for tok in normalized[begin:end+1]:
        for word in tok:
            if word == '': continue
            if word[0] == '#': continue   # hashtags handled separately below
            # Strip the '_neg' negation suffix for stop-word / lookup checks.
            base = word if (word[-4:]!='_neg') else word[:-4]
            if base in utilities.stop_words: continue
            # Expand entries found in the `common` abbreviation map into
            # their multi-word replacements.
            if base.lower() in common:
                toks = common[base.lower()].split()
            else:
                toks = [base]
            for t in toks:
                w = st.stem(t)
                # Re-attach the negation marker after stemming.
                if word[-4:] == '_neg':
                    w += '_neg'
                # Elongated words (repeated letters) get extra weight.
                weight = 1
                if utilities.is_elongated_word(base):
                    weight += 1
                features[('stemmed_term_unigram', w)] = weight

    # Unigram context window: up to `window` tokens on each side of the span.
    window = 3
    prefix_start = max(begin-window, 0)
    context = sentence[prefix_start:end+1+window]

    # Build the normalized context: span words plus the nearest non-empty
    # normalized words before/after (extension stops at the first '').
    norm_context = [ w for t in normalized[begin:end+1] for w in t ]
    prefix_terms = []
    for w in reversed([w for t in normalized[prefix_start:begin]for w in t]):
        if w == '': break
        prefix_terms.append(w)
    norm_context = list(reversed(prefix_terms)) + norm_context
    suffix_terms = []
    for w in [ w for t in normalized[end+1:end+1+window] for w in t ]:
        if w == '': break
        suffix_terms.append(w)
    norm_context = norm_context + suffix_terms

    # Feature: Unigram context
    # Leading
    for word in norm_context:
        if word == '': continue
        w = word if (word[-4:]!='_neg') else word[:-4]
        if w in utilities.stop_words: continue
        # Spell-correct, then stem; re-attach '_neg' afterwards.
        w = st.stem(self.speller.correct_spelling([w])[0])
        if word[-4:] == '_neg':
            w += '_neg'
        features[('leading_unigram', w)] = 1

    '''
    print sentence
    print phrase
    for k,v in features.items():
        print '\t', k, '\t', v
    return features
    '''

    # Feature: Lexicon Features
    if enabled_modules['lexicons']:
        #print '\n\n\n'
        #print 'LEX FEATS: ', sentence
        #print begin, end

        # Phrase in question
        lex_feats     = lexicon_features(sentence,begin,end+1,ark_tweet=self.ark_tweet)
        # NOTE(review): context_feats is computed but never merged into
        # `features` -- confirm whether this call is still wanted.
        context_feats = lexicon_features(sentence,prefix_start,end+1+window,ark_tweet=self.ark_tweet)
        features.update(lex_feats)

        '''
        # Leading context
        prev_lex_feats = lexicon_features(sentence,prefix_start,begin, ark_tweet=self.ark_tweet)
        prev_lex_feats = {('prev-'+k[0],k[1]):v for k,v in prev_lex_feats.items()}
        features.update(prev_lex_feats)

        # Trailing context
        next_lex_feats = lexicon_features(sentence,end+1,end+1+window, ark_tweet=self.ark_tweet)
        next_lex_feats = {('next-'+k[0],k[1]):v for k,v in next_lex_feats.items()}
        features.update(next_lex_feats)
        '''

        #print phrase
        #for k,v in lex_feats.items():
        #    print '\t', k, '\t', v
        #print
        #print lex_feats
        #print prev_lex_feats
        #print next_lex_feats

    # Feature: Split hashtag
    # Decompose each hashtag in the context window into words and add the
    # stemmed pieces as term unigrams.
    if enabled_modules['hashtag']:
        hashtags = [ w for w in context if len(w) and (w[0]=='#') ]
        for ht in hashtags:
            toks = hashtag.split_hashtag(ht)
            for tok in utilities.normalize_phrase_TaskB(toks):
                w = tok if tok[-4:]!='_neg' else tok[:-4]
                stemmed = st.stem(w)
                if tok[-4:] == '_neg':
                    stemmed += '_neg'
                if len(w) < 2: continue
                if w in utilities.stop_words: continue
                features[('stemmed_term_unigram',stemmed)] = 1

    #print
    #print sentence
    #print begin
    #print end
    #print phrase

    # Feature: Prefixes and Suffixes (character n-grams of length 2..4)
    n = [2,3,4]
    for i,words in enumerate(normalized[begin:end+1]):
        for word in words:
            if len(word) < 2: continue
            for j in n:
                # Strip the negation marker before slicing affixes.
                if word[-4:] == '_neg': word = word[:-4]
                prefix = word[:j ]
                suffix = word[-j:]
                #print '\tprefix: ', prefix
                #print '\tsuffix: ', suffix
                features[ ('prefix',prefix) ] = 1
                features[ ('suffix',suffix) ] = 1

    # Features: Special forms (URL / mention / hashtag anywhere in the span)
    if any([ utilities.is_url(w) for w in phrase]):
        features[ ('contains_url',None) ] = 1
    if any([ w and w[0]=='@' for w in phrase]):
        features[ ('contains_@' ,None) ] = 1
    if any([ w and w[0] == '#' for w in phrase]):
        features[ ('contains_#' ,None) ] = 1

    # Features: Misc position data
    features['first_unigram'] = sentence[begin]
    features[ 'last_unigram'] = sentence[ end]
    features['phrase_length'] = len(phrase) / 2.0
    features['is_first'] = (begin == 0)
    features['is_last']  = (end == len(sentence)-1)

    # Feature: Whether every word is a stop word
    if all([ tok in utilities.stop_words for tok in phrase]):
        #print phrase
        features[ ('all_stopwords',None) ] = 1

    # Feature: All Caps? (boolean)
    # NOTE(review): the pattern requires exactly one trailing non-lowercase
    # character; possibly '[^a-z]*$' was intended -- confirm.
    if re.search('^[^a-z]*[A-Z][A-Z][^a-z]$',''.join(phrase)):
        features[ ('all_caps',None) ] = 1

    # Feature: All Punctuation?
    if re.search('^[^a-zA-Z0-9]+$',''.join(phrase)):
        features[ ('all_punct',None) ] = 1

    # Feature: Emoticon Counts (counted over the normalized context window)
    elabels = defaultdict(lambda:0)
    for word in norm_context:
        elabel = emoticons.emoticon_type(word)
        if elabel:
            elabels[elabel] += 1
    for k,v in elabels.items():
        featname = k + '-emoticon'
        features[featname] = v

    # Feature: Punctuation counts (over the raw context window)
    punct = {'!':0, '?':0, '.':0}
    for c in ''.join(context):
        if c in punct:
            punct[c] += 1
    for k,v in punct.items():
        featname = k + '-count'
        features[featname] = v

    # Features: character streaks (longest run of each punctuation class)
    text = ''.join(phrase)
    # !-streak
    matches = re.findall('!+',text)
    if matches:
        features['!-streak']  = max([len(w) for w in matches])
    # ?-streak
    matches = re.findall('\\?+',text)
    if matches:
        features['?-streak']  = max([len(w) for w in matches])
    # ?!-streak
    matches = re.findall('[!\\?]+',text)
    if matches:
        features['?!-streak'] = max([len(w) for w in matches])

    # Feature: Contains elongated long word? (boolean)
    contains_elongated_word = False
    for word in phrase:
        if utilities.is_elongated_word(word):
            contains_elongated_word = True
    if contains_elongated_word:
        features[ ('contains_elongated_word',None) ] = 1

    # Feature: Contains long word? (boolean)
    # Length is measured after removing duplicate letters and surrounding
    # punctuation; hashtags are skipped.
    long_word_threshold = 10
    contains_long_word = False
    for words in normalized[begin:end+1]:
        for word in words:
            if word[-4:]=='_neg': word = word[:-4]
            word = spell.remove_duplicates(word)
            if len(word) and word[0]=='#': continue
            word = word.strip(string.punctuation)
            if len(word) > long_word_threshold:
                contains_long_word = True
    if contains_long_word:
        features[ ('contains_long_word',None) ] = 1

    return features
def features_for_tweet(self, tweet, sid):
    """
    Model::features_for_tweet()

    Purpose: Generate features for a single tweet

    @param tweet. A string  (the text of a tweet)
    @param sid.   An int    (the ID of a tweet)
    @return A hash table of features

    Note: this method returns right after the unigram features (early
    `return features` below); the remaining feature sections are currently
    disabled/unreachable and are preserved for experimentation.
    """
    # Feature dictionary
    features = {}

    # POS list (None when the ark_tweet tagger module is disabled)
    if enabled_modules['ark_tweet']:
        pos = self.ark_tweet.posTags(tweet)
    else:
        pos = None

    # Tweet representation (list of tokens/strings)
    phrase = utilities.tokenize(tweet, self.ark_tweet)

    '''
    # Feature: Unedited Unigram Tokens
    for tok in phrase:
        if tok == '': continue
        if tf_idf.doc_freq(tok) < MIN_COUNT: continue
        if tok in tf_idf.stop_words:         continue
        features[('unedited-uni-tok',tok)] = 1
    '''

    # Edit misspellings
    unis = self.speller.correct_spelling(phrase, pos)

    # Flatten from multi-word tokens, keeping the POS tag of the original
    # token for every word it expands into.
    # FIX: guard against pos == None (ark_tweet disabled) -- the unguarded
    # zip(unis,pos) raised a TypeError. Mirrors the guarded version used by
    # the other TaskB variant of this method in this file.
    if pos:
        flattened = []
        flat_pos  = []
        for tok,tag in zip(unis,pos):
            for w in tok.split():
                flattened.append(w)
                flat_pos.append(tag)
    else:
        flattened = unis
        flat_pos  = None

    # Normalize sentence
    normalized = utilities.normalize_phrase_TaskB(flattened)

    # Feature: Processed Unigram Tokens
    uni_freqs = defaultdict(lambda:0)
    for i,word in enumerate(normalized):
        if word == '': continue
        # Strip the '_neg' negation suffix for frequency / stop-word checks.
        w = word if (word[-4:]!='_neg') else word[:-4]
        if tf_idf.doc_freq(w) < MIN_COUNT: continue
        if w in tf_idf.stop_words:         continue
        # Exclude proper nouns and prepositions
        if flat_pos:
            if flat_pos[i] == '^': continue
            if flat_pos[i] == 'Z': continue
            if flat_pos[i] == 'P': continue
            if flat_pos[i] == 'O': continue
            uni_freqs[word] += 1
        else:
            uni_freqs[word] += 1

    # Negated words contribute a negative score.
    feats = defaultdict(lambda:0)
    for key,tf in uni_freqs.items():
        word = key
        if word[-4:] == '_neg':
            word  = word[:-4]
            score = -1
        else:
            score = 1
        #feats[('uni_tok' , word) ] += score
        feats[('uni_stem_tok',st.stem(word))] += score
    features.update(feats)

    return features

    # ------------------------------------------------------------------
    # NOTE(review): everything below is unreachable because of the early
    # return above; preserved verbatim.
    # ------------------------------------------------------------------

    #'''
    # Feature: Split hashtag
    if enabled_modules['hashtag']:
        hashtags = [ w for w in normalized if len(w) and (w[0]=='#') ]
        for ht in hashtags:
            toks = hashtag.split_hashtag(ht)
            # Track hashtags that have no manual annotation yet.
            if (ht not in seen) and (ht not in hashtag.annotations):
                seen.add(ht)
                #print ht, '\t', toks
            for tok in utilities.normalize_phrase_TaskB(toks):
                if tok[-4:] == '_neg':
                    tok   = tok[:-4]
                    score = -1
                else:
                    score = 1
                if len(tok) > 2:
                    if tf_idf.doc_freq(tok) < MIN_COUNT: continue
                    if tok in tf_idf.stop_words:         continue
                    ###features[('uni_tok' , tok) ] = score
                    features[('uni_stem_tok',st.stem(tok))] = score
    #'''
    #return features

    # Feature: Lexicon Features
    if enabled_modules['lexicons']:
        feats = lexicon_features(normalized)
        features.update(feats)
    return features

    # Feature: Punctuation counts
    for c in '!?':
        val = tweet.count(c)
        if val > 0:
            features['%s-count' % c] = val

    # Features: Text lengths
    #features['phrase_length'] = len(tweet) / 140.0

    # Feature: Contains long word? (boolean)
    long_word_threshold = 8
    contains_long_word = False
    for word in phrase:
        if len(word) == 0: continue
        if word[0] == '@': continue
        if len(word) > long_word_threshold:
            contains_long_word = True
            break
    if contains_long_word:
        features['contains_long_word'] = 1

    # Feature: Emoticon Counts
    elabels = { 'positive':0, 'negative':0, 'neutral':0 }
    for word in phrase:
        elabel = emoticons.emoticon_type(word)
        if elabel:
            elabels[elabel] += 1
    for k,v in elabels.items():
        if v > 0:
            featname = k + '-emoticon'
            features[featname] = v

    # Features: contains twitter-specific features (hashtags & mentions)
    contains_hashtag = False
    contains_mention = False
    for tok in phrase:
        if tok == '': continue
        if tok[0] == '@': contains_mention = True
        if tok[0] == '#': contains_hashtag = True
    if contains_hashtag: features['contains_hashtag'] = 1
    if contains_mention: features['contains_mention'] = 1

    return features

    # Feature: Bigram Tokens
    flattened = []
    for tok in normalized:
        flattened += tok.split()
    for i in range(len(flattened)-1):
        bigram = tuple(flattened[i:i+2])

        # short circuits
        if any(w == '' for w in bigram): continue
        if any(tf_idf.doc_freq(w) < MIN_COUNT for w in bigram): continue
        if any(w in tf_idf.stop_words for w in bigram): continue

        # context
        t1,t2 = bigram
        if t1[-4:] == '_neg':
            t1    = t1[:-4]
            score = -1
        else:
            score = 1
        if t2[-4:] == '_neg':
            t2 = t2[:-4]
        sbigram = (st.stem(t1),st.stem(t2))
        features[( 'bigram_tok',(t1,t2))] = score
        features[('sbigram_tok',sbigram)] = score

    # Feature: Trigram Tokens
    for i in range(len(flattened)-2):
        trigram = tuple(flattened[i:i+3])
        if any(w == '' for w in trigram): continue
        # NOTE(review): this filter checks phrase[i] instead of the trigram
        # words -- looks like a bug, but this code is unreachable.
        if any(tf_idf.doc_freq(phrase[i]) < MIN_COUNT for w in range(3)): continue
        if phrase[i] in tf_idf.stop_words: continue
        t1,t2,t3 = trigram
        if t1[-4:] == '_neg':
            t1    = t1[:-4]
            score = -1
        else:
            score = 1
        if t2[-4:] == '_neg':
            t2 = t2[:-4]
        if t3[-4:] == '_neg':
            t3 = t3[:-4]
        features[('trigram_tok',trigram)] = 1
        #features[('strigram_tok',strigram)] = 1

    # Feature: ark_tweet features (cached based on unescaped text)
    if enabled_modules['ark_tweet']:
        ark_feats = self.ark_tweet.features(tweet)
        features.update(ark_feats)

    '''
    # Feature: twitter_data features
    if enabled_modules['twitter_data']:
        tdata_feats = self.twitter_data.features(sid)
        features.update(tdata_feats)

    # Feature: URL Features
    if enabled_modules['url']:
        urls = [ w for w in phrase if utilities.is_url(w) ]
        for url in urls:
            feats = self.url.features(url)
            features.update(feats)
    '''

    if enabled_modules['ukb_wsd'] and enabled_modules['ark_tweet']:
        # add ukb wsd features
        if self.ukb.cache.has_key( tweet ):
            wordSenses = self.ukb.cache.get_map( tweet )
        else:
            #print tweet
            wordSenses = self.ukb.ukb_wsd( phrase , self.ark_tweet.posTags( tweet ) )
            self.ukb.cache.add_map( tweet , wordSenses )
        for ws in wordSenses:
            for s in ws:
                if ('wsd',s[0]) in features.keys():
                    features[('wsd',s[0])] += s[1]
                else:
                    features[('wsd',s[0])] = s[1]

    #print '\n\n\n'
    #print tweet
    #print
    #print features

    return features
def features_for_tweet(self, tweet, sid):
    """
    Model::features_for_tweet()

    Purpose: Generate features for a single tweet

    @param tweet. A string  (the text of a tweet)
    @param sid.   An int    (the ID of a tweet)
    @return A hash table of features

    Note: the trigram and ark_tweet sections at the bottom are unreachable
    (there is a live `return features` after the bigram section).
    """
    # Feature dictionary
    features = {}

    # POS list (None when the ark_tweet tagger module is disabled)
    if enabled_modules['ark_tweet']:
        pos = self.ark_tweet.posTags(tweet)
    else:
        pos = None

    # Tweet representation (list of tokens/strings)
    phrase = utilities.tokenize(tweet, self.ark_tweet)

    #'''
    # Feature: Unedited Unigram Tokens
    for tok in phrase:
        if tok == '': continue
        if tf_idf.doc_freq(tok) < MIN_COUNT: continue
        if tok in tf_idf.stop_words: continue
        features[('unedited-uni-tok',tok)] = 1
    #'''

    # Edit misspellings
    unis = self.speller.correct_spelling(phrase, pos)

    # Flatten from multi-word tokens, keeping the POS tag of the original
    # token for every word it expands into (no tags when pos is None).
    if pos:
        flattened = []
        flat_pos = []
        for tok,tag in zip(unis,pos):
            for w in tok.split():
                flattened.append(w)
                flat_pos.append(tag)
    else:
        flattened = unis
        flat_pos = None

    # Normalize sentence
    normalized = utilities.normalize_phrase_TaskB(flattened)

    # Feature: Processed Unigram Tokens
    uni_freqs = defaultdict(lambda:0)
    for i,word in enumerate(normalized):
        if word == '': continue
        # Strip the '_neg' negation suffix for frequency / stop-word checks.
        w = word if (word[-4:]!='_neg') else word[:-4]
        if tf_idf.doc_freq(w) < MIN_COUNT: continue
        if w in tf_idf.stop_words: continue
        # Exclude proper nouns and prepositions
        if flat_pos:
            if flat_pos[i] == '^': continue
            if flat_pos[i] == 'Z': continue
            if flat_pos[i] == 'P': continue
            if flat_pos[i] == 'O': continue
            uni_freqs[word] += 1
        else:
            uni_freqs[word] += 1

    # Negated words contribute a negative score.
    feats = defaultdict(lambda:0)
    for key,tf in uni_freqs.items():
        word = key
        if word[-4:] == '_neg':
            word = word[:-4]
            score = -1
        else:
            score = 1
        feats[('uni_tok' , word) ] += score
        feats[('uni_stem_tok',st.stem(word))] += score
    features.update(feats)
    #return features

    #'''
    # Feature: Split hashtag
    if enabled_modules['hashtag']:
        hashtags = [ w for w in normalized if len(w) and (w[0]=='#') ]
        for ht in hashtags:
            toks = hashtag.split_hashtag(ht)
            # Track hashtags that have no manual annotation yet.
            if (ht not in seen) and (ht not in hashtag.annotations):
                seen.add(ht)
                #print ht, '\t', toks
            for tok in utilities.normalize_phrase_TaskB(toks):
                if tok[-4:] == '_neg':
                    tok = tok[:-4]
                    score = -1
                else:
                    score = 1
                if len(tok) > 2:
                    if tf_idf.doc_freq(tok) < MIN_COUNT: continue
                    if tok in tf_idf.stop_words: continue
                    features[('uni_tok' , tok) ] = score
                    features[('uni_stem_tok',st.stem(tok))] = score
    #'''
    #return features

    # Feature: Lexicon Features
    if enabled_modules['lexicons']:
        feats = lexicon_features(normalized)
        features.update(feats)
    #return features

    # Feature: Punctuation counts
    for c in '!?':
        val = tweet.count(c)
        if val > 0:
            features['%s-count' % c] = val

    # Features: Text lengths
    #features['phrase_length'] = len(tweet) / 140.0

    # Feature: Contains long word? (boolean)
    long_word_threshold = 8
    contains_long_word = False
    for word in phrase:
        if len(word) == 0: continue
        if word[0] == '@': continue   # skip user mentions
        if len(word) > long_word_threshold:
            contains_long_word = True
            break
    if contains_long_word:
        features['contains_long_word'] = 1

    # Feature: Emoticon Counts
    elabels = { 'positive':0, 'negative':0, 'neutral':0 }
    for word in phrase:
        elabel = emoticons.emoticon_type(word)
        if elabel:
            elabels[elabel] += 1
    for k,v in elabels.items():
        if v > 0:
            featname = k + '-emoticon'
            features[featname] = v

    # Features: contains twitter-specific features (hashtags & mentions)
    contains_hashtag = False
    contains_mention = False
    for tok in phrase:
        if tok == '': continue
        if tok[0] == '@': contains_mention = True
        if tok[0] == '#': contains_hashtag = True
    if contains_hashtag: features['contains_hashtag'] = 1
    if contains_mention: features['contains_mention'] = 1
    #return features

    # Feature: Bigram Tokens
    flattened = []
    for tok in normalized:
        flattened += tok.split()
    for i in range(len(flattened)-1):
        bigram = tuple(flattened[i:i+2])

        # short circuits
        if any(w == '' for w in bigram): continue
        if any(tf_idf.doc_freq(w) < MIN_COUNT for w in bigram): continue
        if any(w in tf_idf.stop_words for w in bigram): continue

        # context: negated first word flips the score to -1.
        t1,t2 = bigram
        if t1[-4:] == '_neg':
            t1 = t1[:-4]
            score = -1
        else:
            score = 1
        if t2[-4:] == '_neg':
            t2 = t2[:-4]
        sbigram = (st.stem(t1),st.stem(t2))
        features[( 'bigram_tok',(t1,t2))] = score
        features[('sbigram_tok',sbigram)] = score

    return features

    # ------------------------------------------------------------------
    # NOTE(review): the sections below are unreachable (return above).
    # ------------------------------------------------------------------

    # Feature: Trigram Tokens
    for i in range(len(flattened)-2):
        trigram = tuple(flattened[i:i+3])
        if any(w == '' for w in trigram): continue
        # NOTE(review): this filter checks phrase[i] instead of the trigram
        # words -- looks like a bug, but this code is unreachable.
        if any(tf_idf.doc_freq(phrase[i]) < MIN_COUNT for w in range(3)): continue
        if phrase[i] in tf_idf.stop_words: continue
        t1,t2,t3 = trigram
        if t1[-4:] == '_neg':
            t1 = t1[:-4]
            score = -1
        else:
            score = 1
        if t2[-4:] == '_neg':
            t2 = t2[:-4]
        if t3[-4:] == '_neg':
            t3 = t3[:-4]
        features[('trigram_tok',trigram)] = 1
        #features[('strigram_tok',strigram)] = 1

    return features

    # Feature: ark_tweet features (cached based on unescaped text)
    if enabled_modules['ark_tweet']:
        ark_feats = self.ark_tweet.features(tweet)
        features.update(ark_feats)

    return features