def extract_more_decep_tech_features(self, tweets, vocab_file):
    """Build a sparse feature matrix: five surface features plus unigrams.

    Args:
        tweets: Re-iterable collection of raw tweet strings. It is traversed
            twice: once by the vectorizer and once for the surface features.
        vocab_file: Path to a text file holding one vocabulary term per
            line; a term's zero-based line number becomes its column index
            in the unigram part of the matrix.

    Returns:
        The sparse matrix produced by ``hstack`` with one row per tweet.
        Columns 0-4 are ``[n_elong, n_caps, n_punct_runs, n_hashtags,
        emoticon_mood]``; the remaining columns are the binary unigram
        indicators for the terms of ``vocab_file``.
    """
    # Term -> column index, taken from the term's line number in the file.
    # The ``with`` block closes the handle, which a bare open() never did.
    with open(vocab_file) as vocab_handle:
        train_vocab = dict(
            (line.strip(), index) for index, line in enumerate(vocab_handle)
        )

    # The keyword must be ``vocabulary``; any other spelling makes the
    # constructor raise TypeError. Passing a fixed vocabulary pins the
    # unigram columns to the indices loaded above.
    cv = CountVectorizer(
        ngram_range=(1, 1), binary=True, vocabulary=train_vocab
    )
    train_features_bow = cv.fit_transform(tweets)

    # Raw strings keep the backslashes for the regex engine instead of
    # relying on Python leaving unknown string escapes untouched.
    hash_pattern = re.compile(r"\#+[\w_]+[\w'_\-]*[\w_]+")
    # A letter followed by at least two more copies of itself ("sooo").
    elong_pattern = re.compile(r"([a-zA-Z])\1{2,}")
    # An uppercase letter followed by one or more uppercase letters/digits.
    caps_pattern = re.compile(r"[A-Z][A-Z\d]+")
    # NOTE: matches every run of . , ! ? -- runs of length one included.
    punc_pattern = re.compile(r"([.,!?]+)")

    # Numeric codes for the labels compared against below. A label that is
    # not in this table is passed through unchanged.
    mood_codes = {"NA": 0, "SAD": 1, "HAPPY": 2, "BOTH_HS": 4}

    add_decep_tech_matrix = []
    for tweet in tweets:
        emoticon_mood = emoticons.analyze_tweet(tweet.strip())
        add_decep_tech_matrix.append([
            len(elong_pattern.findall(tweet)),
            len(caps_pattern.findall(tweet)),
            len(punc_pattern.findall(tweet)),
            len(hash_pattern.findall(tweet)),
            mood_codes.get(emoticon_mood, emoticon_mood),
        ])

    # Surface features go first, then the bag-of-words columns.
    sa = sparse.csr_matrix(add_decep_tech_matrix)
    return hstack([sa, train_features_bow])
def extractEmoticons(tweets):
    """One-hot encode the emoticon label of every tweet.

    Args:
        tweets: Iterable of tweets; each one is passed to ``analyze_tweet``.

    Returns:
        A ``(vects, vocab)`` tuple. ``vects`` is a list with one length-4
        NumPy vector per tweet and ``vocab`` names the four positions, so
        ``vocab[i]`` is the label that sets ``vect[i]`` to 1. A label that is
        not in ``vocab`` leaves the vector all zeros.
    """
    # Position i of every vector corresponds to vocab[i]. The slot lookup is
    # derived from this list so the labels and the vector layout cannot
    # disagree.
    vocab = ["NA", "HAPPY", "SAD", "BOTH_HS"]
    slot_of = dict((label, slot) for slot, label in enumerate(vocab))

    vects = []
    for tweet in tweets:
        vect = np.zeros(len(vocab))
        slot = slot_of.get(analyze_tweet(tweet))
        if slot is not None:
            vect[slot] = 1
        vects.append(vect)
    return vects, vocab
def tweet_features(tweet, bigrams=True):
    """Yield the feature strings extracted from one tweet.

    Features, in the order they are produced:
        - singletons: every token returned by ``transform`` that
          ``ONLY_PUNCTUATION_RE`` does not match
        - bigrams (only when ``bigrams`` is true): adjacent token pairs in
          which neither token is matched by ``ONLY_PUNCTUATION_RE``,
          formatted as ``<2>a,b</2>`` with the two tokens in sorted order so
          both orderings of a pair give the same feature
        - emoticons: ``<e>label</e>``
        - repeated punctuation: ``<rp>!</rp>``
        - dialog: ``<d>!</d>``
        - all caps: ``<ac>!</ac>``

    No SentiWordNet or slang features are produced here, and hashtags get no
    dedicated marker (presumably they arrive as ordinary tokens from
    ``transform`` -- verify there).

    Args:
        tweet: Raw tweet text.
        bigrams: When false, the bigram features are skipped.

    Yields:
        Feature strings as described above.
    """
    rawtext = tweet
    tokens = transform(rawtext)

    # singletons
    for tok in tokens:
        if not ONLY_PUNCTUATION_RE.match(tok):
            yield tok

    # bigrams
    if bigrams:
        # Built-in zip is available on Python 2 and 3; itertools.izip exists
        # only on Python 2. zip stops at the shorter input, so pairing
        # tokens with tokens[1:] walks the adjacent pairs.
        for tok1, tok2 in zip(tokens, tokens[1:]):
            if ONLY_PUNCTUATION_RE.match(tok1):
                continue
            if ONLY_PUNCTUATION_RE.match(tok2):
                continue
            first, second = sorted((tok1, tok2))
            yield "<2>{},{}</2>".format(first, second)

    # emoticons
    moods = emoticons.analyze_tweet(rawtext)
    # A single string label must become one feature; iterating the string
    # directly would emit one feature per character. Any other iterable of
    # labels is consumed as-is.
    if isinstance(moods, (str, type(u""))):
        moods = [moods]
    for emoticon in moods:
        yield "<e>{}</e>".format(emoticon)

    # repeated punctuation
    if REPEATED_PUNCTUATION_RE.search(rawtext):
        yield "<rp>!</rp>"

    # dialog
    if DIALOG_RE.search(rawtext):
        yield "<d>!</d>"

    # all caps
    if ALL_CAPS_RE.search(rawtext):
        yield "<ac>!</ac>"