def elongated(microblog):
    """Elongated.

    Two binary features: [has_elongated, has_several_elongated], where an
    elongated word contains a character repeated 3+ times (e.g. "coool").
    Each elongated word found is normalized in-place (run collapsed to one
    or two chars, whichever form is more frequent in ``normal_word``) so the
    loop terminates.
    """
    # Pad with spaces so word-boundary patterns match at both ends.
    text = " " + microblog["raw_tweet"].strip() + " "
    elongated_count = 0
    while re.search(r" [\S]*(\w)\1{2,10}[\S]*[ .,?!\"]", text):
        elongated_count += 1
        comp = re.search(r" [\S]*(\w)\1{2,10}[\S]* ", text)
        word = comp.group().strip()
        ch = comp.groups()[0]
        run = ch + "{3,11}"
        # Candidate normalizations: collapse the run to one or two chars.
        one_char = re.sub(run, ch, word)
        two_chars = re.sub(run, ch * 2, word)
        # Pick whichever candidate is more frequent in the normal-word
        # counts. BUG FIX: use .get(k, 0) — direct indexing raised KeyError
        # on a plain dict (or silently grew a defaultdict) for unseen words.
        if normal_word.get(one_char, 0) >= normal_word.get(two_chars, 0):
            text = re.sub(run, ch, text)
        else:
            text = re.sub(run, ch * 2, text)
    # Original set has_elongated = 1 in both the ==1 and >1 branches;
    # the two flags reduce to these thresholds.
    has_elongated = 1 if elongated_count >= 1 else 0
    has_several_elongated = 1 if elongated_count >= 2 else 0
    return util.get_feature_by_list([has_elongated, has_several_elongated])
def wv_google(tweet):
    """Google-News Word2Vec embedding feature.

    Per the original note: returns a 900-dim vector — the 300-dim sum,
    300-dim max and 300-dim min of the token embeddings.
    """
    embedding_table = DictLoader().get("embed_Word2Vec")
    vector = dict_util.get_w2v(tweet, embedding_table)
    return util.get_feature_by_list(vector)
def punction(microblog):
    """Punctuation.

    Seven binary features: has '!', several '!', has '?', several '?',
    has a '!?'/'?!' combination, last token contains '!', last token
    contains '?'.
    """
    has_exclamation = 0
    has_several_exclamation = 0
    has_question = 0
    has_several_question = 0
    has_exclamation_question = 0
    end_exclamation = 0
    end_question = 0
    if microblog["tokens"]:
        tokens = []  # all tokens of this tweet, flattened
        for token_list in microblog["tokens"]:
            tokens.extend(token_list)
        sentence = " ".join(tokens)
        exclamation_count = len(re.findall("!", sentence))
        if exclamation_count != 0:
            has_exclamation = 1
            if exclamation_count > 2:
                has_several_exclamation = 1
        question_count = len(re.findall(r"\?", sentence))
        if question_count != 0:
            has_question = 1
            if question_count > 2:
                has_several_question = 1
        # BUG FIX: this flag should fire when "!?" or "?!" IS present.
        # The original condition was inverted (set when neither occurred).
        if re.findall(r"!\?", sentence) or re.findall(r"\?!", sentence):
            has_exclamation_question = 1
        # BUG FIX: guard tokens[-1] — token lists may exist but be empty,
        # which made the original raise IndexError.
        if tokens:
            end_exclamation = 1 if "!" in tokens[-1] else 0
            end_question = 1 if "?" in tokens[-1] else 0
    feature = [has_exclamation, has_several_exclamation, has_question,
               has_several_question, has_exclamation_question,
               end_exclamation, end_question]
    return util.get_feature_by_list(feature)
def emoticon(microblog):
    '''Emoticon

    Five binary features from the emoticon lexicon ``dict_emoticon``
    (+1 = positive emoticon, -1 = negative): has positive, has several
    positive, has negative, has several negative, has both polarities.
    '''
    feature = []
    has_pos = 0
    has_neg = 0
    has_several_pos = 0
    has_several_neg = 0
    has_pos_neg = 0
    # The tokenizer rewrites "(" and ")" as -lrb-/-rrb-; map them back so
    # emoticons such as ":)" can be matched against the lexicon.
    noisy = {
        "-lrb-": "(",
        "-LRB-": "(",
        "-rrb-": ")",
        "-RRB-": ")"
    }
    if microblog["tokens"]:
        tokens = []  # all tokens of this tweet, flattened
        token_lists = microblog["tokens"]
        for token_list in token_lists:
            for word in token_list:
                for item in noisy:
                    if item in word:
                        word = word.replace(item, noisy[item])
                tokens.append(word)
        for token in tokens:
            if token in dict_emoticon:
                score = dict_emoticon[token]  # +1 positive / -1 negative
                if score == 1:
                    # First positive sets has_pos; any further one sets
                    # has_several_pos.
                    if has_pos == 0:
                        has_pos = 1
                    else:
                        has_several_pos = 1
                    # A negative emoticon was already seen earlier.
                    if has_neg == 1:
                        has_pos_neg = 1
                if score == -1:
                    if has_neg == 0:
                        has_neg = 1
                    else:
                        has_several_neg = 1
                    # A positive emoticon was already seen earlier.
                    if has_pos == 1:
                        has_pos_neg = 1
    feature = [has_pos, has_several_pos, has_neg, has_several_neg, has_pos_neg]
    return util.get_feature_by_list(feature)
def allcaps(microblog):
    """Allcaps

    Two binary features: [has_allcaps, has_several_allcaps] — whether at
    least one / at least two fully-uppercase tokens appear in the tweet.
    """
    upper_seen = 0
    if microblog["tokens"]:
        for sentence_tokens in microblog["tokens"]:
            for token in sentence_tokens:
                if token.isupper():
                    upper_seen += 1
                    if upper_seen >= 2:
                        break  # two are enough — stop scanning
            if upper_seen >= 2:
                break
    flags = [1 if upper_seen >= 1 else 0, 1 if upper_seen >= 2 else 0]
    return util.get_feature_by_list(flags)
def ners_existed(tweet):
    """Named-entity presence indicators for the tweet."""
    return util.get_feature_by_list(dict_util.get_ners_exist(tweet))
def sentilexi(tweet):
    '''SentimentLexicon

    For each unigram lexicon, then each bigram lexicon, appends 11 features:
    positive/negative/neutral counts and ratios, max and min score, a
    [pos, neg] sign-flag pair for the total score, and the score of the
    last token (or last bigram). Tokens tagged "_NEG" (within negation
    scope) have their lexicon score damped and flipped by a factor of -0.8.
    '''
    feature = []
    # Each dict maps a word to a single 1-dim score (when the lexicon has
    # separate pos_score and neg_score, the value is pos_score - neg_score).
    Lexicon_dict_list = [
        DictLoader().get("sent_BL"),
        DictLoader().get("sent_GI"),
        DictLoader().get("sent_IMDB"),
        DictLoader().get("sent_MPQA"),
        DictLoader().get("sent_NRCE"),
        DictLoader().get("sent_AF"),
        DictLoader().get("sent_NRC140_U"),
        DictLoader().get("sent_NRCH_U"),
    ]
    # tokens = list(itertools.chain(*
    # reverse_neg appends the "_NEG" suffix to the 4 words that follow a
    # negation word.
    tokens = reverse_neg(tweet)
    for Lexicon in Lexicon_dict_list:
        score = []
        for word in tokens:
            # Negated tokens get a damped, sign-flipped weight.
            flag = -0.8 if word.endswith("_NEG") else 1
            word = word.replace("_NEG", "")
            if word in Lexicon:
                score.append(Lexicon[word] * flag)
        if len(score) == 0:
            # No token matched this lexicon: pad the 11 features with zeros.
            feature += [0] * 11
            continue
        countPos, countNeg, countNeu = 0, 0, 0
        length = len(score) * 1.0
        for s in score:
            if s > 0.49:
                countPos += 1
            elif s < -0.49:
                countNeg += 1
            else:
                countNeu += 1
        feature += [countPos, countNeg, countNeu,
                    countPos / length, countNeg / length, countNeu / length,
                    max(score), min(score)]
        finalscore = sum(score)
        # feature.append(finalscore)
        # Encode the sign of the total score as a [pos, neg] flag pair.
        if finalscore > 0:
            feature += [1, 0]
        elif finalscore < 0:
            feature += [0, 1]
        else:
            feature += [0, 0]
        # pos_score = [t for t in score if t > 0]
        # neg_score = [t for t in score if t < 0]
        # feature.append(sum(pos_score))
        # feature.append(sum(neg_score))
        # if pos_score:
        #     feature.append(pos_score[-1])
        # else:
        #     feature.append(0)
        # if neg_score:
        #     feature.append(neg_score[-1])
        # else:
        #     feature.append(0)
        # Score of the last token (NOTE(review): assumes tokens is
        # non-empty — an empty tweet would raise IndexError here).
        word = tokens[-1]
        flag = -0.8 if word.endswith("_NEG") else 1
        word = word.replace("_NEG", "")
        if word in Lexicon:
            feature.append(Lexicon[word] * flag)
        else:
            feature.append(0)
    # Bigram Lexicons
    for Lexicon in [DictLoader().get("sent_NRC140_B"), DictLoader().get("sent_NRCH_B")]:
        score = []
        bigram = list(nltk.ngrams(tokens, 2))
        for index, bi in enumerate(bigram):
            # Damp/flip only when BOTH words of the bigram are negated.
            flag = -0.8 if bi[0].endswith("_NEG") and bi[1].endswith("_NEG") else 1
            bi = (bi[0].replace("_NEG", ""), bi[1].replace("_NEG", ""))
            bigram[index] = bi
            if bi in Lexicon:
                score.append(Lexicon[bi] * flag)
        if not score:
            # Also covers tweets with fewer than 2 tokens (bigram == []),
            # so bigram[-1] below is never reached on an empty list.
            feature += [0] * 11
            continue
        countPos, countNeg, countNeu = 0, 0, 0
        length = len(score) * 1.0
        for s in score:
            if s > 0.49:
                countPos += 1
            elif s < -0.49:
                countNeg += 1
            else:
                countNeu += 1
        feature += [countPos, countNeg, countNeu,
                    countPos / length, countNeg / length, countNeu / length,
                    max(score), min(score)]
        finalscore = sum(score)
        # feature.append(finalscore)
        if finalscore > 0:
            feature += [1, 0]
        elif finalscore < 0:
            feature += [0, 1]
        else:
            feature += [0, 0]
        pos_score = [t for t in score if t > 0]
        neg_score = [t for t in score if t < 0]
        # feature.append(sum(pos_score))
        # feature.append(sum(neg_score))
        # if pos_score:
        #     feature.append(pos_score[-1])
        # else:
        #     feature.append(0)
        # if neg_score:
        #     feature.append(neg_score[-1])
        # else:
        #     feature.append(0)
        # Score of the last bigram (bigram[index] was rewritten above with
        # the "_NEG" suffixes already stripped, so strip again is a no-op).
        bi = bigram[-1]
        flag = -0.8 if bi[0].endswith("_NEG") and bi[1].endswith("_NEG") else 1
        bi = (bi[0].replace("_NEG", ""), bi[1].replace("_NEG", ""))
        if bi in Lexicon:
            feature.append(Lexicon[bi] * flag)
        else:
            feature.append(0)
    return util.get_feature_by_list(feature)
def wv_GloVe(tweet):
    """GloVe embedding feature for the tweet."""
    glove_table = DictLoader().get("embed_GloVe")
    return util.get_feature_by_list(dict_util.get_w2v(tweet, glove_table))