def get_GSDMM_Feature(self, json):
    """ Return 1 if the tweet is sampled into a chat/noise cluster, else 0 """
    if tk.key_orgntext in json:
        json[tk.key_text] = pu.text_normalization(json[tk.key_orgntext])
    else:
        text = json[tk.key_text]
        json[tk.key_orgntext] = text
        json[tk.key_text] = pu.text_normalization(text)
    topic_num = self.c.sample_cluster(json)
    if topic_num in self.is_noise_dict:
        return 1
    return 0
def make_text_files():
    for idx, file in enumerate(neg_2012_full_files):
        twarr = fu.load_array(file)
        txtarr = list()
        for tw in twarr:
            text = pu.text_normalization(tw[tk.key_text])
            if pu.is_empty_string(text) or len(text) < 20:
                continue
            txtarr.append(text)
        print('len delta', len(twarr) - len(txtarr))
        path = Path(file)
        out_file_name = '_'.join([path.parent.name, path.name]).replace('json', 'txt')
        out_file = ft_data_pattern.format(out_file_name)
        print(out_file)
        fu.write_lines(out_file, txtarr)
    return
    # NOTE: everything below the early return above is currently unreachable
    p_twarr_blocks = map(fu.load_array, pos_files)
    p_txtarr_blocks = map(twarr2textarr, p_twarr_blocks)
    p_txtarr = au.merge_array(list(p_txtarr_blocks))
    p_out_file = ft_data_pattern.format('pos_2016.txt')
    fu.write_lines(p_out_file, p_txtarr)
    for f in neg_files:
        in_file = neg_event_pattern.format(f)
        out_file = ft_data_pattern.format(f.replace("json", "txt"))
        twarr = fu.load_array(in_file)
        txtarr = twarr2textarr(twarr)
        print(len(twarr), '->', len(txtarr), len(twarr) - len(txtarr))
        fu.write_lines(out_file, txtarr)
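# Illustration (not part of the original module): how the output filename above is derived with
# pathlib, shown on a hard-coded example path; ft_data_pattern is assumed to be a '{}'-style template.
from pathlib import Path

example = Path('/data/neg_2012/part_03.json')
out_file_name = '_'.join([example.parent.name, example.name]).replace('json', 'txt')
print(out_file_name)                               # neg_2012_part_03.txt
print('/data/fasttext/{}'.format(out_file_name))   # stand-in for ft_data_pattern.format(...)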
def chatFilter(orgn):
    global reason
    c_f = res.NORMAL
    corpus = pu.text_normalization(orgn).lower().split()
    vec_bow = dictionary.doc2bow(corpus)
    vec_lda = lda[vec_bow]
    maxSim = 0.35
    topicNum = -1
    # pick the topic with the highest probability above the 0.35 threshold
    for sim in vec_lda:
        if sim[1] > maxSim:
            topicNum = sim[0]
            maxSim = sim[1]
    # print(topicNum)
    if topicNum != -1:
        # print(nd_text[count])
        # print(nd_corpus[count])
        if labeled_topic[topicNum] == 3:
            c_f = res.NOISY
            reason = "topic: " + str(topicNum)
        elif labeled_topic[topicNum] == 2:
            c_f = res.SUSPICIOUS
        else:
            c_f = res.NORMAL
    return c_f
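# Illustration (not part of the original module): the topic assignment above relies on gensim's
# bag-of-words + LdaModel API. A minimal self-contained sketch of that flow, with a toy corpus
# standing in for the real `dictionary` / `lda` globals:
from gensim import corpora, models

toy_texts = [['bomb', 'attack', 'police'], ['lol', 'funny', 'cat'], ['explosion', 'injured', 'police']]
toy_dictionary = corpora.Dictionary(toy_texts)
toy_corpus = [toy_dictionary.doc2bow(t) for t in toy_texts]
toy_lda = models.LdaModel(toy_corpus, num_topics=2, id2word=toy_dictionary, random_state=0)

new_doc = 'police report an explosion'.lower().split()
for topic_id, prob in toy_lda[toy_dictionary.doc2bow(new_doc)]:
    print(topic_id, prob)  # chatFilter keeps the topic whose probability exceeds the 0.35 threshold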
def twarr2textarr(twarr):
    textarr = list()
    for tw in twarr:
        text = tw.get(tk.key_text).strip()
        if tk.key_orgntext not in tw:
            text = pu.text_normalization(text)
        if pu.is_empty_string(text):
            continue
        textarr.append(text)
    return textarr
def textarr2featurearr(self, textarr):
    vecarr = list()
    for text in textarr:
        try:
            ft_vec = self.get_ft_vector(text)
        except:
            # fall back to normalized text if the raw text breaks the fastText lookup
            text = pu.text_normalization(text)
            ft_vec = self.get_ft_vector(text)
        vecarr.append(ft_vec)
    return np.array(vecarr)
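# Illustration (assumption, not the original get_ft_vector): if get_ft_vector wraps the official
# `fasttext` Python bindings, the per-text embedding would look roughly like this; 'ft_model.bin'
# is a placeholder path for whatever model file the class actually loads.
import fasttext
import numpy as np

ft_model = fasttext.load_model('ft_model.bin')               # hypothetical model file
vec = ft_model.get_sentence_vector('some normalized tweet text')
print(np.asarray(vec).shape)                                  # fixed-length dense vector per text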
def textarr2featurearr_no_gpe(self, textarr):
    vecarr = list()
    for text in textarr:
        try:
            ft_vec = self.get_ft_vector(text)
        except:
            text = pu.text_normalization(text)
            ft_vec = self.get_ft_vector(text)
        ft_vec = np.append(ft_vec, self.has_keyword_feature(text))
        vecarr.append(ft_vec)
    return np.array(vecarr)
def filter_twarr_text(twarr):
    """ This function only suits tweets that have not been preprocessed yet """
    flt_twarr = list()
    for tw in twarr:
        # TODO text_orgn = tw.get(tk.key_text, '').strip()
        text_orgn = tw.get(tk.key_orgntext, tw.get(tk.key_text, None)).strip()
        if not text_orgn:
            continue
        text_norm = pu.text_normalization(text_orgn).strip()
        if pu.is_empty_string(text_norm) or not pu.has_enough_alpha(text_norm, 0.65):
            continue
        tw[tk.key_orgntext] = text_orgn
        tw[tk.key_text] = text_norm
        flt_twarr.append(tw)
    return flt_twarr
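# Illustration (assumption): pu.has_enough_alpha is not shown in this listing; the filter above
# only makes sense if it checks that a minimum fraction of the characters are alphabetic.
# A self-contained sketch of that kind of check:
def has_enough_alpha_sketch(text, ratio=0.65):
    """Return True if at least `ratio` of the non-space characters are letters."""
    chars = [c for c in text if not c.isspace()]
    if not chars:
        return False
    alpha = sum(1 for c in chars if c.isalpha())
    return alpha / len(chars) >= ratio

print(has_enough_alpha_sketch('bomb attack near the station', 0.65))  # True
print(has_enough_alpha_sketch('@@@ 12345 !!!', 0.65))                 # False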
def textarr2featurearr_no_gpe(self, textarr):
    """
    Convert the input list of texts into a list of vectors; besides the vector given by
    fastText, each text's vector is concatenated with the sensitive-word/trigger-word
    counts returned by self.has_keyword_feature
    :param textarr: list, each element is a str of plain text
    :return: np.array (2-d), each element is an np.array (1-d)
    """
    vecarr = list()
    for text in textarr:
        try:
            ft_vec = self.get_ft_vector(text)
        except:
            text = pu.text_normalization(text)
            ft_vec = self.get_ft_vector(text)
        ft_vec = np.append(ft_vec, self.has_keyword_feature(text))
        vecarr.append(ft_vec)
    return np.array(vecarr)
def filter_twarr_text(twarr):
    """
    Preprocess the text of every tweet in the input list and discard tweets whose
    preprocessed text fails the checks; for each kept tweet, the tk.key_orgntext field
    keeps the original text and the tk.key_text field keeps the preprocessed result
    :param twarr: list, list of tweets
    :return: list, tweets that passed text preprocessing and filtering
    """
    flt_twarr = list()
    for tw in twarr:
        # TODO text_orgn = tw.get(tk.key_text, '').strip()
        text_orgn = tw.get(tk.key_text, '').strip()
        # text_orgn = tw.get(tk.key_orgntext, tw.get(tk.key_text, None)).strip()
        if not text_orgn:
            continue
        text_norm = pu.text_normalization(text_orgn).strip()
        if pu.is_empty_string(text_norm) or not pu.has_enough_alpha(text_norm, 0.65):
            continue
        tw[tk.key_orgntext] = text_orgn
        tw[tk.key_text] = text_norm
        flt_twarr.append(tw)
    return flt_twarr
def get_features(self, json):
    user = json[tk.key_user]
    if tk.key_description in user and user[tk.key_description] is not None:
        l_profile_description = len(user[tk.key_description])
    else:
        l_profile_description = 0
    FI = user[tk.key_friends_count]
    FE = user[tk.key_followers_count]
    num_tweet_posted = user[tk.key_statuses_count]
    tw_time = json[tk.key_created_at]
    user_born_time = json[tk.key_user][tk.key_created_at]
    # TODO some tweets carry malformed time fields (e.g. missing minute/second info) and need extra handling
    tw_d = datetime.datetime.strptime(tw_time, '%a %b %d %H:%M:%S %z %Y')
    user_d = datetime.datetime.strptime(user_born_time, '%a %b %d %H:%M:%S %z %Y')
    time_delta = tw_d - user_d
    # account age in hours at posting time
    AU = time_delta.days * 24 + time_delta.seconds / 3600.0
    FE_FI_ratio = 0
    if FI != 0:
        FE_FI_ratio = FE / float(FI)
    reputation = FE / (FI + FE) if (FI + FE) != 0 else 0
    following_rate = FI / AU if AU != 0 else 0
    tweets_per_day = num_tweet_posted / (AU / 24) if AU != 0 else 0
    tweets_per_week = num_tweet_posted / (AU / (24 * 7)) if AU != 0 else 0
    user_features = [l_profile_description, FI, FE, num_tweet_posted, AU, FE_FI_ratio,
                     reputation, following_rate, tweets_per_day, tweets_per_week]
    """ content features """
    if tk.key_orgntext not in json:
        orgn_text = json[tk.key_orgntext] = json[tk.key_text]
        json[tk.key_text] = pu.text_normalization(orgn_text)
    text = json[tk.key_text]
    words = text.split()
    num_words = len(words)
    num_characters = len(text)
    num_white_space = len(re.findall(r'(\s)', text))
    num_capitalization_word = len(re.findall(r'(\b[A-Z]([a-z])*\b)', text))
    num_capital_per_word = num_capitalization_word / num_words
    max_word_length = 0
    mean_word_length = 0
    for word in words:
        if len(word) > max_word_length:
            max_word_length = len(word)
        mean_word_length += len(word)
    mean_word_length /= len(words)
    orgn = json[tk.key_orgntext]
    num_exclamation_marks = orgn.count('!')
    num_question_marks = orgn.count('?')
    num_urls = len(json['entities']['urls'])
    num_urls_per_word = num_urls / num_words
    num_hashtags = len(json['entities']['hashtags'])
    num_hashtags_per_word = num_hashtags / num_words
    num_mentions = len(json['entities']['user_mentions'])
    num_mentions_per_word = num_mentions / num_words
    substrings = get_all_substrings(text)
    num_spam_words = 0
    for sub in substrings:
        if sub in self.spam_words:
            num_spam_words += 1
    num_spam_words_per_word = num_spam_words / num_words
    content_features = [num_words, num_characters, num_white_space, num_capitalization_word,
                        num_capital_per_word, max_word_length, mean_word_length,
                        num_exclamation_marks, num_question_marks, num_urls, num_urls_per_word,
                        num_hashtags, num_hashtags_per_word, num_mentions, num_mentions_per_word,
                        num_spam_words, num_spam_words_per_word]
    sentiment_feature = count_sentiment(text)
    chat_feature = self.gsdmm.get_GSDMM_Feature(json)
    total_features = list()
    total_features.extend(user_features)
    total_features.extend(content_features)
    total_features.append(sentiment_feature)
    total_features.append(chat_feature)
    return total_features
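# Illustration (not part of the original class): the Twitter `created_at` format parsed above,
# and the account-age-in-hours arithmetic, on a pair of hard-coded timestamps.
import datetime

fmt = '%a %b %d %H:%M:%S %z %Y'
tw_d = datetime.datetime.strptime('Sun Mar 12 15:30:00 +0000 2017', fmt)
user_d = datetime.datetime.strptime('Fri Jan 01 10:00:00 +0000 2016', fmt)
delta = tw_d - user_d
age_hours = delta.days * 24 + delta.seconds / 3600.0
print(age_hours)  # 10469.5 hours, i.e. the AU feature for this pair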
tmu.check_time()
exit()

sub_files = fi.listchildren('/home/nfs/cdong/tw/origin/', fi.TYPE_FILE, concat=True)[18:19]
twarr = au.merge_array([fu.load_array(file) for file in sub_files])
print(len(twarr))
tmu.check_time(print_func=None)
for idx, tw in enumerate(twarr[14000:15000]):
    if (idx + 1) % 1000 == 0:
        print(idx)
    try:
        my_filter.get_features(tw)
    except:
        # print(tw[tk.key_text])
        # print(tw[tk.key_orgntext])
        print('-', pu.text_normalization(tw[tk.key_orgntext]))
tmu.check_time(print_func=lambda dt: print('pos filter time elapsed {}s'.format(dt)))
exit()

pos_base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/'
sub_files = fi.listchildren(pos_base, fi.TYPE_FILE, 'txt$', concat=True)
pos_twarr = au.merge_array([fu.load_array(file) for file in sub_files])
print(len(pos_twarr))
tmu.check_time(print_func=None)
pos_proba = my_filter.predict_proba(pos_twarr)
tmu.check_time(print_func=lambda dt: print('pos filter time elapsed {}s'.format(dt)))

neg_files = [
    '/home/nfs/yying/data/crawlTwitter/Crawler1/test.json',
    '/home/nfs/yying/data/crawlTwitter/Crawler2/crawl2.json',
def text_normalization(text):
    text = pu.text_normalization(text)
    text = ' '.join(tokenize(text))
    return text
def filter_text(text):
    return pu.text_normalization(text)
def __init__(self, trainning=None):
    self.c = ChatFilter()
    self.orgn_predict_label = None
    self.class_dist = None
    self.is_noise_dict = None
    if trainning is None:
        try:
            with open(chat_filter_file, 'rb') as f:
                self.c = pickle.load(f)
            with open(orgn_predict_label_file, 'rb') as f:
                self.orgn_predict_label = pickle.load(f)
            with open(class_dist_file, 'rb') as f:
                self.class_dist = pickle.load(f)
            with open(is_noise_dict_file, 'rb') as f:
                self.is_noise_dict = set(pickle.load(f))
        except:
            print('load error')
            traceback.print_exc()
    else:
        # prepare data
        mypath = '../data/'
        onlyfiles = [mypath + f for f in listdir(mypath) if isfile(join(mypath, f))]
        print(onlyfiles)
        twarrF = readFilesAsJsonList(onlyfiles)
        twarrT = trainning
        for idx, tw in enumerate(twarrF):
            twarrF[idx]['label'] = 0
        for idx, tw in enumerate(twarrT):
            twarrT[idx]['label'] = 1
        twarr = list()
        twarr.extend(twarrT)
        twarr.extend(twarrF)
        i = 0
        for tw in twarr:
            tw['text'] = pu.text_normalization(tw['orgn'])
            twarr[i] = tw
            i += 1
        # train
        self.c = ChatFilter()
        self.c.set_twarr(twarr)
        # recommended hyperparameters; the paper uses alpha=0.1 * len(twarr), beta=0.02
        self.c.set_hyperparams(0.9, 0.01, 55)
        class_dist, orgn_predict_label = self.c.recluster_using_GSDMM()
        try:
            with open(chat_filter_file, 'wb') as f:
                pickle.dump(self.c, f)
            with open(orgn_predict_label_file, 'wb') as f:
                pickle.dump(orgn_predict_label, f)
            with open(class_dist_file, 'wb') as f:
                pickle.dump(class_dist, f)
        except:
            print('save error')
            traceback.print_exc()
        # get isNoiseDict
        label = [tw['label'] for tw in twarr]
        table = pd.DataFrame(index=set(orgn_predict_label), columns=set(label), data=0)
        for i in range(len(label)):
            table.loc[orgn_predict_label[i], label[i]] += 1
        print(table)
        multiple_times = 30
        self.is_noise_dict = []
        zero_total = float(table[0].sum())
        one_total = float(table[1].sum())
        for index, row in table.iterrows():
            # a cluster is "noise" if it holds only negative tweets (above a minimum count),
            # or if its negative-to-positive proportion ratio exceeds `multiple_times`
            if row[1] == 0:
                if row[0] > multiple_times:
                    self.is_noise_dict.append(index)
                else:
                    continue
            elif (row[0] / zero_total) / (row[1] / one_total) > multiple_times:
                self.is_noise_dict.append(index)
        with open(is_noise_dict_file, 'wb') as f:
            pickle.dump(self.is_noise_dict, f)
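# Illustration (not part of the original class): the noise-cluster selection above boils down to
# a cluster-vs-label contingency table plus a ratio threshold. A self-contained sketch of the
# same logic on made-up cluster assignments and labels:
import pandas as pd

clusters = [0, 0, 1, 1, 1, 2, 2, 2, 2]          # GSDMM cluster id per tweet
labels   = [1, 1, 0, 0, 1, 0, 0, 0, 0]          # 1 = event tweet, 0 = background chatter
table = pd.crosstab(pd.Series(clusters, name='cluster'), pd.Series(labels, name='label'))

multiple_times = 3.0
zero_total, one_total = float(table[0].sum()), float(table[1].sum())
noise_clusters = []
for cluster_id, row in table.iterrows():
    if row[1] == 0:                              # cluster holds no positive tweets at all
        if row[0] > multiple_times:
            noise_clusters.append(cluster_id)
    elif (row[0] / zero_total) / (row[1] / one_total) > multiple_times:
        noise_clusters.append(cluster_id)
print(noise_clusters)                            # [2] with this toy data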