def make_text_files():
    for idx, file in enumerate(neg_2012_full_files):
        twarr = fu.load_array(file)
        txtarr = list()
        for tw in twarr:
            text = pu.text_normalization(tw[tk.key_text])
            if pu.is_empty_string(text) or len(text) < 20:
                continue
            txtarr.append(text)
        print('len delta', len(twarr) - len(txtarr))
        path = Path(file)
        out_file_name = '_'.join([path.parent.name, path.name]).replace('json', 'txt')
        out_file = ft_data_pattern.format(out_file_name)
        print(out_file)
        fu.write_lines(out_file, txtarr)
    return
    # The code below is unreachable due to the early return above.
    p_twarr_blocks = map(fu.load_array, pos_files)
    p_txtarr_blocks = map(twarr2textarr, p_twarr_blocks)
    p_txtarr = au.merge_array(list(p_txtarr_blocks))
    p_out_file = ft_data_pattern.format('pos_2016.txt')
    fu.write_lines(p_out_file, p_txtarr)
    for f in neg_files:
        in_file = neg_event_pattern.format(f)
        out_file = ft_data_pattern.format(f.replace("json", "txt"))
        twarr = fu.load_array(in_file)
        txtarr = twarr2textarr(twarr)
        print(len(twarr), '->', len(txtarr), len(twarr) - len(txtarr))
        fu.write_lines(out_file, txtarr)
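# twarr2textarr is called above but not defined in this snippet; a minimal
# sketch consistent with the inline loop in make_text_files (normalize each
# tweet's text, drop empty or very short results). An assumption, not
# necessarily the repo's exact implementation.
def twarr2textarr_sketch(twarr):
    txtarr = list()
    for tw in twarr:
        text = pu.text_normalization(tw[tk.key_text])
        if pu.is_empty_string(text) or len(text) < 20:
            continue
        txtarr.append(text)
    return txtarr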
def exec_pre_test(test_data_path):
    subfiles = fi.listchildren(test_data_path, children_type='file')
    # file_list = fu.split_multi_format(
    #     [(test_data_path + file) for file in subfiles if file.endswith('.json')], process_num=6)
    # twarr_blocks = fu.multi_process(fi.summary_unzipped_tweets_multi,
    #                                 [(file_list_slice,) for file_list_slice in file_list])
    twarr_blocks = filter_twarr(
        [fu.load_array(file) for file in subfiles if file.endswith('.json')])
    twarr = au.merge_array(twarr_blocks)
    tu.start_ner_service(pool_size=16)
    tu.twarr_ner(twarr)
    tu.end_ner_service()
    all_ids = set(fu.load_array(test_data_path + 'test_ids_all.csv'))
    pos_ids = set(fu.load_array(test_data_path + 'test_ids_pos.csv'))
    non_pos_ids = all_ids.difference(pos_ids)
    pos_twarr = list()
    non_pos_twarr = list()
    for tw in twarr:
        twid = tw[tk.key_id]
        if twid in pos_ids:
            pos_twarr.append(tw)
        elif twid in non_pos_ids:
            non_pos_twarr.append(tw)
    fu.dump_array(getcfg().pos_data_file, pos_twarr)
    fu.dump_array(getcfg().non_pos_data_file, non_pos_twarr)
def __init__(self, domain, product_num):
    self.domain = domain
    self.seed_product = 'B003EYVXV4'
    self.datahelper = DataHelper()
    self.userhelper = UserHelper()
    self.product2user = fu.load_array(
        os.path.join(ku.index_root, domain, 'product2user.json'))[0]
    self.user2product = fu.load_array(
        os.path.join(ku.index_root, domain, 'user2product.json'))[0]
    self.all_products = fu.listchildren(
        os.path.join(ku.product_root, domain), concat=False)
    self.product_num = product_num
def order_twarr_through_time(self):
    print("data source : normal")
    event_blocks = fu.load_array("./data/events2016.txt")
    false_event_twarr = fu.load_array("./data/false_pos_events.txt")
    event_blocks.append(false_event_twarr)
    for block_idx, block in enumerate(event_blocks):
        for tw in block:
            tw[tk.key_event_label] = block_idx
    twarr = au.merge_array(event_blocks)
    tflt.filter_twarr_dup_id(twarr)

    def random_idx_for_item(item_arr, dest_item):
        # Interleave the indices of dest_item entries uniformly at random
        # among the remaining indices, preserving order within each group.
        from numpy import random

        def sample(prob):
            return random.rand() < prob

        non_dest_item_idx = [idx for idx in range(len(item_arr)) if item_arr[idx] not in dest_item]
        dest_item_idx = [idx for idx in range(len(item_arr)) if item_arr[idx] in dest_item]
        non_dest_cnt = dest_cnt = 0
        res = list()
        while len(non_dest_item_idx) > non_dest_cnt and len(dest_item_idx) > dest_cnt:
            if sample((len(dest_item_idx) - dest_cnt) /
                      (len(dest_item_idx) - dest_cnt + len(non_dest_item_idx) - non_dest_cnt)):
                res.append(dest_item_idx[dest_cnt])
                dest_cnt += 1
            else:
                res.append(non_dest_item_idx[non_dest_cnt])
                non_dest_cnt += 1
        while len(non_dest_item_idx) > non_dest_cnt:
            res.append(non_dest_item_idx[non_dest_cnt])
            non_dest_cnt += 1
        while len(dest_item_idx) > dest_cnt:
            res.append(dest_item_idx[dest_cnt])
            dest_cnt += 1
        return res

    idx_time_order = tu.rearrange_idx_by_time(twarr)
    twarr = [twarr[idx] for idx in idx_time_order]
    lbarr = self.lbarr_of_twarr(twarr)
    idx_random_item = random_idx_for_item(lbarr, {max(lbarr)})
    twarr = [twarr[idx] for idx in idx_random_item]
    return twarr
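# Illustration of random_idx_for_item (hypothetical data): with
# item_arr = [0, 9, 0, 9, 0] and dest_item = {9}, the index lists are
# non_dest_item_idx = [0, 2, 4] and dest_item_idx = [1, 3]; each call returns
# one uniformly random interleaving of the two lists, e.g. [0, 1, 2, 3, 4] or
# [1, 0, 2, 4, 3], so the max-label tweets end up scattered through the
# timeline while both groups keep their internal order.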
def load_tw_batches(self, load_cluid_arr):
    tw_batches = fu.load_array(self.labelled_batch_file)
    tu.twarr_nlp(au.merge_array(tw_batches))
    print("twarr nlp over")
    if load_cluid_arr:
        cluid_batches = fu.load_array(self.cluid_batch_file)
        assert len(tw_batches) == len(cluid_batches)
        for b_idx in range(len(tw_batches)):
            tw_batch, cluid_batch = tw_batches[b_idx], cluid_batches[b_idx]
            assert len(tw_batch) == len(cluid_batch)
            for idx in range(len(tw_batch)):
                tw, cluid = tw_batch[idx], cluid_batch[idx]
                tw[tk.key_event_cluid] = cluid
    return tw_batches
def test1():
    import utils.tweet_keys as tk
    import utils.array_utils as au
    import utils.pattern_utils as pu
    import utils.timer_utils as tmu
    import calling.back_extractor as bext
    import utils.file_iterator as fi
    import utils.function_utils as fu
    from extracting.cluster_infomation import merge_cic_list2cluid_twarr_list
    # C_NAME = BackCluster.G_CLASS.__name__
    # _base = '/home/nfs/cdong/tw/src/calling/tmp_{}'.format(C_NAME)
    # fi.mkdir(_base, remove_previous=True)
    # _cluid_cluster_list_file = '_cluid_cluster_list_{}'.format(C_NAME)
    _twarr = fu.load_array("./filtered_twarr.json")[:5000]
    _batches = au.array_partition(_twarr, [1] * 43, random=False)
    _max_window_size, _full_interval = 4, 2
    _alpha, _beta = 30, 0.01
    start_pool(_max_window_size, _full_interval, _alpha, _beta)
    for idx, twarr in enumerate(_batches):
        input_twarr(twarr)
        if idx > 0 and idx % 2 == 0:
            execute_cluster(10)
            ctl = wait_get_cluid_twarr_list()
            print('len(ctl)={}'.format(len(ctl)) if ctl else 'ctl is None')
    end_pool()
def get_semantic_tokens(file_list):
    pos_type_info = {
        ark.prop_label: {K_IFD: IdFreqDict()},
        ark.comm_label: {K_IFD: IdFreqDict()},
        ark.verb_label: {K_IFD: IdFreqDict()},
        ark.hstg_label: {K_IFD: IdFreqDict()},
    }
    total_doc_num = 0
    for file in file_list:
        twarr = ark.twarr_ark(fu.load_array(file))
        total_doc_num += len(twarr)
        pos_tokens = au.merge_array([tw[tk.key_ark] for tw in twarr])
        for pos_token in pos_tokens:
            word = pos_token[0].strip().lower()
            if len(word) <= 2 or not pu.is_valid_keyword(word):
                continue
            real_label = ark.pos_token2semantic_label(pos_token)
            if real_label:
                pos_type_info[real_label][K_IFD].count_word(word)
    return pos_type_info, total_doc_num
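# Usage sketch for get_semantic_tokens (hypothetical file list); only methods
# that appear elsewhere in these snippets (vocabulary_size) are assumed:
# pos_type_info, total_doc_num = get_semantic_tokens(file_list)
# prop_ifd = pos_type_info[ark.prop_label][K_IFD]
# print(total_doc_num, prop_ifd.vocabulary_size())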
def main():
    bflt.start_pool(flt_pool_size, ne_threshold, clf_threshold)
    bclu.start_pool(hold_batch_num, batch_size, alpha, beta)
    # bext.start_pool(ext_pool_size)
    sub_files = fi.listchildren("/home/nfs/cdong/tw/origin/", fi.TYPE_FILE, concat=True)[-4000:]
    for _idx, _file in enumerate(sub_files):
        _twarr = fu.load_array(_file)
        print("1-- {} th twarr to filter, len: {}".format(_idx, len(_twarr)))
        twarr2filter(_twarr)
        # if _idx > 0 and (_idx + 1) % 1000 == 0:
        #     dt = tmu.check_time('if_idx>0and(_idx+1)%1000==0:', print_func=None)
        #     emu.send_email('notification', '{}/{} file, {}s from last 1000 file'.format(_idx+1, len(sub_files), dt))
        # if _idx % 50 == 0:
        #     tmu.check_time('_idx, _file', print_func=lambda dt: print("{} s from last 50".format(dt)))
        if _idx > 0 and _idx % 10 != 0:
            continue
        try_filter2cluster()
        # cluid_twarr_list = bclu.get_cluid_twarr_list()
        # print(len(cluid_twarr_list) if cluid_twarr_list else '--not ready')
        # if cluid_twarr_list:
        #     print(len(cluid_twarr_list))
    ensure_filter_workload()
def sum_files(file_list, filter_level):
    res_twarr = list()
    for file in file_list:
        twarr = fu.load_twarr_from_bz2(file) if file.endswith('.bz2') else fu.load_array(file)
        twarr = tflt.filter_twarr(twarr, filter_level)
        res_twarr.extend(twarr)
    return res_twarr
def load_dict(self, file_name):
    self.clear()
    word_id_freq_arr = fu.load_array(file_name)
    for word, wid, freq in word_id_freq_arr:
        self._word2id[word] = {K_FREQ: int(freq), K_ID: int(wid)}
        # Convert before accumulating: freq may be stored as a string.
        # (Likely redundant if calc_freq_sum recomputes the total below.)
        self._freq_sum += int(freq)
    self.calc_freq_sum()
    return self
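# Expected on-disk format for load_dict, inferred from the unpacking loop
# above: an array of [word, id, freq] triples, e.g. (hypothetical values)
# [["attack", 0, 153], ["bomb", 1, 87]]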
def load_tw_batches(self, load_cluid_arr):
    temp_len = 60000
    twarr = fu.load_array(self.filtered_twarr_file)[:temp_len]
    print("load_tw_batches, len(twarr)=", len(twarr))
    if load_cluid_arr:
        cluidarr = fu.load_array(self.filtered_cluidarr_file)[:temp_len]
        assert len(twarr) == len(cluidarr)
        for idx in range(len(twarr)):
            tw, twid = twarr[idx], twarr[idx][tk.key_id]
            origin_id, cluid = cluidarr[idx]
            assert twid == origin_id
            tw[tk.key_event_cluid] = cluid
    twarr = tu.twarr_nlp(twarr)
    tw_batches = split_array_into_batches(twarr, self.batch_size)
    print("batch distrib {}, {} batches, total {} tweets".format(
        [len(b) for b in tw_batches], len(tw_batches), len(twarr)))
    return tw_batches
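# split_array_into_batches is not defined in this snippet; a minimal sketch,
# assuming it simply chunks the array into consecutive batches of at most
# batch_size tweets (the repo's helper may differ, e.g. splitting on time).
def split_array_into_batches_sketch(arr, batch_size):
    return [arr[i:i + batch_size] for i in range(0, len(arr), batch_size)]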
def identify_korea():
    file = '/home/nfs/cdong/tw/seeding/NorthKorea/korea.json'
    twarr_blocks = fu.load_array(file)
    twarr = au.merge_array(twarr_blocks)
    for tw in twarr:
        text = tw[tk.key_text]
        if not re.search('korea', text, flags=re.I):
            print(text)
def query_from_files(file_list, query):
    res_twarr = []
    for file in file_list:
        twarr = fu.load_array(file)
        for tw in twarr:
            if tk.key_text in tw and query.is_text_desired(tw.get(tk.key_text)):
                res_twarr.append(tw)
    return res_twarr
def multi(file):
    # ent_tags = {'FAC', 'GPE', 'LOC', 'ORG', 'NORP'}
    word_type = list()
    twarr = fu.load_array(file)
    twarr = tu.twarr_nlp(twarr)
    for tw in twarr:
        doc = tw[tk.key_spacy]
        for token in doc:
            word_type.append([token.text, token.ent_type_, token.tag_])
    return word_type
def organise_by_user(source_root, domain):
    source_path = os.path.join(source_root, '{}.json'.format(domain))
    source_reviews = fu.load_array(source_path)
    user_dict = {}
    for review in source_reviews:
        user_ID = review[ku.reviewer_ID]
        if user_ID in user_dict:
            user_dict[user_ID].append(review)
        else:
            user_dict[user_ID] = [review]
    return user_dict
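# The grouping loop above can be written more compactly with dict.setdefault;
# behavior is identical:
#     user_dict.setdefault(user_ID, []).append(review)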
def extract_bad_tweets_into(files, output_file):
    total_tw_num = 0
    neg_twarr = list()
    for file in files:
        twarr = fu.load_array(file)
        total_tw_num += len(twarr)
        for tw in twarr:
            text = tw[tk.key_text]
            if len(text) < 20 or not pu.has_enough_alpha(text, 0.6):
                neg_twarr.append(tw)
    fu.dump_array(output_file, neg_twarr)
    return len(neg_twarr), total_tw_num
def read_tweet_from_json_file(self, file):
    if not self.is_file_of_query_date(file):
        return
    for tw in fu.load_array(file):
        tw_added = False
        for seed_query in self.seed_query_list:
            # The call stays on the left of `or` so that every seed query
            # gets to see the tweet even after one has already accepted it.
            tw_added = seed_query.append_desired_tweet(tw, usingtwtime=False) or tw_added
        if tw_added:
            if tw['id'] in self.added_ids:
                continue
            self.added_ids.add(tw['id'])
            self.added_twarr.append(tw)
def main(): """ 启动各进程(池),遍历 _sub_files 中的文件名,逐个读取文件内容, 每读取一个文件,输入过滤&聚类模块,并尝试从分类器读取返回结果后输入聚类模块; 每过指定时间,向聚类模块发送聚类指令; 随后尝试从聚类模块读取返回结果,并输入聚类信息提取模块 :return: """ tmu.check_time('qwertyui') tmu.check_time('main line 116', print_func=None) bflt.start_pool(flt_pool_size, ne_threshold, clf_threshold, event_type) bclu.start_pool(max_window_size, full_interval, alpha, beta) bext.start_pool(ext_pool_size, event_type) alarm = tmu.Alarm() # _sub_files = fi.listchildren("/home/nfs/cdong/tw/origin/", fi.TYPE_FILE, concat=True)[-4000:] _sub_files = fi.listchildren( "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive", fi.TYPE_FILE, concat=True) # _twarr = fu.load_array(_sub_files[0]) # _twarr = fu.change_from_lxp_format(_twarr) for _idx, _file in enumerate(_sub_files): _twarr = fu.load_array(_file) if config.using_api_format == 'False': _twarr = fu.change_from_lxp_format(_twarr) if (_idx + 1) % 1000 == 0: dt = tmu.check_time('main line 116', print_func=None) emu.send_email('file {}/{}'.format(_idx + 1, len(_sub_files)), '{}s from last 1000 file'.format(dt)) if _idx > 0 and _idx % 10 == 0: print("main: {} th twarr to filter, len: {}".format( _idx, len(_twarr))) print("{} th twarr to filter, len: {}".format(_idx, len(_twarr))) twarr2filter(_twarr) filter2cluster() if alarm.is_time_elapse_bigger_than(check_every_sec): alarm.initialize_timestamp() filter2cluster(5) bclu.execute_cluster() time.sleep(60) cluster2extractor() # time.sleep(300) end_it() tmu.check_time('qwertyui')
def refilter_twarr(in_file, out_file):
    twarr = fu.load_array(in_file)[:200000]
    origin_len = len(twarr)
    print(origin_len)
    clf_filter = ClassifierTerror()
    # for idx in range(len(twarr) - 1, -1, -1):
    #     text = twarr[idx][tk.key_text]
    #     if not pu.has_enough_alpha(text, 0.6):
    #         print(text)
    #         twarr.pop(idx)
    # text_filter_len = len(twarr)
    # print("delta by text =", origin_len - text_filter_len)
    tmu.check_time("refilter_twarr")
    twarr = clf_filter.filter(twarr, 0.2)
    tmu.check_time("refilter_twarr")
    print(len(twarr))
    fu.dump_array(out_file, twarr[:100000])
def get_tokens(file_list):
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for file in file_list:
        twarr = fu.load_array(file)
        total_doc_num += len(twarr)
        for tw in twarr:
            tokens = re.findall(r'[a-zA-Z_#\-]{3,}', tw[tk.key_text].lower())
            real_tokens = list()
            for token in tokens:
                if len(token) >= 16:
                    real_tokens.extend(pu.segment(token))
                else:
                    real_tokens.append(token)
            for token in real_tokens:
                if (not pu.is_stop_word(token)) and pu.has_azAZ(token) and len(token) >= 3:
                    id_freq_dict.count_word(token)
    id_freq_dict.drop_words_by_condition(2)
    print(id_freq_dict.vocabulary_size())
    return id_freq_dict, total_doc_num
def func3():
    str_arr = fu.load_array('sim_info.txt')
    feature = list()
    labels = list()
    for string in str_arr:
        num_arr = [float(s) for s in re.findall(r'\d\.\d+|\d+', string)]
        # if num_arr[4] < 0.5:
        #     continue
        feature.append([num_arr[1], num_arr[3], num_arr[4]])
        labels.append(1 if num_arr[0] == num_arr[2] else 0)
        print(num_arr, feature[-1], labels[-1])
    split_idx = int(len(feature) * 0.3)
    trainX, testX = feature[split_idx:], feature[:split_idx]
    trainY, testY = labels[split_idx:], labels[:split_idx]
    clf = svm.SVC()
    # clf.fit(feature, labels)
    # predY = clf.predict(feature)
    # auc = sklearn.metrics.roc_auc_score(labels, predY)
    clf.fit(trainX, trainY)
    predY = clf.predict(testX)
    auc = sklearn.metrics.roc_auc_score(testY, predY)
    print(auc)
    for idx in range(len(predY)):
        print(predY[idx], testY[idx])
    # For each reference value 0.3..0.7, print precision/recall at the first
    # threshold that reaches it (thresholds are returned in increasing order).
    precision, recall, thresholds = metrics.precision_recall_curve(testY, predY)
    last_idx = 0
    for ref in [i / 10 for i in range(3, 8)]:
        for idx in range(last_idx, len(thresholds)):
            if thresholds[idx] >= ref:
                print('threshold', round(thresholds[idx], 2),
                      '\tprecision', round(precision[idx], 5),
                      '\trecall', round(recall[idx], 5))
                last_idx = idx
                break
def parse_cluster_to_ordereddict(cluster, twarr_info):
    # cluid = cluster_info
    # readable_info_list, text_times, earliest_time_str, latest_time_str, hot, level, sorted_twarr = twarr_info
    # All values below are hard-coded placeholders for shaping the output JSON.
    od = OrderedDict()
    clu_info = OrderedDict()
    clu_info["id"] = 124386342
    clu_info["level"] = "{}({})".format(2, 'just soso')
    clu_info["hot"] = 1234
    geo_list = [
        ["japan", "12", "32"],
        ["bangkok", "523", "435"],
        ["italy", "1234", "431"],
    ]
    geo_infer = array2ordereddict(geo_list, ['name', 'lat', 'lng'], "geo_")
    time_list = [
        ["20160234 12:23:73", "nimabi"],
        ["20179212 32:56:89", "tomorrow"],
    ]
    time_infer = OrderedDict()
    time_infer["earliest_time"] = "20180902 12:23:21"
    time_infer["latest_time"] = "20180902 12:23:21"
    time_text = array2ordereddict(time_list, ["inferred", "text"], "time_")
    time_infer.update(time_text)
    tw_file = "/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/2016-01-29_attack_Dalori.json"
    twarr = fu.load_array(tw_file)[:10]
    jsonarr = [[json.dumps(tw)[50:70]] for tw in twarr]
    tweet_list = array2ordereddict(jsonarr, row_prefix="tweet_")
    od['cluster_info'] = clu_info
    od['inferred_geo'] = geo_infer
    od['inferred_time'] = time_infer
    od['sorted_twarr'] = tweet_list
    od = {'cluster': od}
    return od
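# array2ordereddict is used above but not defined in this snippet; a minimal
# hypothetical reconstruction inferred from its three call sites: each row
# becomes an entry keyed "<row_prefix><row_idx>", mapped to an OrderedDict of
# column-name/value pairs when column names are given, else to the bare row.
from collections import OrderedDict

def array2ordereddict_sketch(rows, col_names=None, row_prefix=''):
    od = OrderedDict()
    for i, row in enumerate(rows):
        od['{}{}'.format(row_prefix, i)] = OrderedDict(zip(col_names, row)) if col_names else row
    return od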
def _load_user_reviews(self, user):
    # Fetch the reviews of a given user under a given domain.
    """
    :param user: the user whose review file is read
    :param min_threshold: minimum number of reviews posted by a user
    :param max_threshold: maximum number of reviews posted by a user
    :return: the user's reviews, filtered by the thresholds
    """
    res = []
    file = os.path.join(self.root, self.domain, user)
    with open(file) as f:
        reviews = f.readlines()
    if self.min_threshold is not None and self.max_threshold is not None:
        for review in reviews:
            review = json.loads(review)
            if self.min_threshold <= review[ku.reviewer_count] <= self.max_threshold:
                res.append(review)
    elif self.min_threshold is not None:
        for review in reviews:
            review = json.loads(review)
            if review[ku.reviewer_count] >= self.min_threshold:
                res.append(review)
    elif self.max_threshold is not None:
        for review in reviews:
            review = json.loads(review)
            if review[ku.reviewer_count] <= self.max_threshold:
                res.append(review)
    else:
        res = fu.load_array(file)
    if self.num_reviews_per_user is not None and len(res) > self.num_reviews_per_user:
        res = res[:self.num_reviews_per_user]
    if self.shuffle:
        res = sku.shuffle(res)
    return res
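# The three threshold branches above can be collapsed into one bounded check;
# behavior is identical whenever at least one bound is set (a sketch, not the
# repo's code):
#     lo = self.min_threshold if self.min_threshold is not None else float('-inf')
#     hi = self.max_threshold if self.max_threshold is not None else float('inf')
#     res = [r for r in map(json.loads, reviews) if lo <= r[ku.reviewer_count] <= hi]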
def get_reviews():
    file = r'/home/leeyang/research/data/Movie.json'
    reviews = fu.load_array(file)
    return reviews
def func2():
    file = '/home/nfs/cdong/tw/src/clustering/data/events.txt'
    twarr_blocks = fu.load_array(file)
    for tw in twarr_blocks[19]:
        print(tw[tk.key_text])
import utils.tweet_keys as tk
import utils.array_utils as au
import utils.pattern_utils as pu
import utils.timer_utils as tmu
import calling.back_extractor as bext
import utils.file_iterator as fi
import utils.function_utils as fu

fi.mkdir('/home/nfs/cdong/tw/src/calling/tmp', remove_previous=True)
tmu.check_time()
_hold_batch_num = 100
_batch_size = 100
_alpha, _beta = 30, 0.01
# _alpha, _beta = 50, 0.005
_file = "./filtered_twarr.json"
_twarr = fu.load_array(_file)[:10200]
start_pool(_hold_batch_num, _batch_size, _alpha, _beta)
input_twarr_batch(_twarr)
print('---> waiting for _cluid_cluster_list')
while True:
    _cluid_cluster_list = cluster_daemon.outq2.get()
    print(' - something returned, type: {}'.format(type(_cluid_cluster_list)))
    if _cluid_cluster_list is not None:
        break
print('---> get _cluid_cluster_list, len: {}'.format(len(_cluid_cluster_list)))
_ext_pool_size = 10
bext.start_pool(_ext_pool_size)
# NOTE: i and j come from earlier context that is not part of this fragment.
print(i, j)
# import re
# def preprocess(doc):
#     # pattern = re.compile(r'(\d\s\.\s\d)')
#     return re.sub(r'(\d\s\.\s\d)', '.', doc)
# for text in textarr[100:300]:
#     print(preprocess(text))
import sys
sys.path.append('../utils')
import utils.function_utils as fu
import re

twarr = fu.load_array(
    '/home/nfs/cdong/tw/seeding/NaturalDisaster/queried/NaturalDisaster.sum')
arr1 = twarr[:2000]
arr2 = twarr[2000:]
# cv = CV(analyzer='word', token_pattern=r'([a-zA-Z_-]+|\d+\.\d+|\d+)',
#         stop_words=stop_words, max_df=0.8, min_df=1e-5)
# The findall call below was missing its input string; a hypothetical sample
# text is supplied so the line runs.
print(re.findall(r'([a-zA-Z_-]+|\d+\.\d+|\d+)', 'magnitude 6.5 quake near the_coast'))
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
# s = 'RT @bugwannostra: @Louuu_ thx #FFFFs People power -_- works ❤signing… https://t.co/pl2bquE5Az'
]

def get_quality_autophrase(process_code, textarr, conf_thres, len_thres):
    conf_word_list = autophrase_wrapper(process_code, textarr)
    return filter_keywords(conf_word_list, conf_thres, len_thres)

if __name__ == '__main__':
    import utils.tweet_utils as tu
    # text_file = "/home/nfs/cdong/tw/src/extracting/3796_r.txt"
    # textarr = fu.read_lines(text_file)
    twarr_file = "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive/2016-01-11_blast_Istanbul.json"
    twarr_file = "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive/2016-03-26_suicide-bomb_Lahore.json"
    twarr = fu.load_array(twarr_file)
    textarr = [tw[tk.key_text] for tw in twarr]
    _conf_word_list = autophrase_wrapper(0, textarr)
    # _keywords = [item[1] for item in _conf_word_list]
    print(filter_keywords(_conf_word_list, 50))
    print('\n')
    print(textarr)
    # idx_groups = tu.group_textarr_similar_index(keywords, 0.2)
    # for g in idx_groups:
    #     print([keywords[i] for i in g], '\n')
    # print(_conf_word_list[:30])
    # print()
    # print(textarr)
    exit()
    # Keyword quality is already quite poor once fewer than 30 texts come in;
    # try to keep the number of input texts above a minimum threshold.
    # The body of __main__ stays unchanged; it is the final form of the interface.
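# filter_keywords is called above both with and without len_thres, so a
# default value is implied. A minimal sketch, assuming conf_word_list holds
# (confidence, phrase) pairs (item[1] is the phrase per the commented line
# above) and that both arguments are lower bounds; this is a hypothetical
# reconstruction, not the repo's implementation.
def filter_keywords_sketch(conf_word_list, conf_thres, len_thres=None):
    keywords = list()
    for conf, word in conf_word_list:
        if conf < conf_thres:
            continue
        if len_thres is not None and len(word) < len_thres:
            continue
        keywords.append(word)
    return keywords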
def merge_events_2016():
    base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/'
    subs = fi.listchildren(base, fi.TYPE_FILE)
    twarr_list = []
    for sub in subs:
        twarr = fu.load_array(base + sub)
        # twarr = tu.twarr_ner(twarr)
        # twarr = ark.twarr_ark(twarr)
        twarr_list.append(twarr)
    fu.dump_array('/home/nfs/cdong/tw/seeding/Terrorist/queried/event2016.txt', twarr_list)

if __name__ == '__main__':
    # merge_events_2016()
    import utils.pattern_utils as pu
    base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/"
    files = fi.listchildren(base, fi.TYPE_FILE, concat=True)
    for file in files:
        twarr = fu.load_array(file)
        len_pre = len(twarr)
        for idx in range(len(twarr) - 1, -1, -1):
            text = twarr[idx][tk.key_text]
            if not pu.has_enough_alpha(text, 0.6):
                print(text)
                twarr.pop(idx)
        print(len_pre, '->', len(twarr), '\n\n')
        # fu.dump_array(file, twarr)