def make_train_test():
    p_file = ft_data_pattern.format("pos_2016.txt")
    n_bad_files = fi.listchildren(ft_data_pattern.format(''), fi.TYPE_FILE, concat=True, pattern='2016_bad')
    n_2017_files = fi.listchildren(ft_data_pattern.format(''), fi.TYPE_FILE, concat=True, pattern='2017')
    # n_2012_fulls = fi.listchildren(ft_data_pattern.format(''), fi.TYPE_FILE, concat=True, pattern='2012_full')[:12]
    n_2012_fulls = fi.listchildren(ft_data_pattern.format(''), fi.TYPE_FILE, concat=True, pattern='2012_full')
    n_2016_files = fi.listchildren(ft_data_pattern.format(''), fi.TYPE_FILE, concat=True, pattern='2016_queried')
    print(len(n_bad_files), len(n_2017_files), len(n_2012_fulls), len(n_2016_files))
    n_files = n_bad_files + n_2017_files + n_2012_fulls + n_2016_files
    p_txtarr = fu.read_lines(p_file)
    p_prefix_txtarr = prefix_textarr(label_t, p_txtarr)
    n_txtarr_blocks = [fu.read_lines(file) for file in n_files]
    n_prefix_txtarr_blocks = [prefix_textarr(label_f, txtarr) for txtarr in n_txtarr_blocks]
    train_test = list()
    # the leading blocks of n_prefix_txtarr_blocks come from the bad-text files;
    # they go into the training set as a whole, the rest are split 80/20
    bad = len(n_bad_files)
    bad_blocks, n_blocks = n_prefix_txtarr_blocks[:bad], n_prefix_txtarr_blocks[bad:]
    train_test.append(split_train_test(p_prefix_txtarr))
    train_test.extend([split_train_test(block) for block in n_blocks])
    print("len(train_test)", len(train_test))
    train_list, test_list = zip(*train_test)
    train_list = list(train_list) + bad_blocks
    train_txtarr = au.merge_array(train_list)
    test_txtarr = au.merge_array(test_list)
    fu.write_lines(fasttext_train, train_txtarr)
    fu.write_lines(fasttext_test, test_txtarr)
    print("len(train_list)", len(train_list),
          "len(train_txtarr)", len(train_txtarr),
          "len(test_txtarr)", len(test_txtarr))
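# A minimal sketch (not part of the pipeline) of how the files written by make_train_test
# could be consumed: each line is "<label> <text>", which is fastText's supervised input
# format. Assumes the official `fasttext` Python binding is installed and that
# label_t/label_f use fastText's default "__label__" prefix.
def train_and_eval_sketch():
    import fasttext  # assumption: pip package `fasttext`
    model = fasttext.train_supervised(input=fasttext_train)
    n, p_at_1, r_at_1 = model.test(fasttext_test)  # (sample count, precision@1, recall@1)
    print('samples: {}, P@1: {:.4f}, R@1: {:.4f}'.format(n, p_at_1, r_at_1))
    return model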
def get_semantic_tokens_multi(file_path):
    pos_type_info = {
        ark.prop_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_prop_dict_file},
        ark.comm_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_comm_dict_file},
        ark.verb_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_verb_dict_file},
        ark.hstg_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_hstg_dict_file},
    }
    total_doc_num = 0
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 40)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format([(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_semantic_tokens, [(file_list,) for file_list in file_list_block])
    # merge each worker's per-label IdFreqDict into the global one
    for res_type_info, doc_num in res_list:
        total_doc_num += doc_num
        for label in res_type_info.keys():
            pos_type_info[label][K_IFD].merge_freq_from(res_type_info[label][K_IFD])
    print('total_doc_num', total_doc_num)
    for label in pos_type_info.keys():
        ifd, file_name = pos_type_info[label][K_IFD], pos_type_info[label][K_FILE]
        ifd.drop_words_by_condition(3)
        if label != ark.hstg_label:
            # hashtags belong only in the hashtag dictionary
            ifd.drop_words_by_condition(lambda word, _: word.startswith('#'))
        ifd.dump_dict(file_name)
        print('{}; vocab size:{}'.format(file_name, ifd.vocabulary_size()))
def parse_query_list(from_path, into_path, query_list, n_process):
    from_path = fi.add_sep_if_needed(from_path)
    into_path = fi.add_sep_if_needed(into_path)
    all_sub_files = fi.listchildren(from_path, children_type=fi.TYPE_FILE, pattern='.sum$')
    tw_num_sum = 0
    for query in query_list:
        query = SeedQuery(*query)
        query_sub_files = [
            os.path.join(from_path, f) for f in all_sub_files
            if query.is_time_desired(tw_ymd=query.time_of_tweet(f, source='filename'))
        ]
        print('{} files from {} to {}'.format(
            len(query_sub_files),
            os.path.basename(query_sub_files[0]),
            os.path.basename(query_sub_files[-1]),
        ))
        twarr = query_from_files_multi(query_sub_files, query, n_process)
        tw_num_sum += len(twarr)
        file_name = query.to_string() + '.json'
        if len(twarr) > 20:
            print('file {} written\n'.format(file_name))
            fu.dump_array(os.path.join(into_path, file_name), twarr)
        else:
            print('twarr not long enough')
            for tw in twarr:
                print(tw[tk.key_text], '\n')
    print('total tweet number: {}'.format(tw_num_sum))
def exec_pre_test(test_data_path):
    # list with concat=True so that load_array below receives full paths
    subfiles = fi.listchildren(test_data_path, children_type=fi.TYPE_FILE, concat=True)
    # file_list = fu.split_multi_format(
    #     [(test_data_path + file) for file in subfiles if file.endswith('.json')], process_num=6)
    # twarr_blocks = fu.multi_process(fi.summary_unzipped_tweets_multi,
    #                                 [(file_list_slice,) for file_list_slice in file_list])
    twarr_blocks = filter_twarr([fu.load_array(file) for file in subfiles if file.endswith('.json')])
    twarr = au.merge_array(twarr_blocks)
    tu.start_ner_service(pool_size=16)
    tu.twarr_ner(twarr)
    tu.end_ner_service()
    all_ids = set(fu.load_array(test_data_path + 'test_ids_all.csv'))
    pos_ids = set(fu.load_array(test_data_path + 'test_ids_pos.csv'))
    non_pos_ids = all_ids.difference(pos_ids)
    pos_twarr = list()
    non_pos_twarr = list()
    for tw in twarr:
        twid = tw[tk.key_id]
        if twid in pos_ids:
            pos_twarr.append(tw)
        elif twid in non_pos_ids:
            non_pos_twarr.append(tw)
    fu.dump_array(getcfg().pos_data_file, pos_twarr)
    fu.dump_array(getcfg().non_pos_data_file, non_pos_twarr)
def main():
    bflt.start_pool(flt_pool_size, ne_threshold, clf_threshold)
    bclu.start_pool(hold_batch_num, batch_size, alpha, beta)
    # bext.start_pool(ext_pool_size)
    sub_files = fi.listchildren("/home/nfs/cdong/tw/origin/", fi.TYPE_FILE, concat=True)[-4000:]
    for _idx, _file in enumerate(sub_files):
        _twarr = fu.load_array(_file)
        print("1-- {} th twarr to filter, len: {}".format(_idx, len(_twarr)))
        twarr2filter(_twarr)
        # if _idx > 0 and (_idx + 1) % 1000 == 0:
        #     dt = tmu.check_time('if_idx>0and(_idx+1)%1000==0:', print_func=None)
        #     emu.send_email('notification', '{}/{} file, {}s from last 1000 file'.format(_idx+1, len(sub_files), dt))
        # if _idx % 50 == 0:
        #     tmu.check_time('_idx, _file', print_func=lambda dt: print("{} s from last 50".format(dt)))
        if _idx > 0 and _idx % 10 != 0:
            continue
        try_filter2cluster()
        # cluid_twarr_list = bclu.get_cluid_twarr_list()
        # print(len(cluid_twarr_list) if cluid_twarr_list else '--not ready')
        # if cluid_twarr_list:
        #     print(len(cluid_twarr_list))
    ensure_filter_workload()
def make_neg_event_bad_text_2016():
    files = fi.listchildren("/home/nfs/cdong/tw/origin/", fi.TYPE_FILE, concat=True)
    files_blocks = mu.split_multi_format(files, 4)
    output_file = neg_event_pattern.format("neg_2016_bad_text_{}.json")
    args_list = [(block, output_file.format(idx)) for idx, block in enumerate(files_blocks)]
    res_list = mu.multi_process(extract_bad_tweets_into, args_list)
    n_num_list, tw_num_list = zip(*res_list)
    total_n, total_tw = sum(n_num_list), sum(tw_num_list)
    print(n_num_list, tw_num_list, total_n, total_tw, round(total_n / total_tw, 6))
def rename_files_2016():
    sub_files = fi.listchildren(event_2016_pattern.format(''), fi.TYPE_FILE)
    for origin in sub_files:
        splits = origin.split('_')
        date = splits[0]
        date_splits = date.split('-')
        # zero-pad each date component to two digits, e.g. '1-5-2016' -> '01-05-2016'
        date_splits_ = ['{:0>2}'.format(s) for s in date_splits]
        date_ = '-'.join(date_splits_)
        splits[0] = date_
        name_ = '_'.join(splits)
        from_name = event_2016_pattern.format(origin)
        to_name = event_2016_pattern.format(name_)
        # os.rename replaces the former os.popen('mv ...') shell call: same effect,
        # without the quoting pitfalls of building a shell command from file names
        os.rename(from_name, to_name)
def get_tokens_multi(file_path):
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 20)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format([(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_tokens, [(file_list,) for file_list in file_list_block])
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for ifd, doc_num in res_list:
        total_doc_num += doc_num
        id_freq_dict.merge_freq_from(ifd)
    print('total_doc_num', total_doc_num, 'total vocabulary_size', id_freq_dict.vocabulary_size())
    id_freq_dict.drop_words_by_condition(3)
    id_freq_dict.dump_dict(getcfg().post_dict_file)
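# For reference, a minimal sketch of the worker contract that get_tokens_multi relies on:
# each worker receives a list of files and returns an (IdFreqDict, doc_num) pair. The real
# get_tokens is defined elsewhere in the repo; the whitespace tokenization and the
# `count_word` call below are illustrative assumptions only.
def get_tokens_sketch(file_list):
    ifd, doc_num = IdFreqDict(), 0
    for file in file_list:
        twarr = fu.load_array(file)
        doc_num += len(twarr)
        for tw in twarr:
            for token in tw[tk.key_text].lower().split():
                ifd.count_word(token)  # hypothetical increment method of IdFreqDict
    return ifd, doc_num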
def summary_files_in_path_into_blocks(from_path, into_path, file_name):
    from_path = fi.add_sep_if_needed(from_path)
    sub_files = fi.listchildren(from_path, children_type=fi.TYPE_FILE, pattern='.json$')
    into_file = fi.add_sep_if_needed(into_path) + file_name
    twarr_block = list()
    for idx, file in enumerate(sub_files):
        from_file = from_path + file
        twarr = fu.load_array_catch(from_file)
        if len(twarr) <= 0:
            continue
        twarr = tflt.filter_twarr(twarr, tflt.FILTER_LEVEL_HIGH)
        twarr_block.append(twarr)
    print(sorted([('id' + str(idx), len(twarr)) for idx, twarr in enumerate(twarr_block)], key=lambda x: x[1]))
    print('event number in total: {}'.format(len(twarr_block)))
    fu.dump_array(into_file, twarr_block)
def summary_files_in_path(from_path, into_path=None):
    """ Read all files under from_path, filter their tweets and merge them into one .sum file under into_path. """
    # of the parent path, [-13:] covers the hour, [-13:-3] the day, [-13:-5] the month;
    # "ymdh" is short for year-month-day-hour
    from_path = fi.add_sep_if_needed(from_path)
    file_ymdh_arr = pu.split_digit_arr(fi.get_parent_path(from_path)[-13:])
    if not is_target_ymdh(file_ymdh_arr):
        return
    into_file = '{}{}'.format(fi.add_sep_if_needed(into_path), '_'.join(file_ymdh_arr) + '.sum')
    fi.remove_file(into_file)
    subfiles = fi.listchildren(from_path, children_type=fi.TYPE_FILE)
    file_block = mu.split_multi_format([(from_path + subfile) for subfile in subfiles], process_num=20)
    twarr_blocks = mu.multi_process(sum_files, [(file_list, tflt.FILTER_LEVEL_LOW) for file_list in file_block])
    twarr = au.merge_array(twarr_blocks)
    if twarr:
        fu.dump_array(into_file, twarr, overwrite=True)
def main():
    """
    Start the worker pools, iterate over the file names in _sub_files and load each file;
    every loaded twarr is fed into the filter module, and results read back from the
    classifier are then fed into the clustering module. At a fixed interval a clustering
    command is sent to the clustering module, after which its output is read back and
    passed on to the cluster-information extraction module.
    :return:
    """
    tmu.check_time('qwertyui')
    tmu.check_time('main line 116', print_func=None)
    bflt.start_pool(flt_pool_size, ne_threshold, clf_threshold, event_type)
    bclu.start_pool(max_window_size, full_interval, alpha, beta)
    bext.start_pool(ext_pool_size, event_type)
    alarm = tmu.Alarm()
    # _sub_files = fi.listchildren("/home/nfs/cdong/tw/origin/", fi.TYPE_FILE, concat=True)[-4000:]
    _sub_files = fi.listchildren("/home/nfs/cdong/tw/seeding/Terrorist/queried/positive", fi.TYPE_FILE, concat=True)
    # _twarr = fu.load_array(_sub_files[0])
    # _twarr = fu.change_from_lxp_format(_twarr)
    for _idx, _file in enumerate(_sub_files):
        _twarr = fu.load_array(_file)
        if config.using_api_format == 'False':
            _twarr = fu.change_from_lxp_format(_twarr)
        if (_idx + 1) % 1000 == 0:
            dt = tmu.check_time('main line 116', print_func=None)
            emu.send_email('file {}/{}'.format(_idx + 1, len(_sub_files)), '{}s from last 1000 file'.format(dt))
        if _idx > 0 and _idx % 10 == 0:
            print("main: {} th twarr to filter, len: {}".format(_idx, len(_twarr)))
        print("{} th twarr to filter, len: {}".format(_idx, len(_twarr)))
        twarr2filter(_twarr)
        filter2cluster()
        if alarm.is_time_elapse_bigger_than(check_every_sec):
            alarm.initialize_timestamp()
            filter2cluster(5)
            bclu.execute_cluster()
            time.sleep(60)
            cluster2extractor()
            # time.sleep(300)
    end_it()
    tmu.check_time('qwertyui')
        cluster_file = fi.join(path, '{}_cluid:{}.json'.format(idx, cluid))
        fu.write_lines(cluster_file, [json_str])
        # textarr_file = fi.join(path, '{}_text.json'.format(idx))
        # textarr = [tw[tk.key_text] for tw in cic.twarr]
        # fu.write_lines(textarr_file, textarr)
    print(' bext: output into files over')


def cic_format(cic_list):
    cic_list = sorted(cic_list, key=lambda item: len(item.twarr), reverse=True)
    res = []
    print(' bext: output cic list, len={}'.format(len(cic_list)))
    for idx, cic in enumerate(cic_list):
        cic.twarr = ClusterInfoGetter.group_similar_tweets(cic.twarr, process_num=10)
        od = cic.construct_od()
        res.append(od)
    return res


if __name__ == '__main__':
    dir = "/home/nfs/yangl/merge/lxp_data"
    # dir2 = "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive"
    files = fi.listchildren(dir, concat=True)[:2]
    getter = ClusterInfoGetter(cci.event_t)
    for idx, file in enumerate(files):
        twarr = fu.load_array(file)
        twarr = fu.change_from_lxp_format(twarr)
        getter.cluid_twarr2cic(idx, twarr, 1)
def merge_events_2016():
    base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/'
    subs = fi.listchildren(base, fi.TYPE_FILE)
    twarr_list = []
    for sub in subs:
        twarr = fu.load_array(base + sub)
        # twarr = tu.twarr_ner(twarr)
        # twarr = ark.twarr_ark(twarr)
        twarr_list.append(twarr)
    fu.dump_array('/home/nfs/cdong/tw/seeding/Terrorist/queried/event2016.txt', twarr_list)


if __name__ == '__main__':
    # merge_events_2016()
    import utils.pattern_utils as pu
    base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/"
    files = fi.listchildren(base, fi.TYPE_FILE, concat=True)
    for file in files:
        twarr = fu.load_array(file)
        len_pre = len(twarr)
        for idx in range(len(twarr) - 1, -1, -1):
            text = twarr[idx][tk.key_text]
            if not pu.has_enough_alpha(text, 0.6):
                print(text)
                twarr.pop(idx)
        print(len_pre, '->', len(twarr), '\n\n')
        # fu.dump_array(file, twarr)
    # if earliest_time is None:
    #     earliest_time = now
    # if latest_time is None:
    #     latest_time = now
    return earliest_time, latest_time


# if __name__ == '__main__':
#     test_case = u'I need a desk for tomorrow from 2pm to 3pm'
#     print(json.dumps(sutime.parse(test_case), sort_keys=True, indent=4))
if __name__ == '__main__':
    import utils.function_utils as fu
    import utils.file_iterator as fi
    base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/positive'
    pos_files = fi.listchildren(base, concat=True)
    for file in pos_files:
        twarr = fu.load_array(file)
        text_times, utc_time = get_text_time(twarr)
        earliest_time, latest_time = get_earlist_latest_post_time(twarr)
        print(earliest_time.isoformat())
        print(latest_time.isoformat())
        print(utc_time.isoformat())
        table = PrettyTable(["inferred time", "tweet text", "time words", "tweet created_at", "utc_offset"])
        table.padding_width = 1
        for time in text_times:
            table.add_row(time)
        print(table)
    extract_sub_process.set_input(END_PROCESS)
    extract_sub_process.get_output()


def input_cluid_twarr_list(cluid_twarr_list):
    """
    Feed the sub-process the list of clusters it should handle, without waiting for a result.
    :param cluid_twarr_list: list whose elements are tuples, see
        clustering.gsdpmm.gsdpmm_stream_ifd_dynamic.GSDPMMStreamIFDDynamic#get_cluid_twarr_list
    :return:
    """
    if cluid_twarr_list:
        extract_sub_process.set_input(INPUT_LIST)
        extract_sub_process.set_input(cluid_twarr_list)


if __name__ == '__main__':
    import utils.timer_utils as tmu
    _base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive"
    files = fi.listchildren(_base, concat=True)
    _cluid_twarr_list = [(idx, fu.load_array(file)[:1000]) for idx, file in enumerate(files)]
    start_pool(10, 'terrorist_attack')
    tmu.check_time()
    for i in range(2):
        input_cluid_twarr_list(_cluid_twarr_list)
    end_pool()
    tmu.check_time()
    exit()
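# The sender above always writes a command token first (INPUT_LIST or END_PROCESS) and, for
# INPUT_LIST, the payload as a second message. A minimal sketch of the receiving loop under
# that assumption; `get_input` is a hypothetical counterpart of set_input, and the real
# consumer is the extractor sub-process, which is not shown here.
def extract_loop_sketch(sub_process):
    while True:
        command = sub_process.get_input()
        if command == END_PROCESS:
            break
        if command == INPUT_LIST:
            cluid_twarr_list = sub_process.get_input()
            for cluid, twarr in cluid_twarr_list:
                pass  # handle one cluster's tweets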
    plt.plot(X, Y, color="blue", linewidth=1)
    plt.xlim([-0.03, 1.03])
    plt.ylim([-0.03, 1.03])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.title("roc curve")
    plt.legend(loc='lower right')
    plt.savefig(fig_name, format='png')


if __name__ == '__main__':
    import re
    import numpy as np
    from pathlib import Path
    import utils.file_iterator as fi
    files = fi.listchildren('/home/nfs/cdong/tw/testdata/output2', children_type=fi.TYPE_DIR, concat=True)
    ls = list()
    for f in files:
        p = Path(f)
        s = p.stat()
        digits = re.findall(r'\d+', p.name)  # raw string avoids the invalid-escape warning
        cluid, clunum = list(map(int, digits))
        ls.append((cluid, clunum, s.st_mtime))
    ls = sorted(ls, key=lambda item: item[0])
    print(ls)
    # delta of modification time between consecutive clusters
    dt = [(ls[0][0], ls[0][1], 0)] + \
         [(ls[i][0], ls[i][1], int(ls[i][2] - ls[i - 1][2])) for i in range(1, len(ls))]
    print(dt)
    # x, y_cnum, y_dt = list(zip(*dt[:50]))
    x, y_cnum, y_dt = list(zip(*dt))
    outq.put([idx, len(twarr)])


def read2(idx, file, nothing='p'):
    twarr = fu.load_array(file)
    return [idx, len(twarr)]


if __name__ == '__main__':
    # dp = CustomDaemonPool()
    # dp = ProxyDaemonPool()
    # dp.set_parameters(read2, 8)
    # base = '/home/nfs/cdong/tw/testdata/yying/2016_04/'
    # files = [base + sub for sub in subs][:40]
    base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/'
    subs = fi.listchildren(base, children_type=fi.TYPE_FILE)
    files = [base + sub for sub in subs]
    tmu.check_time()
    res = multi_process_batch(read2, args_list=[(idx, file) for idx, file in enumerate(files)])
    # dp.set_batch_input([(idx, file) for idx, file in enumerate(files)],
    #                    [{'nothing': str(idx)+file} for idx, file in enumerate(files)])
    # res = dp.get_batch_output()
    # print(sum(([length for idx, length in res])))
    # print(res)
    tmu.check_time()
    print([[idx, len(fu.load_array(file))] for idx, file in enumerate(files)])
    tmu.check_time()
# print('total:', len(predict), 'filtered ratio:', table.loc["data"]['被过滤'] / len(predict))


def performance_analysis():
    label, proba = fu.load_array('label_proba')
    print(len(label), len(proba))
    au.precision_recall_threshold(label, proba)


if __name__ == '__main__':
    from calling.back_filter import filter_twarr_text
    # from classifying.terror.classifier_terror import file2label_text_array
    # textarr, labelarr = file2label_text_array("/home/nfs/cdong/tw/seeding/Terrorist/data/test")
    pos_base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive"
    neg_base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/negative"
    pos_files = fi.listchildren(pos_base, concat=True)
    neg_files = fi.listchildren(neg_base, concat=True, pattern='2012')
    base = "/home/nfs/yangl/event_detection/testdata/event2012/relevant"
    pos_files = fi.listchildren(base, concat=True)
    print(len(pos_files))
    print(sum([len(fu.read_lines(f)) for f in pos_files]))
    exit()
    my_filter = EffectCheck()
    pos_probarr, neg_probarr = list(), list()
    tmu.check_time()
    for file in neg_files:
        twarr = filter_twarr_text(fu.load_array(file))
        probarr = my_filter.predict_proba(twarr)
        neg_probarr.extend(probarr)
    tmu.check_time()
        text = pu.text_normalization(text)
        if pu.is_empty_string(text):
            continue
        textarr.append(text)
    return textarr


def split_train_test(array):
    split = int(len(array) * 0.8)
    return array[:split], array[split:]


pos_event_pattern = '/home/nfs/cdong/tw/seeding/Terrorist/queried/positive/{}'
neg_event_pattern = '/home/nfs/cdong/tw/seeding/Terrorist/queried/negative/{}'
pos_files = fi.listchildren(pos_event_pattern.format(''), fi.TYPE_FILE, concat=True)
neg_files = fi.listchildren(neg_event_pattern.format(''), fi.TYPE_FILE, concat=True)

""" -------- for fasttext -------- """
label_t, label_f = ftu.label_t, ftu.label_f
ft_data_pattern = "/home/nfs/cdong/tw/seeding/Terrorist/data/fasttext/{}"
fasttext_train = ft_data_pattern.format("train")
fasttext_test = ft_data_pattern.format("test")

neg_2012_full_pattern = "/home/nfs/cdong/tw/seeding/Terrorist/queried/negative/neg_2012_full_1/{}"
neg_2012_full_files = fi.listchildren(neg_2012_full_pattern.format(''), concat=True)