def main(args):
    """Query seed tweets out of the origin corpus into the positive directory.

    :param args: command-line arguments (currently unused by the body).
    :return: None
    """
    import utils.timer_utils as tmu
    source_dir = getcfg().origin_path
    target_dir = '/home/nfs/cdong/tw/seeding/Terrorist/queried/positive/'
    tmu.check_time()
    # run the query over the corpus with a pool of 15 worker processes
    parse_query_list(source_dir, target_dir, seed_queries, n_process=15)
    tmu.check_time()
    return
def main():
    """Run the seed-query extraction for the NaturalDisaster event type.

    :return: None
    """
    import utils.timer_utils as tmu
    origin_dir = '/home/nfs/cdong/tw/origin/'
    result_dir = '/home/nfs/cdong/tw/seeding/NaturalDisaster/positive'
    tmu.check_time()
    # run the query over the corpus with a pool of 20 worker processes
    parse_query_list(origin_dir, result_dir, seed_queries, process_num=20)
    tmu.check_time()
    return
def refilter_twarr(in_file, out_file, max_load=200000, threshold=0.2, max_dump=100000):
    """Re-filter a tweet-array file through the terror classifier.

    Loads at most ``max_load`` tweets from ``in_file``, keeps those the
    classifier accepts at probability threshold ``threshold``, and writes at
    most ``max_dump`` of the survivors to ``out_file``.  The previously
    hard-coded limits are now parameters with the original values as
    defaults, so existing callers are unaffected.

    :param in_file: path of the json-array file to read tweets from.
    :param out_file: path to write the filtered tweet array to.
    :param max_load: upper bound on the number of tweets read from in_file.
    :param threshold: probability threshold handed to the classifier filter.
    :param max_dump: upper bound on the number of tweets written to out_file.
    """
    twarr = fu.load_array(in_file)[:max_load]
    print(len(twarr))  # size before classification
    clf_filter = ClassifierTerror()
    tmu.check_time("refilter_twarr")
    twarr = clf_filter.filter(twarr, threshold)
    tmu.check_time("refilter_twarr")
    print(len(twarr))  # size after classification
    fu.dump_array(out_file, twarr[:max_dump])
def main():
    """
    Start the worker process pools, then repeatedly pull newly arrived tweets
    from the database and push each batch through the
    filter -> cluster -> extractor pipeline; every fixed interval a clustering
    command is sent to the clustering module and its results are handed to the
    cluster-information extraction module.

    (Translated from the original Chinese docstring.)
    :return:
    """
    tmu.check_time('qwertyui')
    tmu.check_time('main line 116', print_func=None)
    # spin up the three worker pools: filtering, clustering, extraction
    bflt.start_pool(flt_pool_size, ne_threshold, clf_threshold, event_type)
    bclu.start_pool(max_window_size, full_interval, alpha, beta)
    bext.start_pool(ext_pool_size, event_type)
    last_check_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('sleeping.......')
    time.sleep(time_interval)
    # initial batch: everything that arrived in the DB since last_check_time
    _twarr, new_time = dbu.read_after_last_check(dbu.tweet_db, dbu.lxp_dataset, last_check_time)
    _twarr = fu.change_from_lxp_format(_twarr)
    # NOTE(review): the indentation below was reconstructed from a
    # whitespace-mangled source -- confirm the block boundaries (especially
    # the placement of end_it()) against version control.
    while (True):
        alarm = tmu.Alarm()
        print('main: {}th twarr to filter. '.format(len(_twarr)))
        # _sub_files = fi.listchildren("/home/nfs/cdong/tw/origin/", fi.TYPE_FILE, concat=True)[-4000:]
        # positive_twarr = fu.load_array('/home/nfs/yangl/dc/calling/filtered_twarr.json')[:5000]
        # _sub_files = fi.listchildren("/home/nfs/yangl/dc/input", fi.TYPE_FILE, concat=True)
        # if (_idx + 1) % 1000 == 0:
        #     dt = tmu.check_time('main line 116', print_func=None)
        #     emu.send_email('file {}/{}'.format(_idx + 1, len(_sub_files)), '{}s from last 1000 file'.format(dt))
        # if _idx > 0 and _idx % 10 == 0:
        #     print("main: {} th twarr to filter, len: {}".format(_idx, len(_twarr)))
        tmu.check_time('start running')
        # push the batch into the filter pool, then forward filter output
        twarr2filter(_twarr)
        filter2cluster()
        # every check_every_sec seconds, trigger clustering and extraction
        if alarm.is_time_elapse_bigger_than(check_every_sec):
            alarm.initialize_timestamp()
            filter2cluster(5)
            bclu.execute_cluster()
            cluster2extractor()
            end_it()
        # sleep away whatever remains of the polling interval
        time_left = time_interval - tmu.check_time(
            'start running', print_func=lambda dt: print('time epoch {}s'.format(dt)))
        print('sleeping.......')
        time.sleep(time_left if time_left > 0 else 0)
        # fetch the next batch that arrived while we were processing/sleeping
        _twarr, new_time = dbu.read_after_last_check(dbu.tweet_db, dbu.lxp_dataset, new_time)
def main():
    """
    Start the worker pools, then poll the natural-disaster database every
    five minutes for newly arrived tweets and push each batch through the
    filter -> cluster -> extractor pipeline; every fixed interval a
    clustering command is sent to the clustering module and its results are
    handed to the extraction module.  Stops once a poll returns no new data,
    then shuts the pipeline down.

    (Translated from the original Chinese docstring.)
    :return:
    """
    tmu.check_time('qwertyui')
    tmu.check_time('main line 116', print_func=None)
    # spin up the three worker pools: filtering, clustering, extraction
    bflt.start_pool(flt_pool_size, ne_threshold, clf_threshold, event_type)
    bclu.start_pool(max_window_size, full_interval, alpha, beta)
    bext.start_pool(ext_pool_size, event_type)
    alarm = tmu.Alarm()
    # _sub_files = fi.listchildren("/home/nfs/cdong/tw/origin/", fi.TYPE_FILE, concat=True)[-4000:]
    # positive_twarr = fu.load_array('/home/nfs/yangl/dc/calling/filtered_twarr.json')[:5000]
    # _sub_files = fi.listchildren("/home/nfs/yangl/merge/lxp_data", fi.TYPE_FILE, concat=True)
    # _twarr = fu.load_array(_sub_files[0])
    # _twarr = fu.change_from_lxp_format(_twarr)
    last_check_time = datetime.datetime.now()
    count = 0  # running total of tweets pushed into the pipeline
    # NOTE(review): indentation reconstructed from a whitespace-mangled source.
    while True:
        time.sleep(5 * 60)  # poll the database every five minutes
        _twarr, new_time = dbu.read_after_last_check(dbu.nd_db, dbu.nd, last_check_time)
        print('*************len(_twarr){}**********'.format(len(_twarr)))
        if len(_twarr) == 0:
            # no new tweets arrived since the last check -> stop polling
            print('no new data arrive....')
            break
        if config.using_api_format == 'False':
            _twarr = fu.change_from_lxp_format(_twarr)
        # if (_idx + 1) % 1000 == 0:
        #     dt = tmu.check_time('main line 116', print_func=None)
        #     emu.send_email('file {}/{}'.format(_idx + 1, len(_sub_files)), '{}s from last 1000 file'.format(dt))
        # if _idx > 0 and _idx % 10 == 0:
        #     print("main: {} th twarr to filter, len: {}".format(_idx, len(_twarr)))
        print('main: {} tweets to filter'.format(len(_twarr)))
        count += len(_twarr)
        # push the batch into the filter pool, then forward filter output
        twarr2filter(_twarr)
        filter2cluster()
        # every check_every_sec seconds, trigger clustering and extraction
        if alarm.is_time_elapse_bigger_than(check_every_sec):
            alarm.initialize_timestamp()
            filter2cluster(5)
            bclu.execute_cluster()
            time.sleep(20)
            cluster2extractor()
        last_check_time = new_time
    # drain remaining work before shutting the pools down
    print('waiting for main process ending......')
    time.sleep(600)
    end_it()
    tmu.check_time('qwertyui')
def main():
    """
    Start the worker pools, iterate over the file names in _sub_files reading
    each file's content; for every file read, feed it to the filter & cluster
    modules and try to read classifier results back into the clustering
    module; every fixed interval send a clustering command to the clustering
    module, then read its results back and feed them to the
    cluster-information extraction module.

    (Translated from the original Chinese docstring.)
    :return:
    """
    tmu.check_time('qwertyui')
    tmu.check_time('main line 116', print_func=None)
    # spin up the three worker pools: filtering, clustering, extraction
    bflt.start_pool(flt_pool_size, ne_threshold, clf_threshold, event_type)
    bclu.start_pool(max_window_size, full_interval, alpha, beta)
    bext.start_pool(ext_pool_size, event_type)
    alarm = tmu.Alarm()
    # _sub_files = fi.listchildren("/home/nfs/cdong/tw/origin/", fi.TYPE_FILE, concat=True)[-4000:]
    _sub_files = fi.listchildren(
        "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive", fi.TYPE_FILE, concat=True)
    # _twarr = fu.load_array(_sub_files[0])
    # _twarr = fu.change_from_lxp_format(_twarr)
    # NOTE(review): indentation reconstructed from a whitespace-mangled source.
    for _idx, _file in enumerate(_sub_files):
        _twarr = fu.load_array(_file)
        if config.using_api_format == 'False':
            _twarr = fu.change_from_lxp_format(_twarr)
        # mail a progress report every 1000 files
        if (_idx + 1) % 1000 == 0:
            dt = tmu.check_time('main line 116', print_func=None)
            emu.send_email('file {}/{}'.format(_idx + 1, len(_sub_files)),
                           '{}s from last 1000 file'.format(dt))
        if _idx > 0 and _idx % 10 == 0:
            print("main: {} th twarr to filter, len: {}".format(
                _idx, len(_twarr)))
        print("{} th twarr to filter, len: {}".format(_idx, len(_twarr)))
        # push the batch into the filter pool, then forward filter output
        twarr2filter(_twarr)
        filter2cluster()
        # every check_every_sec seconds, trigger clustering and extraction
        if alarm.is_time_elapse_bigger_than(check_every_sec):
            alarm.initialize_timestamp()
            filter2cluster(5)
            bclu.execute_cluster()
            time.sleep(60)
            cluster2extractor()
    # time.sleep(300)
    end_it()
    tmu.check_time('qwertyui')
# Snapshot the embedding vectors of a set of terror-related keywords before
# and after training the spacy model, persist both matrices, and report the
# total squared change induced by training.
word_list = [
    'attack', 'bomb', 'bombing', 'kill', 'killed', 'explode', 'explosion',
    'terrorist', 'suicide'
]
_nlp = su.get_nlp()
vocab = _nlp.vocab
matrix_pre = np.array([vocab.get_vector(word) for word in word_list])
np.save('matrix_pre', matrix_pre)
tmu.check_time()
_nlp = train_spacy_model(_nlp)
tmu.check_time()
vocab = _nlp.vocab
matrix_post = np.array([vocab.get_vector(word) for word in word_list])
np.save('matrix_post', matrix_post)
# sum of element-wise squared differences between the two snapshots
print("diff:", np.sum(np.square(matrix_pre - matrix_post)))
    # NOTE(review): fragment -- this `return` is the tail of a function whose
    # definition lies above this excerpt; indentation is reconstructed.
    return featurearr, labelarr


def coef_of_lr_model(file):
    """Load a pickled logistic-regression model and print its coefficients.

    :param file: path of the joblib-serialized model file.
    """
    # NOTE(review): sklearn.externals.joblib is deprecated (removed in
    # scikit-learn >= 0.23); newer code should `import joblib` directly.
    from sklearn.externals import joblib
    lr_model = joblib.load(file)
    print(len(lr_model.coef_[0]))  # number of features the model was fit on
    print(lr_model.coef_)


if __name__ == "__main__":
    from classifying.terror.data_maker import fasttext_train, fasttext_test, ft_data_pattern
    # candidate model paths; the logistic-regression one is selected below
    ft_model = "/home/nfs/cdong/tw/src/models/classify/terror/ft_no_gpe_model"
    lr_model = "/home/nfs/cdong/tw/src/models/classify/terror/lr_no_gpe_model"
    clf_model = lr_model
    tmu.check_time('all')
    tmu.check_time()
    # coef_of_lr_model("/home/nfs/cdong/tw/src/models/classify/terror/lr_no_gpe_model")
    # clf_filter = ClassifierAddFeature(None, None)
    # for file in fi.listchildren("/home/nfs/cdong/tw/seeding/Terrorist/queried/positive", concat=True):
    #     twarr = fu.load_array(file)
    #     print(file, clf_filter.predict_mean_proba(twarr))
    #     tmu.check_time()
    # exit()
    batch_num = 20
    # rebuild the feature-matrix output directory from scratch
    fi.mkdir(ft_data_pattern.format('matrices_no_add'), remove_previous=True)
    train_mtx_ptn = ft_data_pattern.format(
        'matrices_no_add/train_feature_mtx_{}.npy')
    # NOTE(review): the excerpt is truncated mid-statement below.
    train_lbl_ptn = ft_data_pattern.format(
from calling.back_filter import filter_twarr_text # from classifying.terror.classifier_terror import file2label_text_array # textarr, labelarr = file2label_text_array("/home/nfs/cdong/tw/seeding/Terrorist/data/test") pos_base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive" neg_base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/negative" pos_files, neg_files = fi.listchildren(pos_base, concat=True), fi.listchildren(neg_base, concat=True, pattern='2012') base = "/home/nfs/yangl/event_detection/testdata/event2012/relevant" pos_files = fi.listchildren(base, concat=True) print(len(pos_files)) print(sum([len(fu.read_lines(f)) for f in pos_files])) exit() my_filter = EffectCheck() pos_probarr, neg_probarr = list(), list() tmu.check_time() for file in neg_files: twarr = filter_twarr_text(fu.load_array(file)) probarr = my_filter.predict_proba(twarr) neg_probarr.extend(probarr) tmu.check_time() for file in pos_files: probarr = my_filter.predict_proba(fu.load_array(file)) pos_probarr.extend(probarr) # post_twarr = list() # for idx in range(len(probarr)): # if probarr[idx] >= 0.35: # post_twarr.append(twarr[idx]) # else: # print(twarr[idx][tk.key_text])
# positive_twarr = fu.load_array('/home/nfs/yangl/dc/calling/filtered_twarr.json')[:5000] # _sub_files = fi.listchildren("/home/nfs/yangl/dc/input", fi.TYPE_FILE, concat=True) # if (_idx + 1) % 1000 == 0: # dt = tmu.check_time('main line 116', print_func=None) # emu.send_email('file {}/{}'.format(_idx + 1, len(_sub_files)), '{}s from last 1000 file'.format(dt)) # if _idx > 0 and _idx % 10 == 0: # print("main: {} th twarr to filter, len: {}".format(_idx, len(_twarr))) tmu.check_time('start running') twarr2filter(_twarr) filter2cluster() if alarm.is_time_elapse_bigger_than(check_every_sec): alarm.initialize_timestamp() filter2cluster(5) bclu.execute_cluster() cluster2extractor() end_it() time_left = time_interval - tmu.check_time( 'start running', print_func=lambda dt: print('time epoch {}s'.format(dt))) print('sleeping.......') time.sleep(time_left if time_left > 0 else 0) _twarr, new_time = dbu.read_after_last_check(dbu.tweet_db, dbu.lxp_dataset, new_time) # tmu.check_time('qwertyui') if __name__ == '__main__': tmu.check_time() main() tmu.check_time( print_func=lambda dt: print("total time elapsed {}s".format(dt)))