def copy_into_process_base(process_base):
    # copy the AutoPhrase scripts and resources into an isolated working directory
    if not fi.exists(process_base):
        fi.mkdir(process_base)
    os.system("cp %s %s" % (command, process_base))
    os.system("cp %s %s" % (fi.join(autophrase_base, 'phrasal_segmentation.sh'), process_base))
    os.system("cp -r %s %s" % (fi.join(autophrase_base, 'bin'), process_base))
    os.system("cp -r %s %s" % (fi.join(autophrase_base, 'data'), process_base))
    os.system("cp -r %s %s" % (fi.join(autophrase_base, 'tools'), process_base))
def set_base_path(self, base_path):
    self.base_path = fi.add_sep_if_needed(base_path)
    for path in [
        self.get_base_path(),
        self.get_theme_path(),
        self.get_queried_path(),
        self.get_param_path(),
    ]:
        fi.mkdir(path)
def write_cic_list(path, cic_list):
    """
    Call construct_od on each element of cic_list to obtain an OrderedDict,
    and persist every OrderedDict to a file under the given path.
    :param path: str, output directory
    :param cic_list: list, every element is a ClusterInfoCarrier
    :return:
    """
    fi.mkdir(path, remove_previous=True)
    cic_list = sorted(cic_list, key=lambda item: len(item.twarr), reverse=True)
    print(' bext: output cic list, len={}'.format(len(cic_list)))
    for idx, cic in enumerate(cic_list):
        cluid = cic.cluid
        cic.twarr = ClusterInfoGetter.group_similar_tweets(cic.twarr, process_num=10)
        od = cic.construct_od()
        json_str = fu.dumps(od)
        cluster_file = fi.join(path, '{}_cluid:{}.json'.format(idx, cluid))
        fu.write_lines(cluster_file, [json_str])
        # textarr_file = fi.join(path, '{}_text.json'.format(idx))
        # textarr = [tw[tk.key_text] for tw in cic.twarr]
        # fu.write_lines(textarr_file, textarr)
    print(' bext: output into files over')
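
# Hypothetical reader for the files produced above (a sketch, not part of the repo):
# each cluster file written by write_cic_list holds a single JSON line, so loading
# it back only needs the first line of every '*.json' child of the output path.
import json

def load_cic_od_list(path):
    od_list = []
    for file in fi.listchildren(path, concat=True):
        if file.endswith('.json'):
            with open(file) as fp:
                od_list.append(json.loads(fp.readline()))
    return od_list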
    ft_model = "/home/nfs/cdong/tw/src/models/classify/terror/ft_no_gpe_model"
    lr_model = "/home/nfs/cdong/tw/src/models/classify/terror/lr_no_gpe_model"
    clf_model = lr_model
    tmu.check_time('all')
    tmu.check_time()
    # coef_of_lr_model("/home/nfs/cdong/tw/src/models/classify/terror/lr_no_gpe_model")
    # clf_filter = ClassifierAddFeature(None, None)
    # for file in fi.listchildren("/home/nfs/cdong/tw/seeding/Terrorist/queried/positive", concat=True):
    #     twarr = fu.load_array(file)
    #     print(file, clf_filter.predict_mean_proba(twarr))
    # tmu.check_time()
    # exit()

    batch_num = 20
    fi.mkdir(ft_data_pattern.format('matrices_no_add'), remove_previous=True)
    train_mtx_ptn = ft_data_pattern.format('matrices_no_add/train_feature_mtx_{}.npy')
    train_lbl_ptn = ft_data_pattern.format('matrices_no_add/train_lblarr_mtx_{}.npy')
    train_file_list = [(train_mtx_ptn.format(idx), train_lbl_ptn.format(idx)) for idx in range(batch_num)]
    # _clf = ClassifierAddFeature(None, None)
    # _ft_args = dict(epoch=150, lr=1.5, wordNgrams=2, verbose=2, minCount=2, thread=20, dim=300)
    # tmu.check_time()
    # _clf.train_ft(fasttext_train, _ft_args, ft_model)
    # tmu.check_time(print_func=lambda dt: print('train ft time: {}s'.format(dt)))
    # generate_train_matrices(ft_model, fasttext_train, train_file_list)
    # tmu.check_time()
import os
import shutil
from subprocess import Popen, PIPE

from config.configure import getcfg
import utils.function_utils as fu
import utils.array_utils as au
import utils.file_iterator as fi
import utils.multiprocess_utils as mu
import utils.tweet_keys as tk
import utils.timer_utils as tmu


autophrase_base = getcfg().autophrase_path
# keep every AutoPhrase output confined to somewhere under autophrase_output_base
autophrase_output_base = fi.join(autophrase_base, "OUTPUTS/")
command = fi.join(autophrase_base, "auto_phrase.sh")
fi.mkdir(autophrase_output_base)


def autophrase(input_text_file, output_path, commander, process_base, min_sup):
    # launch the AutoPhrase shell script and answer its prompts through stdin
    p = Popen(commander, shell=True, bufsize=1, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=process_base)
    p.stdin.flush()
    p.stdin.write((input_text_file + '\n').encode("utf8"))
    p.stdin.write((output_path + '\n').encode("utf8"))
    p.stdin.write((min_sup + '\n').encode("utf8"))
    p.stdin.flush()
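
# Minimal usage sketch (an assumption, not code from the repo): prepare an isolated
# working copy of AutoPhrase for one worker, then run the segmentation script on a
# corpus file. All concrete paths and the min_sup value below are illustrative.
def _autophrase_usage_sketch():
    process_base = fi.join(autophrase_output_base, 'worker_0/')
    copy_into_process_base(process_base)
    commander = fi.join(process_base, 'auto_phrase.sh')
    autophrase(
        input_text_file='/path/to/input_corpus.txt',                 # hypothetical corpus file
        output_path=fi.join(autophrase_output_base, 'worker_0_result/'),
        commander=commander,
        process_base=process_base,
        min_sup='10',                                                 # sent as a text line to the script
    )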
            return
        print('new list generated, hbn={}, rbn={}'.format(hbn, rbn))
        cluid_twarr_list = self.gsdpmm.get_cluid_twarr_list(twnum_thres)
        if cluid_twarr_list:
            out_channel.put(cluid_twarr_list)


if __name__ == '__main__':
    import utils.tweet_keys as tk
    import utils.array_utils as au
    import utils.pattern_utils as pu
    import utils.timer_utils as tmu
    import calling.back_extractor as bext
    import utils.file_iterator as fi
    import utils.function_utils as fu

    fi.mkdir('/home/nfs/cdong/tw/src/calling/tmp', remove_previous=True)
    tmu.check_time()
    _hold_batch_num = 100
    _batch_size = 100
    _alpha, _beta = 30, 0.01
    # _alpha, _beta = 50, 0.005
    _file = "./filtered_twarr.json"
    _twarr = fu.load_array(_file)[:10200]
    start_pool(_hold_batch_num, _batch_size, _alpha, _beta)
    input_twarr_batch(_twarr)
    print('---> waiting for _cluid_cluster_list')
    while True:
        _cluid_cluster_list = cluster_daemon.outq2.get()
        print(' - some thing returned, type :{}'.format(
class ExtractSubProcess(DaemonProcess):
    def start(self, func):
        self.process = mp.Process(target=func, args=(self.inq, self.outq))
        self.process.daemon = False
        self.process.start()
        # print('ExtractSubProcess', self.process.pid)


extract_sub_process = ExtractSubProcess()

# message codes exchanged between the main process and the extraction subprocess
END_PROCESS = -1
SET_PARAMS = 0
INPUT_LIST = 1
OUTPUT_LIST = 3

OUT_BASE = getcfg().output_path
fi.mkdir(OUT_BASE, remove_previous=True)


def extract_sub_main(inq, outq):
    """
    Main function of the cluster-information extraction subprocess. This process
    owns a group of child processes that perform the actual work: it reads input
    from the main process, calls the child-process pool to run one round of
    location extraction, merges clusters by location according to the returned
    results, then calls the pool again to fully process the merged clusters and
    writes that final result to file.
    :param inq: mp.Queue, input queue from the main process to this subprocess
    :param outq: mp.Queue, output queue from this subprocess to the main process
    :return:
    """
    pool_size, event_type = inq.get()
    extract_pool = CustomDaemonPool()
    extract_pool.start(extract_pool_main, pool_size)
    extract_pool.set_batch_input([(pidx, event_type)
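
# Assumed usage sketch (illustrative values, not the repo's actual calling code):
# extract_sub_main first reads a (pool_size, event_type) tuple from its input
# queue, so the parent is expected to start the subprocess and push that tuple.
def _extract_sub_process_usage_sketch():
    extract_sub_process.start(extract_sub_main)
    extract_sub_process.inq.put((8, 'terror'))   # pool_size, event_type
    # later messages would use the codes defined above, e.g. (INPUT_LIST, payload),
    # and results are read back from extract_sub_process.outq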
        mtx_list.append(np.load(mtx_file))
        lbl_list.append(np.load(lbl_file))
    featurearr = np.concatenate(mtx_list, axis=0)
    labelarr = np.concatenate(lbl_list, axis=0)
    return featurearr, labelarr


if __name__ == "__main__":
    from classifying.natural_disaster.nd_data_maker import nd_train, nd_test, nd_data_pattern

    nd_ft_model = "/home/nfs/cdong/tw/src/models/classify/natural_disaster/ft_model"
    nd_lr_model = "/home/nfs/cdong/tw/src/models/classify/natural_disaster/lr_model"
    tmu.check_time('all')
    tmu.check_time()

    batch_num = 20
    fi.mkdir(nd_data_pattern.format('matrices'))
    nd_train_mtx = nd_data_pattern.format('matrices/train_feature_mtx_{}.npy')
    nd_train_lbl = nd_data_pattern.format('matrices/train_lblarr_mtx_{}.npy')
    nd_train_mtx_files = [(nd_train_mtx.format(idx), nd_train_lbl.format(idx)) for idx in range(batch_num)]

    _clf = ClassifierNaturalDisaster(None, None)
    # _ft_args = dict(epoch=150, lr=1.5, wordNgrams=2, verbose=2, minCount=2, thread=20, dim=250)
    # _clf.train_ft(nd_train, _ft_args, nd_ft_model)
    # tmu.check_time(print_func=lambda dt: print('train ft time: {}s'.format(dt)))
    generate_train_matrices(nd_ft_model, nd_train, nd_train_mtx_files)
    tmu.check_time()
    _featurearr, _labelarr = recover_train_matrix(nd_train_mtx_files)
    tmu.check_time()
    _lr_args = dict(n_jobs=20,