def parse_query_list(from_path, into_path, query_list, process_num):
    all_sub_files = list(fi.listchildren(from_path, children_type=fi.TYPE_FILE, pattern='.sum$'))
    tw_num_sum = 0
    for query in query_list:
        query = SeedQuery(*query)
        query_sub_files = [fi.join(from_path, f) for f in all_sub_files
                           if query.is_time_desired(tw_ymd=query.time_of_tweet(f, source='filename'))]
        print('{} files from {} to {}'.format(
            len(query_sub_files),
            query_sub_files[0][query_sub_files[0].rfind('/') + 1:],
            query_sub_files[-1][query_sub_files[-1].rfind('/') + 1:],
        ))
        twarr = query_from_files_multi(query_sub_files, query, process_num)
        tw_num_sum += len(twarr)
        file_name = query.to_string() + '.json'
        if len(twarr) > 20:
            print('file {} written\n'.format(file_name))
            fu.dump_array(fi.join(into_path, file_name), twarr)
        else:
            print('twarr not long enough')
            for tw in twarr:
                print(tw[tk.key_text], '\n')
    print('total tweet number: {}'.format(tw_num_sum))
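# A hypothetical invocation of parse_query_list. The shape of each tuple in query_list must match
# whatever SeedQuery.__init__ expects; the keyword/date layout below is an illustrative assumption,
# not the project's actual query format, and the paths are placeholders.
def _example_parse_query_list():
    queries = [
        (['earthquake', 'aftershock'], ['2016', '11', '13'], ['2016', '11', '20']),  # hypothetical tuple
    ]
    parse_query_list(
        from_path='/data/tweets/summaries/',   # directory containing the *.sum files (placeholder)
        into_path='/data/tweets/queried/',     # directory receiving <query>.json outputs (placeholder)
        query_list=queries,
        process_num=8,
    )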
def copy_into_process_base(process_base):
    if not fi.exists(process_base):
        fi.mkdir(process_base)
    os.system("cp %s %s" % (command, process_base))
    os.system("cp %s %s" % (fi.join(autophrase_base, 'phrasal_segmentation.sh'), process_base))
    os.system("cp -r %s %s" % (fi.join(autophrase_base, 'bin'), process_base))
    os.system("cp -r %s %s" % (fi.join(autophrase_base, 'data'), process_base))
    os.system("cp -r %s %s" % (fi.join(autophrase_base, 'tools'), process_base))
def autophrase_wrapper(process_code, textarr):
    # process_code identifies the working directory owned by this process; textarr is a list of texts
    process_base = fi.join(autophrase_output_base, str(process_code))
    copy_into_process_base(process_base)
    commander = fi.join(process_base, "auto_phrase.sh")
    input_text_file = fi.join(process_base, "raw_train.txt")
    output_keyword_file = fi.join(process_base, "AutoPhrase.txt")
    # Write the text list to a file, then run AutoPhrase
    fu.write_lines(input_text_file, textarr)
    min_sup = determine_min_sup(len(textarr))
    autophrase(input_text_file, process_base, commander, process_base, min_sup)
    # Read the AutoPhrase result; each line is a confidence score followed by the phrase
    lines = fu.read_lines(output_keyword_file)
    conf_word_list = list()
    for line in lines:
        conf, word = line.split(maxsplit=1)
        conf_word_list.append((float(conf), word))
    # fi.rmtree(os.path.join(process_base, 'tmp'))
    return conf_word_list
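# determine_min_sup is called above but not defined in this section; it presumably maps the corpus
# size to AutoPhrase's minimum-support threshold. A minimal sketch under that assumption; the
# cut-off values below are illustrative, not the project's actual ones.
def determine_min_sup(text_num):
    # Larger corpora tolerate a higher minimum support before a phrase is kept.
    if text_num < 1000:
        return 3
    elif text_num < 10000:
        return 10
    else:
        return 30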
def extract_sub_main(inq, outq):
    """
    Main function of the subprocess of the cluster-information-extraction module. This process in turn
    owns a pool of worker subprocesses that do the actual work: it reads input from the main process,
    dispatches one round of location extraction to the worker pool, merges clusters that share a
    location according to the returned results, dispatches the merged result to the worker pool again
    for full processing, and writes that final result to files.
    :param inq: mp.Queue, input queue from the main process to this subprocess
    :param outq: mp.Queue, output queue from this subprocess back to the main process
    :return:
    """
    pool_size, event_type = inq.get()
    extract_pool = CustomDaemonPool()
    extract_pool.start(extract_pool_main, pool_size)
    extract_pool.set_batch_input([(pidx, event_type) for pidx in range(pool_size)])
    extract_pool.get_batch_output()
    counter = 0
    while True:
        command = inq.get()
        if command == INPUT_LIST:
            cluid_twarr_list = inq.get()
            print(' bext: remain task={}, len(cluid_twarr_list)={}'.format(
                inq.qsize(), len(cluid_twarr_list)))
            if inq.qsize() > 6:
                print(' bext: too many tasks undone, skipping this batch')
                continue
            # tmu.check_time('extract_sub_main', print_func=lambda dt: print('line 41', dt))
            # print(" bext: get new cluid_twarr_list, len {}".format(len(cluid_twarr_list)))
            extract_pool.set_batch_input(cluid_twarr_list)
            cic_list = extract_pool.get_batch_output()
            # tmu.check_time('extract_sub_main', print_func=lambda dt: print('line 43', dt))
            new_cluid_twarr_list = merge_cic_list2cluid_twarr_list(cic_list)
            new_cluid_twarr_list = [(cluid, twarr, CIG.TAR_FULL)
                                    for cluid, twarr in new_cluid_twarr_list]
            extract_pool.set_batch_input(new_cluid_twarr_list)
            n_cic_list = extract_pool.get_batch_output()
            # format = cic_format(n_cic_list)
            # tmu.check_time('extract_sub_main', print_func=lambda dt: print('line 49', dt))
            # print(" bext: get merged cic list, len {}".format(len(n_cic_list)))
            # collection = '{}_{}clusters'.format(counter, len(n_cic_list))
            # dbu.insert_many(dbu.clu_db, collection, format)
            write_cic_list(
                fi.join(OUT_BASE, "{}_{}clusters/".format(counter, len(n_cic_list))),
                n_cic_list)
            counter += 1
        elif command == END_PROCESS:
            print('ending extract_sub_main')
            outq.put('ending extract_sub_main')
            return
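# For orientation, the control protocol implied by extract_sub_main is: the main process first sends
# (pool_size, event_type), then repeatedly sends the INPUT_LIST marker followed by a cluid_twarr_list,
# and finally END_PROCESS. A minimal driver sketch under that reading; it assumes INPUT_LIST and
# END_PROCESS are the module-level constants used above and that the queues are multiprocessing.Queue.
import multiprocessing as mp

def _example_drive_extract_sub_main(cluid_twarr_batches, event_type, pool_size=8):
    inq, outq = mp.Queue(), mp.Queue()
    worker = mp.Process(target=extract_sub_main, args=(inq, outq))
    worker.start()
    inq.put((pool_size, event_type))    # first message: worker-pool configuration
    for batch in cluid_twarr_batches:
        inq.put(INPUT_LIST)             # marker announcing a new batch
        inq.put(batch)                  # the cluid_twarr_list itself
    inq.put(END_PROCESS)                # ask the subprocess to shut down
    print(outq.get())                   # expects 'ending extract_sub_main'
    worker.join()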
def write_cic_list(path, cic_list):
    """
    Call construct_od on each element of cic_list to obtain an OrderedDict,
    and persist each OrderedDict to a file under the given path.
    :param path: str, output path
    :param cic_list: list, each element is a ClusterInfoCarrier
    :return:
    """
    fi.mkdir(path, remove_previous=True)
    cic_list = sorted(cic_list, key=lambda item: len(item.twarr), reverse=True)
    print(' bext: output cic list, len={}'.format(len(cic_list)))
    for idx, cic in enumerate(cic_list):
        cluid = cic.cluid
        cic.twarr = ClusterInfoGetter.group_similar_tweets(cic.twarr, process_num=10)
        od = cic.construct_od()
        json_str = fu.dumps(od)
        cluster_file = fi.join(path, '{}_cluid:{}.json'.format(idx, cluid))
        fu.write_lines(cluster_file, [json_str])
        # textarr_file = fi.join(path, '{}_text.json'.format(idx))
        # textarr = [tw[tk.key_text] for tw in cic.twarr]
        # fu.write_lines(textarr_file, textarr)
    print(' bext: finished writing cluster files')
import os
import shutil
from subprocess import Popen, PIPE

from config.configure import getcfg
import utils.function_utils as fu
import utils.array_utils as au
import utils.file_iterator as fi
import utils.multiprocess_utils as mu
import utils.tweet_keys as tk
import utils.timer_utils as tmu

autophrase_base = getcfg().autophrase_path
# Ensure all AutoPhrase output stays confined to somewhere under autophrase_output_base
autophrase_output_base = fi.join(autophrase_base, "OUTPUTS/")
command = fi.join(autophrase_base, "auto_phrase.sh")
fi.mkdir(autophrase_output_base)


def autophrase(input_text_file, output_path, commander, process_base, min_sup):
    # Feed the three parameters that auto_phrase.sh reads from stdin: input file, output path, min support
    p = Popen(commander, shell=True, bufsize=1, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=process_base)
    p.stdin.write((input_text_file + '\n').encode("utf8"))
    p.stdin.write((output_path + '\n').encode("utf8"))
    p.stdin.write((str(min_sup) + '\n').encode("utf8"))
    p.stdin.flush()
    # Close stdin and wait for the script to exit so the caller can read its output files
    p.communicate()
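# A usage sketch of autophrase_wrapper, assuming a working AutoPhrase installation at the configured
# autophrase_path; the sample texts and the process_code value are illustrative only.
if __name__ == '__main__':
    sample_texts = [
        'Earthquake of magnitude 6.1 hits the northern coast this morning',
        'Rescue teams arrive after the earthquake near the northern coast',
    ]
    # Each concurrent caller should pass a distinct process_code so the per-process
    # working directories under autophrase_output_base do not collide.
    for conf, phrase in sorted(autophrase_wrapper(process_code=0, textarr=sample_texts), reverse=True):
        print('{:.4f}\t{}'.format(conf, phrase))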
def get_param_path(self):
    return fi.join(self.get_theme_path(), 'params')

def get_queried_path(self):
    return fi.join(self.get_theme_path(), 'queried')

def get_theme_path(self):
    return fi.join(self.get_base_path(), self.theme)
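# The three getters above imply a per-theme directory layout rooted at whatever get_base_path()
# returns (not shown in this section); a sketch of the resolved structure:
#
#   <base_path>/              get_base_path()
#       <theme>/              get_theme_path()
#           params/           get_param_path()
#           queried/          get_queried_path()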