def get_semantic_tokens_multi(file_path):
    # one IdFreqDict and one output dictionary file per POS label
    pos_type_info = {
        ark.prop_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_prop_dict_file},
        ark.comm_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_comm_dict_file},
        ark.verb_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_verb_dict_file},
        ark.hstg_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_hstg_dict_file},
    }
    total_doc_num = 0
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 40)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format([(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_semantic_tokens, [(file_list,) for file_list in file_list_block])
    # merge the per-process, per-label frequency dicts into pos_type_info
    for res_type_info, doc_num in res_list:
        total_doc_num += doc_num
        for label in res_type_info.keys():
            pos_type_info[label][K_IFD].merge_freq_from(res_type_info[label][K_IFD])
    print('total_doc_num', total_doc_num)
    for label in pos_type_info.keys():
        ifd, file_name = pos_type_info[label][K_IFD], pos_type_info[label][K_FILE]
        # drop rare words; drop hashtag-like words from every dictionary except the hashtag one
        ifd.drop_words_by_condition(3)
        if label != ark.hstg_label:
            ifd.drop_words_by_condition(lambda word, _: word.startswith('#'))
        ifd.dump_dict(file_name)
        print('{}; vocab size:{}'.format(file_name, ifd.vocabulary_size()))
def tsne_data_multi(using):
    assert using in {'avg', 'tfidf'}
    kw_arg = dict(early_exaggeration=12, n_iter=800, n_iter_without_progress=100)
    args_list = [(data_class, using, kw_arg) for data_class in object_list]
    tsne_list = mu.multi_process(tsne_data, args_list)
    print(len(tsne_list), len(tsne_list[0]))
    np.save('tsne_{}.npy'.format(using), np.array(tsne_list, dtype=object))
def _load_word2vec_from_glovec_file(glovec_file, process_num):
    lines_parts = mu.split_multi(iu.read_lines(glovec_file), process_num)
    print('lines read & partition over')
    word2vec_parts = mu.multi_process(_word2vec_from_lines, [(lines,) for lines in lines_parts])
    word2vec = dict()
    for word2vec_part in word2vec_parts:
        word2vec.update(word2vec_part)
    return word2vec
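# Hypothetical usage sketch (not from the original code): the file name below is a placeholder
# for any GloVe-style text file with one "word v1 v2 ..." entry per line.
# word2vec = _load_word2vec_from_glovec_file('glove.840B.300d.txt', process_num=8)
# print('loaded vectors for {} words'.format(len(word2vec)))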
def query_from_files_multi(file_list, query, n_process=10):
    """ As there may be many files, handle them across multiple processes. """
    file_blocks = mu.split_multi_format(file_list, n_process)
    res_list = mu.multi_process(query_from_files, args_list=[(block, query) for block in file_blocks])
    twarr = au.merge_array(res_list)
    print('len(res_list):{}, len(twarr):{}'.format(len(res_list), len(twarr)), end=', ')
    return twarr
def clustering_multi(func, params, process_num=20):
    param_num = len(params)
    res_list = list()
    # process the parameter list in batches of process_num to bound the number of concurrent workers
    for i in range(int(math.ceil(param_num / process_num))):
        res_list += mu.multi_process(func, params[i * process_num:(i + 1) * process_num])
        print('{:<4} / {} params processed'.format(min((i + 1) * process_num, param_num), param_num))
    if len(res_list) != len(params):
        raise ValueError('Error occurred in clustering')
    return res_list
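# Hypothetical usage sketch (not from the original code): each element of `params` is one
# argument tuple for `func`; `run_clustering` and the candidate cluster counts are placeholders.
# params = [(k,) for k in range(5, 55, 5)]
# results = clustering_multi(run_clustering, params, process_num=4)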
def pairwise_score_multi(array, process_num, pair_func):
    total = len(array)
    # enumerate all index pairs (i, j) with i < j
    pairs = [(i, j) for i in range(total - 1) for j in range(i + 1, total)]
    if process_num <= 1 or total < 40:
        # small inputs are cheaper to score in the current process
        idx_pair_score_list = pairwise_score(array, pairs, pair_func)
    else:
        pair_blocks = mu.split_multi_format(pairs, process_num)
        arg_list = [(array, idx_pairs, pair_func) for idx_pairs in pair_blocks]
        score_pairs_blocks = mu.multi_process(pairwise_score, arg_list)
        idx_pair_score_list = merge_array(score_pairs_blocks)
    return idx_pair_score_list
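# Hypothetical usage sketch (not from the original code): a toy absolute-difference pair_func
# over a short list; with process_num=1 the serial branch above is taken.
# values = [0.1, 0.4, 0.8, 1.5]
# idx_pair_scores = pairwise_score_multi(values, process_num=1, pair_func=lambda a, b: abs(a - b))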
def get_tokens_multi(file_path):
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 20)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format([(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_tokens, [(file_list,) for file_list in file_list_block])
    # merge the per-process frequency dicts and document counts
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for ifd, doc_num in res_list:
        total_doc_num += doc_num
        id_freq_dict.merge_freq_from(ifd)
    print('total_doc_num', total_doc_num, 'total vocabulary_size', id_freq_dict.vocabulary_size())
    id_freq_dict.drop_words_by_condition(3)
    id_freq_dict.dump_dict(getcfg().post_dict_file)
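# Hypothetical usage sketch (not from the original code): the directory is a placeholder;
# the merged dictionary is written to the path configured as getcfg().post_dict_file.
# get_tokens_multi('/path/to/tweet/files/')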
def make_neg_event_bad_text_2016():
    files = fi.listchildren("/home/nfs/cdong/tw/origin/", fi.TYPE_FILE, concat=True)
    files_blocks = mu.split_multi_format(files, 4)
    output_file = neg_event_pattern.format("neg_2016_bad_text_{}.json")
    args_list = [(block, output_file.format(idx)) for idx, block in enumerate(files_blocks)]
    res_list = mu.multi_process(extract_bad_tweets_into, args_list)
    # each worker returns a pair of counts; report per-worker counts, their totals, and the ratio
    n_num_list, tw_num_list = zip(*res_list)
    total_n, total_tw = sum(n_num_list), sum(tw_num_list)
    print(n_num_list, tw_num_list, total_n, total_tw, round(total_n / total_tw, 6))
def twarr_dist_pairs_multi(twarr):
    for tw in twarr:
        tw['nouse'] = tw['text'].lower()
    total = len(twarr) - 1
    process_num = 16
    # assign indices to processes round-robin: process i handles i, i + process_num, i + 2 * process_num, ...
    point_lists = [[i + process_num * j for j in range(int(total / process_num) + 1)
                    if (i + process_num * j) < total] for i in range(process_num)]
    pairs_blocks = multi_process(dist_pairs, [(twarr, point) for point in point_lists])
    for tw in twarr:
        del tw['nouse']
    return merge_array(pairs_blocks)
def tsne_multi():
    class_list = [DataTREC, DataGoogle, DataEvent, DataReuters, Data20ng]
    doc_avg_list = mu.multi_process(train_embedding, [[c] for c in class_list])
    for m in doc_avg_list:
        print(m.shape)
    kw_arg = dict(early_exaggeration=12, n_iter=800, n_iter_without_progress=100)
    tsne_point_list = ru.fit_multi(ru.fit_tsne, doc_avg_list, [kw_arg] * len(doc_avg_list))
    for m in tsne_point_list:
        print(m.shape)
    name_point_list = list(zip([c.name for c in class_list], tsne_point_list))
    return name_point_list
def summary_files_in_path(from_path, into_path=None):
    """ Read all .json files under from_path and extract tweets from them into a file under into_path. """
    # [-13:] -> hour, [-13:-3] -> day, [-13:-5] -> month; "ymdh" is short for "year-month-date-hour"
    from_path = fi.add_sep_if_needed(from_path)
    file_ymdh_arr = pu.split_digit_arr(fi.get_parent_path(from_path)[-13:])
    if not is_target_ymdh(file_ymdh_arr):
        return
    into_file = '{}{}'.format(fi.add_sep_if_needed(into_path), '_'.join(file_ymdh_arr) + '.sum')
    fi.remove_file(into_file)
    subfiles = fi.listchildren(from_path, children_type=fi.TYPE_FILE)
    file_block = mu.split_multi_format([(from_path + subfile) for subfile in subfiles], process_num=20)
    twarr_blocks = mu.multi_process(sum_files, [(file_list, tflt.FILTER_LEVEL_LOW) for file_list in file_block])
    twarr = au.merge_array(twarr_blocks)
    if twarr:
        fu.dump_array(into_file, twarr, overwrite=True)
def fit_multi(func, xs, kwargs_list=None):
    import utils.multiprocess_utils as mu
    res_list = mu.multi_process(func, [(x,) for x in xs], kwargs_list)
    return res_list
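# The *_multi helpers above all follow the same fan-out pattern: split the work, run a function
# once per chunk in worker processes, then merge the results. The following is a minimal sketch
# of what a helper like mu.multi_process could look like, built on the standard
# multiprocessing.Pool; it is an illustration under that assumption, not the project's actual
# implementation.
from multiprocessing import Pool


def multi_process_sketch(func, args_list, kwargs_list=None):
    """Run func once per argument tuple in a worker pool; return results in submission order."""
    kwargs_list = kwargs_list if kwargs_list is not None else [{}] * len(args_list)
    with Pool(processes=min(len(args_list), 8)) as pool:
        async_results = [pool.apply_async(func, args, kwargs)
                         for args, kwargs in zip(args_list, kwargs_list)]
        return [res.get() for res in async_results]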
def matrix2str_list_multi(matrix, delimeter, ndigits, process_num):
    matrix_parts = mu.split_multi(matrix, process_num)
    arg_list = [(matrix_part, delimeter, ndigits) for matrix_part in matrix_parts]
    return au.merge(mu.multi_process(matrix2str_list, arg_list))
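# Hypothetical usage sketch (not from the original code): format a small random matrix as
# delimited strings, three digits per value, across four worker processes.
# import numpy as np
# lines = matrix2str_list_multi(np.random.rand(100, 8), delimeter=',', ndigits=3, process_num=4)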