def make_train_test():
    p_file = ft_data_pattern.format("pos_2016.txt")
    n_bad_files = fi.listchildren(ft_data_pattern.format(''), fi.TYPE_FILE, concat=True, pattern='2016_bad')
    n_2017_files = fi.listchildren(ft_data_pattern.format(''), fi.TYPE_FILE, concat=True, pattern='2017')
    # n_2012_fulls = fi.listchildren(ft_data_pattern.format(''), fi.TYPE_FILE, concat=True, pattern='2012_full')[:12]
    n_2012_fulls = fi.listchildren(ft_data_pattern.format(''), fi.TYPE_FILE, concat=True, pattern='2012_full')
    n_2016_files = fi.listchildren(ft_data_pattern.format(''), fi.TYPE_FILE, concat=True, pattern='2016_queried')
    print(len(n_bad_files), len(n_2017_files), len(n_2012_fulls), len(n_2016_files))
    n_files = n_bad_files + n_2017_files + n_2012_fulls + n_2016_files
    p_txtarr = fu.read_lines(p_file)
    p_prefix_txtarr = prefix_textarr(label_t, p_txtarr)
    n_txtarr_blocks = [fu.read_lines(file) for file in n_files]
    n_prefix_txtarr_blocks = [prefix_textarr(label_f, txtarr) for txtarr in n_txtarr_blocks]
    train_test = list()
    bad = len(n_bad_files)
    bad_blocks, n_blocks = n_prefix_txtarr_blocks[:bad], n_prefix_txtarr_blocks[bad:]
    train_test.append(split_train_test(p_prefix_txtarr))
    train_test.extend([split_train_test(block) for block in n_blocks])
    print("len(train_test)", len(train_test))
    train_list, test_list = zip(*train_test)
    train_list = list(train_list) + bad_blocks
    train_txtarr = au.merge_array(train_list)
    test_txtarr = au.merge_array(test_list)
    fu.write_lines(fasttext_train, train_txtarr)
    fu.write_lines(fasttext_test, test_txtarr)
    print("len(train_list)", len(train_list),
          "len(train_txtarr)", len(train_txtarr),
          "len(test_txtarr)", len(test_txtarr))
def merge_cic_list2cluid_twarr_list(cic_list):
    """
    Group the carriers in cic_list by the most frequent location each one indicates,
    and merge the clusters that share the same top location.
    :param cic_list: list, each element is a ClusterInfoCarrier
    :return: list, each element is a tuple, see
        clustering.gsdpmm.gsdpmm_stream_ifd_dynamic.GSDPMMStreamIFDDynamic#get_cluid_twarr_list
    """
    geo2group_id, cluid2group = dict(), dict()
    for cic in cic_list:
        cluid, clu_geo_table = cic.cluid, cic.geo_table
        s_geo_table = cic.s_geo_table
        if len(s_geo_table) == 0:
            # clusters without geo information stay in their own group
            cluid2group[cluid] = [cic]
        else:
            clu_top_geo = s_geo_table[0]['address']
            if clu_top_geo not in geo2group_id:
                group_id = cluid
                geo2group_id[clu_top_geo] = group_id
                cluid2group[group_id] = [cic]
            else:
                group_id = geo2group_id[clu_top_geo]
                cluid2group[group_id].append(cic)
    new_cluid_twarr_list = list()
    for group_id, group_cic_list in cluid2group.items():
        new_cluid = group_cic_list[0].cluid
        new_twarr = au.merge_array([cic.twarr for cic in group_cic_list])
        new_cluid_twarr_list.append((new_cluid, new_twarr))
    return new_cluid_twarr_list
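# A minimal usage sketch (hypothetical helper, not part of the original module): given a
# list of ClusterInfoCarrier objects, e.g. the output of a batch extractor, report how the
# clusters are merged by their top-ranked address.
def _demo_merge_cic_list(cic_list):
    cluid_twarr_list = merge_cic_list2cluid_twarr_list(cic_list)
    for cluid, twarr in cluid_twarr_list:
        print('merged cluster {} holds {} tweets'.format(cluid, len(twarr)))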
def make_text_files():
    for idx, file in enumerate(neg_2012_full_files):
        twarr = fu.load_array(file)
        txtarr = list()
        for tw in twarr:
            text = pu.text_normalization(tw[tk.key_text])
            if pu.is_empty_string(text) or len(text) < 20:
                continue
            txtarr.append(text)
        print('len delta', len(twarr) - len(txtarr))
        path = Path(file)
        out_file_name = '_'.join([path.parent.name, path.name]).replace('json', 'txt')
        out_file = ft_data_pattern.format(out_file_name)
        print(out_file)
        fu.write_lines(out_file, txtarr)
    return
    p_twarr_blocks = map(fu.load_array, pos_files)
    p_txtarr_blocks = map(twarr2textarr, p_twarr_blocks)
    p_txtarr = au.merge_array(list(p_txtarr_blocks))
    p_out_file = ft_data_pattern.format('pos_2016.txt')
    fu.write_lines(p_out_file, p_txtarr)
    for f in neg_files:
        in_file = neg_event_pattern.format(f)
        out_file = ft_data_pattern.format(f.replace("json", "txt"))
        twarr = fu.load_array(in_file)
        txtarr = twarr2textarr(twarr)
        print(len(twarr), '->', len(txtarr), len(twarr) - len(txtarr))
        fu.write_lines(out_file, txtarr)
def exec_pre_test(test_data_path):
    subfiles = fi.listchildren(test_data_path, children_type='file')
    # file_list = fu.split_multi_format(
    #     [(test_data_path + file) for file in subfiles if file.endswith('.json')], process_num=6)
    # twarr_blocks = fu.multi_process(fi.summary_unzipped_tweets_multi,
    #                                 [(file_list_slice,) for file_list_slice in file_list])
    twarr_blocks = filter_twarr([fu.load_array(file) for file in subfiles if file.endswith('.json')])
    twarr = au.merge_array(twarr_blocks)
    tu.start_ner_service(pool_size=16)
    tu.twarr_ner(twarr)
    tu.end_ner_service()
    all_ids = set(fu.load_array(test_data_path + 'test_ids_all.csv'))
    pos_ids = set(fu.load_array(test_data_path + 'test_ids_pos.csv'))
    non_pos_ids = all_ids.difference(pos_ids)
    pos_twarr = list()
    non_pos_twarr = list()
    for tw in twarr:
        twid = tw[tk.key_id]
        if twid in pos_ids:
            pos_twarr.append(tw)
        elif twid in non_pos_ids:
            non_pos_twarr.append(tw)
    fu.dump_array(getcfg().pos_data_file, pos_twarr)
    fu.dump_array(getcfg().non_pos_data_file, non_pos_twarr)
def get_semantic_tokens(file_list):
    pos_type_info = {
        ark.prop_label: {K_IFD: IdFreqDict()},
        ark.comm_label: {K_IFD: IdFreqDict()},
        ark.verb_label: {K_IFD: IdFreqDict()},
        ark.hstg_label: {K_IFD: IdFreqDict()},
    }
    total_doc_num = 0
    for file in file_list:
        twarr = ark.twarr_ark(fu.load_array(file))
        total_doc_num += len(twarr)
        pos_tokens = au.merge_array([tw[tk.key_ark] for tw in twarr])
        for pos_token in pos_tokens:
            word = pos_token[0].strip().lower()
            if len(word) <= 2 or not pu.is_valid_keyword(word):
                continue
            real_label = ark.pos_token2semantic_label(pos_token)
            if real_label:
                pos_type_info[real_label][K_IFD].count_word(word)
    return pos_type_info, total_doc_num
def filter2cluster(remain_workload=None):
    """
    Read the batches of tweets returned by the filter & classification module,
    merge them, and feed the result into the clustering module.
    :param remain_workload: int/None, controls how the output of the filter &
        classification module is read. If None, fetch the result with
        bflt.try_get_unread_batch_output; otherwise call
        bflt.wait_get_unread_batch_output with it as the argument.
    :return:
    """
    if remain_workload is None:
        batches_of_batches = bflt.try_get_unread_batch_output()
    else:
        batches_of_batches = bflt.wait_get_unread_batch_output(remain_workload)
    if not batches_of_batches:
        return
    filtered_batches = au.merge_array(batches_of_batches)
    filtered_twarr = au.merge_array(filtered_batches)
    bclu.input_twarr(filtered_twarr)
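# A minimal usage sketch (hypothetical, not part of the original module): the two call
# styles below illustrate the non-blocking and blocking read modes described in the
# docstring; the workload value is a placeholder.
def _demo_filter2cluster():
    filter2cluster()                   # poll once via bflt.try_get_unread_batch_output
    filter2cluster(remain_workload=4)  # wait via bflt.wait_get_unread_batch_output(4)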
def identify_korea():
    file = '/home/nfs/cdong/tw/seeding/NorthKorea/korea.json'
    twarr_blocks = fu.load_array(file)
    twarr = au.merge_array(twarr_blocks)
    for tw in twarr:
        text = tw[tk.key_text]
        if not re.search('korea', text, flags=re.I):
            print(text)
def set_batches(self, tw_batches):
    self.tw_batches.clear()
    for batch_idx in range(len(tw_batches)):
        self.tw_batches.append(
            GSDPMMStreamIFDDynamic.pre_process_twarr(tw_batches[batch_idx]))
    cluid_set = set([tw[tk.key_event_cluid] for tw in au.merge_array(tw_batches)])
    self.cludict = dict([(cluid, ClusterHolder(cluid)) for cluid in cluid_set])
def filter2cluster():
    """
    Read the output of the filter module and feed it into the clustering module.
    :return:
    """
    filtered_batches = bflt.get_batch_output()
    filtered_twarr = au.merge_array(filtered_batches)
    print(len(filtered_twarr))
    bclu.input_twarr_batch(filtered_twarr)
    print('input to cluster over')
def query_from_files_multi(file_list, query, n_process=10):
    """ Since there may be many files, distribute them across multiple processes. """
    file_blocks = mu.split_multi_format(file_list, n_process)
    res_list = mu.multi_process(query_from_files,
                                args_list=[(block, query) for block in file_blocks])
    twarr = au.merge_array(res_list)
    print('len(res_list):{}, len(twarr):{}'.format(len(res_list), len(twarr)), end=', ')
    return twarr
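# A minimal usage sketch (hypothetical, not part of the original module): both arguments
# are placeholders; file_list is a list of tweet .json files and query is whatever object
# query_from_files expects.
def _demo_query_from_files_multi(file_list, query):
    twarr = query_from_files_multi(file_list, query, n_process=10)
    print('matched {} tweets in total'.format(len(twarr)))
    return twarr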
def order_twarr_through_time(self):
    print("data source : normal")
    event_blocks = fu.load_array("./data/events2016.txt")
    false_event_twarr = fu.load_array("./data/false_pos_events.txt")
    event_blocks.append(false_event_twarr)
    for block_idx, block in enumerate(event_blocks):
        for tw in block:
            tw[tk.key_event_label] = block_idx
    twarr = au.merge_array(event_blocks)
    tflt.filter_twarr_dup_id(twarr)

    def random_idx_for_item(item_arr, dest_item):
        """ Return an index order that spreads the items belonging to dest_item
            evenly (in expectation) among the remaining items. """
        from numpy import random

        def sample(prob):
            return random.rand() < prob

        non_dest_item_idx = [idx for idx in range(len(item_arr)) if item_arr[idx] not in dest_item]
        dest_item_idx = [idx for idx in range(len(item_arr)) if item_arr[idx] in dest_item]
        non_dest_cnt = dest_cnt = 0
        res = list()
        while len(non_dest_item_idx) > non_dest_cnt and len(dest_item_idx) > dest_cnt:
            # pick the next index from the dest pool with probability proportional
            # to how many dest items are still left
            if sample((len(dest_item_idx) - dest_cnt) /
                      (len(dest_item_idx) - dest_cnt + len(non_dest_item_idx) - non_dest_cnt)):
                res.append(dest_item_idx[dest_cnt])
                dest_cnt += 1
            else:
                res.append(non_dest_item_idx[non_dest_cnt])
                non_dest_cnt += 1
        while len(non_dest_item_idx) > non_dest_cnt:
            res.append(non_dest_item_idx[non_dest_cnt])
            non_dest_cnt += 1
        while len(dest_item_idx) > dest_cnt:
            res.append(dest_item_idx[dest_cnt])
            dest_cnt += 1
        return res

    # sort tweets by time, then interleave the last (false-positive) block into the stream
    idx_time_order = tu.rearrange_idx_by_time(twarr)
    twarr = [twarr[idx] for idx in idx_time_order]
    lbarr = self.lbarr_of_twarr(twarr)
    idx_random_item = random_idx_for_item(lbarr, {max(lbarr)})
    twarr = [twarr[idx] for idx in idx_random_item]
    return twarr
def load_tw_batches(self, load_cluid_arr):
    tw_batches = fu.load_array(self.labelled_batch_file)
    tu.twarr_nlp(au.merge_array(tw_batches))
    print("twarr nlp over")
    if load_cluid_arr:
        cluid_batches = fu.load_array(self.cluid_batch_file)
        assert len(tw_batches) == len(cluid_batches)
        for b_idx in range(len(tw_batches)):
            tw_batch, cluid_batch = tw_batches[b_idx], cluid_batches[b_idx]
            assert len(tw_batch) == len(cluid_batch)
            for idx in range(len(tw_batch)):
                tw, cluid = tw_batch[idx], cluid_batch[idx]
                tw[tk.key_event_cluid] = cluid
    return tw_batches
def group_similar_tweets(twarr, process_num=0):
    """
    Reorder the tweets in the list by textual similarity so that similar texts end up
    in nearby positions. Pairwise comparison of a set of texts is O(n^2) and already
    becomes very slow beyond roughly 1000 texts, so the work is split across processes.
    :param twarr: list, the tweet list
    :param process_num: number of child processes to use
    :return: list, the reordered tweet list
    """
    txtarr = [tw[tk.key_text] for tw in twarr]
    idx_g, txt_g = au.group_similar_items(txtarr, score_thres=0.3, process_num=process_num)
    tw_groups = [[twarr[idx] for idx in g] for g in idx_g]
    return au.merge_array(tw_groups)
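# A minimal usage sketch (hypothetical, not part of the original module): reorder a tweet
# list so that near-duplicate texts sit next to each other, then print the texts in the
# new order to eyeball the grouping.
def _demo_group_similar_tweets(twarr):
    ordered = group_similar_tweets(twarr, process_num=10)
    for tw in ordered:
        print(tw[tk.key_text])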
def twarr_dist_pairs_multi(twarr):
    # dist_pairs reads the lower-cased text from the temporary 'temp' field
    for tw in twarr:
        tw['temp'] = tw['text'].lower()
    total = len(twarr) - 1
    process_num = 16
    # assign row indices to processes in a striped fashion: process i handles rows
    # i, i + process_num, i + 2 * process_num, ...
    point_lists = [[i + process_num * j
                    for j in range(int(total / process_num) + 1)
                    if (i + process_num * j) < total]
                   for i in range(process_num)]
    pairs_blocks = multi_process(dist_pairs, [(twarr, point) for point in point_lists])
    for tw in twarr:
        del tw['temp']
    return merge_array(pairs_blocks)
def summary_files_in_path(from_path, into_path=None):
    """ Read all .json files under from_path and extract their tweets into a file under into_path. """
    # [-13:]--hour  [-13:-3]--day  [-13:-5]--month; "ymdh" is short for "year-month-day-hour"
    from_path = fi.add_sep_if_needed(from_path)
    file_ymdh_arr = pu.split_digit_arr(fi.get_parent_path(from_path)[-13:])
    if not is_target_ymdh(file_ymdh_arr):
        return
    into_file = '{}{}'.format(fi.add_sep_if_needed(into_path), '_'.join(file_ymdh_arr) + '.sum')
    fi.remove_file(into_file)
    subfiles = fi.listchildren(from_path, children_type=fi.TYPE_FILE)
    file_block = mu.split_multi_format([(from_path + subfile) for subfile in subfiles], process_num=20)
    twarr_blocks = mu.multi_process(sum_files, [(file_list, tflt.FILTER_LEVEL_LOW) for file_list in file_block])
    twarr = au.merge_array(twarr_blocks)
    if twarr:
        fu.dump_array(into_file, twarr, overwrite=True)
def dist_pairs(twarr, points):
    return merge_array([[(i, j, text_dist_less_than(twarr[i]['temp'], twarr[j]['temp']))
                         for j in range(i + 1, len(twarr))]
                        for i in points])
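# A minimal usage sketch (hypothetical, not part of the original module): for a toy tweet
# list, compute the (i, j, flag) triples for the rows handled by one process stripe; the
# 'temp' field must be prepared the same way twarr_dist_pairs_multi does.
def _demo_dist_pairs(twarr):
    for tw in twarr:
        tw['temp'] = tw['text'].lower()
    pairs = dist_pairs(twarr, points=list(range(0, len(twarr) - 1, 2)))
    print('{} (i, j, flag) triples computed'.format(len(pairs)))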
_cic_list = bext.get_batch_output()
print('get cic outputs, type:{}'.format(type(_cic_list)))
for cic in _cic_list:
    twnum = len(cic.twarr)
    _geo_list = [geo['address'] for geo in cic.od['geo_infer'] if geo['quality'] == 'locality']
    print('cluid:{}, twarr len:{}'.format(cic.cluid, twnum))
    print(cic.od['summary']['keywords'])
    print(_geo_list)
    print('\n')
    if len(_geo_list) == 0:
        _top_geo = 'NOGPE'
    else:
        _top_geo = '`'.join(_geo_list)
    _out_file = '/home/nfs/cdong/tw/src/calling/tmp/id{}_tw{}_{}.txt'.format(cic.cluid, twnum, _top_geo)
    _txtarr = [tw[tk.key_text] for tw in cic.twarr]
    _idx_g, _txt_g = au.group_similar_items(_txtarr, score_thres=0.3, process_num=20)
    _txt_g = [sorted(g, key=lambda t: len(t), reverse=True) for g in _txt_g]
    _txtarr = au.merge_array(_txt_g)
    fu.write_lines(_out_file, _txtarr)
tmu.check_time()
def get_current_twharr(self):
    return au.merge_array(self.twh_batches)
    # print(twarr[idx][tk.key_text])
    # post_twarr = [tw for idx, tw in enumerate(twarr) if probarr[idx] >= 0.4]
    # post_total_len += len(post_twarr)
    # print(len(post_twarr) / len(twarr), '\n\n\n')
    tmu.check_time()
    lblarr = [1 for _ in range(len(pos_probarr))] + [0 for _ in range(len(neg_probarr))]
    prbarr = pos_probarr + neg_probarr
    fu.dump_array("prb_lbl_arr.txt", (lblarr, prbarr))
    lblarr, prbarr = fu.load_array("prb_lbl_arr.txt")
    au.precision_recall_threshold(lblarr, prbarr)
    # print('total portion = {} / {} = {}'.format(post_total_len, pre_total_len, post_total_len / pre_total_len))
    tmu.check_time()
    exit()

    sub_files = fi.listchildren('/home/nfs/cdong/tw/origin/', fi.TYPE_FILE, concat=True)[18:19]
    twarr = au.merge_array([fu.load_array(file) for file in sub_files])
    print(len(twarr))
    tmu.check_time(print_func=None)
    for idx, tw in enumerate(twarr[14000:15000]):
        if (idx + 1) % 1000 == 0:
            print(idx)
        try:
            my_filter.get_features(tw)
        except:
            # print(tw[tk.key_text])
            # print(tw[tk.key_orgntext])
            print('-', pu.text_normalization(tw[tk.key_orgntext]))
    tmu.check_time(print_func=lambda dt: print('pos filter time elapsed {}s'.format(dt)))
    exit()
def make_tw_batches(self, batch_size):
    ordered_twarr = self.order_twarr_through_time()
    tw_batches = split_array_into_batches(ordered_twarr, batch_size)
    self.twarr_info(au.merge_array(tw_batches))
    fu.dump_array(self.labelled_batch_file, tw_batches)