def __gen_minidocs_new():
    """Build minidocs for each entity name and write their text, segmented text, and info CSV."""
    entity_names = commonutils.read_lines_to_list(entity_names_file)
    entity_names = ['白起']  # restrict to a single entity name for now
    minidocs_info_list = list()
    fout_text = open('d:/data/indec/docs-14k-minidocs-text.txt', 'w', encoding='utf-8', newline='\n')
    fout_seg_text = open('d:/data/indec/docs-14k-minidocs-text-seg.txt', 'w', encoding='utf-8', newline='\n')
    for entity_name in entity_names:
        info_list_tmp = __minidocs_for_name(entity_name, fout_text, fout_seg_text)
        minidocs_info_list += info_list_tmp
    fout_text.close()
    fout_seg_text.close()

    # assign a sequential minidoc id (mdid) to each (doc_id, entity_name) pair
    minidocs_info_list = [(i, doc_id, name) for i, (doc_id, name) in enumerate(minidocs_info_list)]
    with open('d:/data/indec/docs-14k-minidocs-info.txt', 'w', encoding='utf-8', newline='\n') as fout:
        pd.DataFrame(minidocs_info_list, columns=['mdid', 'doc_id', 'entity_name']).to_csv(fout, index=False)
def __gen_minidocs():
    """Split each document into minidocs: windows of sentences around mentions of each entity name."""
    entity_names = commonutils.read_lines_to_list(entity_names_file)
    f = open(WC_SENT_FILE, encoding='utf-8')
    fout_text = open(WC_MINIDOC_TEXT_FILE, 'w', encoding='utf-8', newline='\n')
    n_context_sents = 2
    doc_cnt, minidoc_cnt = 0, 0
    minidocs_info_list = list()
    while True:
        doc_sents = __read_doc_sents(f)
        if doc_sents is None:
            break
        for name in entity_names:
            i = 0
            while i < len(doc_sents):
                sent = doc_sents[i]
                if name not in sent:
                    i += 1
                    continue
                # found a mention; extend the window forward while further mentions stay close
                s_idx_beg = max(i - n_context_sents, 0)
                p = i + 1
                max_hit_pos = i
                while p < len(doc_sents):
                    if name in doc_sents[p]:
                        max_hit_pos = p
                    if p - max_hit_pos >= n_context_sents * 2:
                        break
                    p += 1
                i = p + 1
                s_idx_end = min(max_hit_pos + n_context_sents + 1, len(doc_sents))
                minidoc_text = ''.join(doc_sents[s_idx_beg:s_idx_end])
                minidocs_info_list.append((minidoc_cnt, doc_cnt, name))
                # alternative JSON output kept for reference:
                # minidoc = {'mdid': minidoc_cnt, 'doc_id': doc_cnt, 'text': minidoc_text, 'entity_name': name}
                # fout.write('{}\n'.format(json.dumps(minidoc, ensure_ascii=False)))
                minidoc_cnt += 1
                fout_text.write('{}\n'.format(minidoc_text))
        doc_cnt += 1
        if doc_cnt % 1000 == 0:
            print(doc_cnt)
    f.close()
    fout_text.close()
    print(doc_cnt, 'docs,', minidoc_cnt, 'minidocs')

    df = pd.DataFrame(minidocs_info_list, columns=['mdid', 'doc_id', 'entity_name'])
    with open(WC_MINIDOC_INFO_FILE, 'w', encoding='utf-8', newline='\n') as fout:
        df.to_csv(fout, index=False)
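# Illustrative sketch only (not part of the original pipeline): demonstrates the
# sentence-window grouping used in __gen_minidocs above. The function name and the
# toy sentence list are hypothetical and exist purely to document how nearby mentions
# of an entity name are merged into a single minidoc window.
def __demo_minidoc_window():
    doc_sents = ['s0。', 's1 白起。', 's2。', 's3。', 's4 白起。', 's5。', 's6。', 's7。', 's8。', 's9。']
    name, n_context_sents = '白起', 2
    i = 0
    while i < len(doc_sents):
        if name not in doc_sents[i]:
            i += 1
            continue
        s_idx_beg = max(i - n_context_sents, 0)
        p, max_hit_pos = i + 1, i
        while p < len(doc_sents):
            if name in doc_sents[p]:
                max_hit_pos = p
            if p - max_hit_pos >= n_context_sents * 2:
                break
            p += 1
        i = p + 1
        s_idx_end = min(max_hit_pos + n_context_sents + 1, len(doc_sents))
        # the two mentions at s1 and s4 fall into one window covering s0 .. s6
        print(''.join(doc_sents[s_idx_beg:s_idx_end]))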
def __filter_duplicate_minidocs():
    """Drop near-duplicate minidocs: same entity name and cosine similarity above 0.9."""
    df_minidocs = pd.read_csv(WC_MINIDOC_INFO_FILE)
    all_doc_contents = commonutils.read_lines_to_list(WC_MINIDOC_TEXT_SEG_FILE)
    cv = textvectorizer.CountVectorizer((WC_DF_FILE, 100, 2000), remove_stopwords=True)
    print(cv.n_words, 'words in vocab')
    X = cv.get_vecs(all_doc_contents)
    n_docs = len(all_doc_contents)
    print(n_docs, 'docs', X.shape)

    dup_docs = set()
    for i, x1 in enumerate(X):
        cur_name = df_minidocs['entity_name'][i]
        if i % 100 == 0:
            print(i)
        if i in dup_docs:
            continue
        for j in range(i + 1, n_docs):
            if j in dup_docs:
                continue
            sim = cosine_similarity(x1, X[j])
            if sim > 0.9 and cur_name == df_minidocs['entity_name'][j]:
                dup_docs.add(j)

    dup_docs_list = list(dup_docs)
    dup_docs_list.sort()
    print(dup_docs_list[:30])
    # TODO mdid not correct
    df_fil = df_minidocs.drop(dup_docs_list)
    with open(WC_MINIDOC_INFO_NODUP_FILE, 'w', encoding='utf-8', newline='\n') as fout:
        df_fil.to_csv(fout, index=False)
    commonutils.remove_lines(WC_MINIDOC_TEXT_FILE, dup_docs, WC_MINIDOC_TEXT_NODUP_FILE)
    commonutils.remove_lines(WC_MINIDOC_TEXT_SEG_FILE, dup_docs, WC_MINIDOC_TEXT_SEG_NODUP_FILE)
def __filter_duplicate_docs():
    """Drop near-duplicate documents whose bag-of-words cosine similarity exceeds 0.8."""
    all_doc_contents = commonutils.read_lines_to_list(WC_SEG_DOC_CONTENT_FILE)
    cv = textvectorizer.CountVectorizer((WC_DF_FILE, 100, 2000), remove_stopwords=True)
    print(cv.n_words, 'words in vocab')
    X = cv.get_vecs(all_doc_contents)
    n_docs = len(all_doc_contents)
    print(n_docs, 'docs', X.shape)

    dup_docs = set()
    for i, x1 in enumerate(X):
        if i % 100 == 0:
            print(i)
        if i in dup_docs:
            continue
        for j in range(i + 1, n_docs):
            if j in dup_docs:
                continue
            sim = cosine_similarity(x1, X[j])
            if sim > 0.8:
                dup_docs.add(j)

    doc_info_df = pd.read_csv(doc_file)
    dup_docs_list = list(dup_docs)
    dup_docs_list.sort()
    print(dup_docs_list[:30])
    df_fil = doc_info_df.drop(dup_docs_list)
    with open(WC_DOC_INFO_NODUP_FILE, 'w', encoding='utf-8', newline='\n') as fout:
        df_fil.to_csv(fout, index=False)
    commonutils.remove_lines(WC_DOC_CONTENT_FILE, dup_docs, WC_DOC_CONTENT_NODUP_FILE)
    commonutils.remove_lines(WC_SEG_DOC_CONTENT_FILE, dup_docs, WC_SEG_DOC_CONTENT_NODUP_FILE)
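# Hedged sketch (assumption, not the project's actual helper): the cosine_similarity function
# used by the two dedup routines above is assumed to be defined elsewhere in this project.
# A minimal dense-vector version could look like the one below; the real helper may instead
# operate on the sparse rows produced by textvectorizer.CountVectorizer.get_vecs.
import numpy as np  # harmless if numpy is already imported at the top of the module


def __cosine_similarity_sketch(v1, v2):
    # cosine of the angle between two 1-D numpy arrays; 0.0 when either vector is all zeros
    n1, n2 = np.linalg.norm(v1), np.linalg.norm(v2)
    if n1 == 0 or n2 == 0:
        return 0.0
    return float(np.dot(v1, v2) / (n1 * n2))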
def __gen_minidocs_with_specific_name():
    """Write the minidocs of one entity (English id 'swk') to its own text file."""
    df = pd.read_csv(WC_ENTITY_NAMES_FILE, header=None)
    name_doc_dict = wcdatautils.load_entity_name_to_minidoc_file(WC_MINIDOC_INFO_NODUP_FILE)
    for ch_name, en_name in df.itertuples(False, None):
        if en_name != 'swk':
            continue
        all_doc_contents = commonutils.read_lines_to_list(WC_MINIDOC_TEXT_NODUP_FILE)
        doc_idxs = name_doc_dict[ch_name]
        contents = [all_doc_contents[idx] for idx in doc_idxs]
        print(len(contents), 'docs')
        fout = open('d:/data/indec/entity-data/{}-mini.txt'.format(en_name), 'w', encoding='utf-8', newline='\n')
        for text in contents:
            fout.write('{}\n'.format(text.strip()))
        fout.close()
        break