Code Example #1
File: wcprep.py Project: hldai/indec
def __gen_minidocs_new():
    entity_names = commonutils.read_lines_to_list(entity_names_file)
    # The full list read above is immediately overridden with a single
    # hard-coded name (apparently for a single-entity / debugging run).
    entity_names = ['白起']

    minidocs_info_list = list()
    fout_text = open('d:/data/indec/docs-14k-minidocs-text.txt',
                     'w',
                     encoding='utf-8',
                     newline='\n')
    fout_seg_text = open('d:/data/indec/docs-14k-minidocs-text-seg.txt',
                         'w',
                         encoding='utf-8',
                         newline='\n')
    for i, entity_name in enumerate(entity_names):
        info_list_tmp = __minidocs_for_name(entity_name, fout_text,
                                            fout_seg_text)
        minidocs_info_list += info_list_tmp
        # if i == 1:
        #     break

    fout_text.close()
    fout_seg_text.close()
    # Re-number: assign each (doc_id, entity_name) pair a sequential minidoc id (mdid).
    minidocs_info_list = [(i, doc_id, name)
                          for i, (doc_id,
                                  name) in enumerate(minidocs_info_list)]
    with open('d:/data/indec/docs-14k-minidocs-info.txt',
              'w',
              encoding='utf-8',
              newline='\n') as fout:
        pd.DataFrame(minidocs_info_list,
                     columns=['mdid', 'doc_id',
                              'entity_name']).to_csv(fout, index=False)
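Both of the minidoc generators on this page load their entity-name list with commonutils.read_lines_to_list. That helper is not shown here; a minimal sketch of what it presumably does (one entry per line, trailing newlines stripped) is given below. The function name and behavior are assumptions, not code from the project.

def read_lines_to_list(filename):
    # Hypothetical stand-in for commonutils.read_lines_to_list: read a UTF-8
    # text file and return its lines with trailing newlines stripped.
    with open(filename, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]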
Code Example #2
File: wcprep.py Project: hldai/indec
def __gen_minidocs():
    entity_names = commonutils.read_lines_to_list(entity_names_file)
    f = open(WC_SENT_FILE, encoding='utf-8')
    fout_text = open(WC_MINIDOC_TEXT_FILE, 'w', encoding='utf-8', newline='\n')
    n_context_sents = 2  # sentences of context kept on each side of a mention
    doc_cnt, minidoc_cnt = 0, 0
    minidocs_info_list = list()
    while True:
        # Read the sentences of the next document; None signals end of input.
        doc_sents = __read_doc_sents(f)
        if doc_sents is None:
            break
        for name in entity_names:
            i = 0
            while i < len(doc_sents):
                sent = doc_sents[i]
                if name not in sent:
                    i += 1
                    continue
                # Start the minidoc a few sentences before the first mention.
                s_idx_beg = max(i - n_context_sents, 0)
                # Scan forward and pull nearby mentions into the same minidoc until
                # no new mention appears within 2 * n_context_sents sentences.
                p = i + 1
                max_hit_pos = i
                while p < len(doc_sents):
                    if name in doc_sents[p]:
                        max_hit_pos = p
                    if p - max_hit_pos >= n_context_sents * 2:
                        break
                    p += 1
                # Resume scanning after the region just covered.
                i = p + 1
                s_idx_end = min(max_hit_pos + n_context_sents + 1,
                                len(doc_sents))

                minidoc_text = ''.join(doc_sents[s_idx_beg:s_idx_end])
                minidocs_info_list.append((minidoc_cnt, doc_cnt, name))
                # minidoc = {'mdid': minidoc_cnt, 'doc_id': doc_cnt, 'text': minidoc_text, 'entity_name': name}
                minidoc_cnt += 1
                # fout.write('{}\n'.format(json.dumps(minidoc, ensure_ascii=False)))
                fout_text.write('{}\n'.format(minidoc_text))
                # print(name, cnt)
                # for s in doc_sents[s_idx_beg:s_idx_end]:
                #     print(s)
                # print(doc_sents[s_idx_beg:s_idx_end])
                # print()
        doc_cnt += 1
        # if doc_cnt > 10:
        #     break
        if doc_cnt % 1000 == 0:
            print(doc_cnt)
    f.close()
    # fout.close()
    fout_text.close()
    print(doc_cnt, 'docs,', minidoc_cnt, 'minidocs')
    df = pd.DataFrame(minidocs_info_list,
                      columns=['mdid', 'doc_id', 'entity_name'])
    with open(WC_MINIDOC_INFO_FILE, 'w', encoding='utf-8',
              newline='\n') as fout:
        df.to_csv(fout, index=False)
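The mention-window logic in __gen_minidocs is the subtle part: a minidoc starts a couple of sentences before the first mention of a name and keeps extending while further mentions occur close enough together. The snippet below replays that exact loop on a toy sentence list (the sentences and the name 'X' are made up for illustration) to show which slice would be written out.

# Illustrative only: replay the windowing step from __gen_minidocs on toy data.
doc_sents = ['s0', 's1', 'X s2', 's3', 'X s4', 's5', 's6', 's7']
name = 'X'
n_context_sents = 2
i = 0
while i < len(doc_sents):
    if name not in doc_sents[i]:
        i += 1
        continue
    s_idx_beg = max(i - n_context_sents, 0)
    p, max_hit_pos = i + 1, i
    while p < len(doc_sents):
        if name in doc_sents[p]:
            max_hit_pos = p
        if p - max_hit_pos >= n_context_sents * 2:
            break
        p += 1
    i = p + 1
    s_idx_end = min(max_hit_pos + n_context_sents + 1, len(doc_sents))
    print(doc_sents[s_idx_beg:s_idx_end])
# Prints ['s0', 's1', 'X s2', 's3', 'X s4', 's5', 's6']: both mentions and their
# surrounding context end up in a single minidoc.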
Code Example #3
File: wcprep.py Project: hldai/indec
def __filter_duplicate_minidocs():
    df_minidocs = pd.read_csv(WC_MINIDOC_INFO_FILE)
    # print(df_minidocs.head())
    all_doc_contents = commonutils.read_lines_to_list(WC_MINIDOC_TEXT_SEG_FILE)
    cv = textvectorizer.CountVectorizer((WC_DF_FILE, 100, 2000),
                                        remove_stopwords=True)
    print(cv.n_words, 'words in vocab')
    X = cv.get_vecs(all_doc_contents)
    n_docs = len(all_doc_contents)
    print(n_docs, 'docs', X.shape)
    dup_docs = set()
    # Pairwise O(n^2) comparison: mark the later of each near-duplicate pair.
    for i, x1 in enumerate(X):
        cur_name = df_minidocs['entity_name'][i]
        # print(cur_name)
        if i % 100 == 0:
            print(i)
        # print(i)

        if i in dup_docs:
            continue

        for j in range(i + 1, n_docs):
            if j in dup_docs:
                continue
            sim = cosine_similarity(x1, X[j])
            # if 0.8 < sim < 0.9:
            #     print(i, j, sim)
            # Duplicate only if highly similar and about the same entity name.
            if sim > 0.9 and cur_name == df_minidocs['entity_name'][j]:
                # print(i, j, minidocs[i]['entity_name'], minidocs[j]['entity_name'])
                dup_docs.add(j)

        # if i == 3:
        #     break

    # exit()
    dup_docs_list = list(dup_docs)
    dup_docs_list.sort()
    print(dup_docs_list[:30])

    # TODO mdid not correct
    df_fil = df_minidocs.drop(dup_docs_list)
    with open(WC_MINIDOC_INFO_NODUP_FILE, 'w', encoding='utf-8',
              newline='\n') as fout:
        df_fil.to_csv(fout, index=False)

    commonutils.remove_lines(WC_MINIDOC_TEXT_FILE, dup_docs,
                             WC_MINIDOC_TEXT_NODUP_FILE)
    commonutils.remove_lines(WC_MINIDOC_TEXT_SEG_FILE, dup_docs,
                             WC_MINIDOC_TEXT_SEG_NODUP_FILE)
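cosine_similarity above is applied to single rows of the count-vector matrix X and appears to be a small project helper whose source is not included on this page. A plausible sketch, assuming the rows are SciPy sparse vectors, is shown below; the implementation is an assumption, not the project's actual code.

import numpy as np
from scipy import sparse

def cosine_similarity(x1, x2):
    # Hypothetical helper: cosine similarity between two 1 x V sparse row vectors.
    num = x1.multiply(x2).sum()
    denom = np.sqrt(x1.multiply(x1).sum()) * np.sqrt(x2.multiply(x2).sum())
    return 0.0 if denom == 0 else float(num / denom)

a = sparse.csr_matrix([[1, 0, 2]])
b = sparse.csr_matrix([[1, 1, 2]])
print(cosine_similarity(a, b))  # ~0.913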
Code Example #4
File: wcprep.py Project: hldai/indec
def __filter_duplicate_docs():
    all_doc_contents = commonutils.read_lines_to_list(WC_SEG_DOC_CONTENT_FILE)
    cv = textvectorizer.CountVectorizer((WC_DF_FILE, 100, 2000),
                                        remove_stopwords=True)
    print(cv.n_words, 'words in vocab')
    X = cv.get_vecs(all_doc_contents)
    n_docs = len(all_doc_contents)
    print(n_docs, 'docs', X.shape)
    dup_docs = set()
    # Pairwise O(n^2) comparison: mark the later of each near-duplicate pair.
    for i, x1 in enumerate(X):
        if i % 100 == 0:
            print(i)
        # print(i)

        if i in dup_docs:
            continue

        for j in range(i + 1, n_docs):
            if j in dup_docs:
                continue
            sim = cosine_similarity(x1, X[j])
            # if 0.8 < sim < 0.9:
            #     print(i, j, sim)
            # Lower threshold than the minidoc filter and no entity-name constraint.
            if sim > 0.8:
                dup_docs.add(j)

        # if i == 5:
        #     break

    # exit()
    doc_info_df = pd.read_csv(doc_file)
    dup_docs_list = list(dup_docs)
    dup_docs_list.sort()
    print(dup_docs_list[:30])
    df_fil = doc_info_df.drop(dup_docs_list)
    with open(WC_DOC_INFO_NODUP_FILE, 'w', encoding='utf-8',
              newline='\n') as fout:
        df_fil.to_csv(fout, index=False)

    commonutils.remove_lines(WC_DOC_CONTENT_FILE, dup_docs,
                             WC_DOC_CONTENT_NODUP_FILE)
    commonutils.remove_lines(WC_SEG_DOC_CONTENT_FILE, dup_docs,
                             WC_SEG_DOC_CONTENT_NODUP_FILE)
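commonutils.remove_lines is used in both filtering functions to write de-duplicated copies of the text files. Its implementation is not shown; judging from the call sites it most likely copies every line whose index is not in the removal set, which is what the hypothetical sketch below does.

def remove_lines(src_file, remove_idxs, dst_file):
    # Hypothetical stand-in for commonutils.remove_lines: copy src_file to
    # dst_file, skipping the 0-based line indexes contained in remove_idxs.
    with open(src_file, encoding='utf-8') as fin, \
            open(dst_file, 'w', encoding='utf-8', newline='\n') as fout:
        for idx, line in enumerate(fin):
            if idx not in remove_idxs:
                fout.write(line)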
Code Example #5
File: wcprep.py Project: hldai/indec
def __gen_minidocs_with_specific_name():
    df = pd.read_csv(WC_ENTITY_NAMES_FILE, header=None)
    name_doc_dict = wcdatautils.load_entity_name_to_minidoc_file(
        WC_MINIDOC_INFO_NODUP_FILE)
    for ch_name, en_name in df.itertuples(False, None):
        # Only handle the entity whose English key is 'swk'.
        if en_name != 'swk':
            continue

        all_doc_contents = commonutils.read_lines_to_list(
            WC_MINIDOC_TEXT_NODUP_FILE)
        doc_idxs = name_doc_dict[ch_name]
        contents = [all_doc_contents[idx] for idx in doc_idxs]
        print(len(contents), 'docs')
        fout = open('d:/data/indec/entity-data/{}-mini.txt'.format(en_name),
                    'w',
                    encoding='utf-8',
                    newline='\n')
        for text in contents:
            fout.write('{}\n'.format(text.strip()))
        fout.close()

        break
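wcdatautils.load_entity_name_to_minidoc_file is also external to this page. Since the info file holds (mdid, doc_id, entity_name) rows aligned with the minidoc text file, it plausibly maps each entity name to the row positions of its minidocs; the sketch below makes that assumption and is not the project's actual code.

import pandas as pd

def load_entity_name_to_minidoc_file(info_file):
    # Hypothetical stand-in: map each entity_name to the list of row indexes,
    # i.e. the line positions of its minidocs in the text file.
    df = pd.read_csv(info_file)
    name_doc_dict = {}
    for idx, name in enumerate(df['entity_name']):
        name_doc_dict.setdefault(name, []).append(idx)
    return name_doc_dict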