Example #1
def __topdmm_wc(name, dst_vocab_file, dst_topics_file):
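    """Fit a TOPDMM topic model on the documents of one entity name.

    Selects the documents linked to `name`, builds count vectors over their
    vocabulary, trains a 10-topic TOPDMM model, prints the top words of each
    topic, and saves the vocabulary and topic-word distributions.
    """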
    all_doc_contents = utils.read_lines_to_list(WC_SEG_DOC_CONTENT_NODUP_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(WC_NAME_DOC_ND_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    print(len(contents), 'docs')

    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    extra_exclude_words = {name}
    if name == '姜子牙':
        extra_exclude_words = {'姜', '子牙'}
    cv = textvectorizer.CountVectorizer(
        (WC_DF_ND_FILE, 20, 700),
        remove_stopwords=True,
        words_exist=words_exist,
        extra_exclude_words=extra_exclude_words)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents, normalize=False)
    # D_codoc = utils.get_codoc_matrix(cv.vocab, contents)

    n_topic_words_disp = 10
    print('starting training ...')
    # for k in range(10, 11):
    k = 10
    dmm = TOPDMM(k, 100, alpha=0.01, beta=0.01, n_top=-1)
    dmm.fit(X)
    for t in dmm.topic_word_:
        widxs = np.argpartition(-t,
                                range(n_topic_words_disp))[:n_topic_words_disp]
        topic_words = [cv.vocab[i] for i in widxs]
        print(' '.join(topic_words))

    dmm.save(cv.vocab, dst_vocab_file, dst_topics_file)
Example #2
def process_quora():
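    """Build the bag-of-words input for the 'DC' Quora answers.

    Returns the documents as lists of vocabulary indices, the vocabulary
    itself, and an IDF value for each vocabulary word.
    """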
    name = 'DC'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    cv = textvectorizer.CountVectorizer((QUORA_DF_FILE, 50, 6000),
                                        remove_stopwords=True,
                                        words_exist=words_exist)
    print(len(cv.vocab), 'words in vocab')
    word_idfs = [
        np.log(QUORA_NUM_TOTAL_DOCS / cv.word_cnts[w]) for w in cv.vocab
    ]

    docs = list()
    for words in docs_words:
        doc = list()
        for w in words:
            widx = cv.word_dict.get(w, -1)
            if widx > -1:
                doc.append(widx)
        docs.append(doc)

    return docs, cv.vocab, word_idfs
Example #3
def __run_quora():
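    """Run DPMFS on the 'DC' Quora answers.

    lamb and N can be overridden from the command line (defaults 1 and 10);
    the resulting assignments in dpmfs.z are written to dst_file.
    """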
    name = 'DC'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    cv = textvectorizer.CountVectorizer((QUORA_DF_FILE, 100, 5000),
                                        remove_stopwords=True,
                                        words_exist=words_exist)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents)

    parser = argparse.ArgumentParser()
    parser.add_argument("-lamb", "--lamb", type=float)
    parser.add_argument("-N", "--N", type=int)
    args = parser.parse_args()
    lamb = 1 if args.lamb is None else args.lamb
    N = 10 if args.N is None else args.N

    dst_file = os.path.join(QUORA_DATA_DIR, 'dpmfs_z_{}.txt'.format(lamb))
    print(dst_file)

    n_docs = len(doc_idxs)
    dpmfs = DPMFS(cv.n_words, N=N, n_docs=n_docs, n_iter=1000, lamb=lamb)
    dpmfs.fit(X, dst_file, cv.vocab)
    np.savetxt(dst_file, dpmfs.z, fmt='%d')
Example #4
def __topdmm_wc_minidocs(name, dst_vocab_file, dst_topics_file):
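    """Fit a TOPDMM topic model on the mini-documents of one entity name.

    Same procedure as __topdmm_wc, but works on the mini-document files and
    additionally excludes common Chinese words (and name fragments for a few
    entities) from the vocabulary.
    """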
    # all_doc_contents = utils.read_lines_to_list(WC_MINIDOC_TEXT_SEG_NODUP_FILE)
    # name_doc_dict = utils.load_entity_name_to_minidoc_file(WC_MINIDOC_INFO_NODUP_FILE)
    all_doc_contents = utils.read_lines_to_list(
        'd:/data/indec/docs-14k-minidocs-text-seg-new.txt')
    name_doc_dict = utils.load_entity_name_to_minidoc_file(
        'd:/data/indec/docs-14k-minidocs-info-new.txt')
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    # print(max(doc_idxs), len(all_doc_contents))
    print(len(contents), 'docs')

    common_words = utils.read_lines_to_list(COMMON_CH_WORDS_FILE)

    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    extra_exclude_words = set(common_words)
    extra_exclude_words.add(name)
    # extra_exclude_words = {name}
    if name == '姜子牙':
        extra_exclude_words.add('姜')
        extra_exclude_words.add('子牙')
    if name == '夏侯惇':
        extra_exclude_words.add('夏侯')
        extra_exclude_words.add('惇')
    cv = textvectorizer.CountVectorizer(
        (WC_DF_ND_FILE, 20, 700),
        remove_stopwords=True,
        words_exist=words_exist,
        extra_exclude_words=extra_exclude_words)
    print(len(cv.vocab), 'words in vocab')
    # print('吃' in cv.vocab)
    # exit()
    X = cv.get_vecs(contents, normalize=False)
    # D_codoc = utils.get_codoc_matrix(cv.vocab, contents)

    n_topic_words_disp = 10
    print('starting training ...')
    # for k in range(10, 11):
    k = 10
    dmm = TOPDMM(k, 80, alpha=0.01, beta=0.01, n_top=-1)
    dmm.fit(X)
    for t in dmm.topic_word_:
        widxs = np.argpartition(-t,
                                range(n_topic_words_disp))[:n_topic_words_disp]
        topic_words = [cv.vocab[i] for i in widxs]
        print(' '.join(topic_words))

    dmm.save(cv.vocab, dst_vocab_file, dst_topics_file)
Example #5
def __filter_duplicate_minidocs():
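    """Remove near-duplicate mini-documents.

    Marks mini-document j as a duplicate when its cosine similarity with an
    earlier mini-document of the same entity exceeds 0.9, then writes the
    de-duplicated info CSV and text files.
    """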
    df_minidocs = pd.read_csv(WC_MINIDOC_INFO_FILE)
    # print(df_minidocs.head())
    all_doc_contents = commonutils.read_lines_to_list(WC_MINIDOC_TEXT_SEG_FILE)
    cv = textvectorizer.CountVectorizer((WC_DF_FILE, 100, 2000),
                                        remove_stopwords=True)
    print(cv.n_words, 'words in vocab')
    X = cv.get_vecs(all_doc_contents)
    n_docs = len(all_doc_contents)
    print(n_docs, 'docs', X.shape)
    dup_docs = set()
    for i, x1 in enumerate(X):
        cur_name = df_minidocs['entity_name'][i]
        # print(cur_name)
        if i % 100 == 0:
            print(i)
        # print(i)

        if i in dup_docs:
            continue

        for j in range(i + 1, n_docs):
            if j in dup_docs:
                continue
            sim = cosine_similarity(x1, X[j])
            # if 0.8 < sim < 0.9:
            #     print(i, j, sim)
            if sim > 0.9 and cur_name == df_minidocs['entity_name'][j]:
                # print(i, j, minidocs[i]['entity_name'], minidocs[j]['entity_name'])
                dup_docs.add(j)

        # if i == 3:
        #     break

    # exit()
    dup_docs_list = list(dup_docs)
    dup_docs_list.sort()
    print(dup_docs_list[:30])

    # TODO: mdid values are not corrected after dropping rows
    df_fil = df_minidocs.drop(dup_docs_list)
    with open(WC_MINIDOC_INFO_NODUP_FILE, 'w', encoding='utf-8',
              newline='\n') as fout:
        df_fil.to_csv(fout, index=False)

    commonutils.remove_lines(WC_MINIDOC_TEXT_FILE, dup_docs,
                             WC_MINIDOC_TEXT_NODUP_FILE)
    commonutils.remove_lines(WC_MINIDOC_TEXT_SEG_FILE, dup_docs,
                             WC_MINIDOC_TEXT_SEG_NODUP_FILE)
Example #6
def __process_quora():
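    """Cluster the 'DC' Quora answers with LDA.

    Assigns each answer to its most probable topic, then prints the document
    count and the top weighted words of every topic.
    """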
    cv = textvectorizer.CountVectorizer(QUORA_DF_FILE, 50, 10000, True)
    print(cv.n_words, 'words in vocabulary')

    name = 'DC'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    X = cv.get_vecs(contents)

    k = 10
    lda = LatentDirichletAllocation(k,
                                    learning_method='batch',
                                    doc_topic_prior=.1,
                                    topic_word_prior=0.01)
    X_new = lda.fit_transform(X)
    # for t in lda.components_:
    #     max_word_idxs = np.argpartition(-t, np.arange(10))[:10]
    #     for idx in max_word_idxs:
    #         print(cv.vocab[idx], end=' ')
    #     print()

    topic_cnts = {i: 0 for i in range(k)}
    for i, x in enumerate(X_new):
        max_topic_idxs = np.argpartition(-x, np.arange(3))[:3]
        topic_cnts[max_topic_idxs[0]] += 1
        # print(i + 1)
        # for tidx in max_topic_idxs:
        #     topic_dist = lda.components_[tidx]
        #     max_word_idxs = np.argpartition(-topic_dist, np.arange(10))[:10]
        #     topic_words = [cv.vocab[idx] for idx in max_word_idxs]
        #     print(x[tidx], ' '.join(topic_words))
        # print()
        # if i == 50:
        #     break
    for tidx, cnt in topic_cnts.items():
        print(tidx, cnt)
        max_word_idxs = np.argpartition(-lda.components_[tidx],
                                        np.arange(10))[:10]
        for idx in max_word_idxs:
            print('{}*{:.3f}'.format(cv.vocab[idx],
                                     lda.components_[tidx][idx]),
                  end=' ')
        print()
Example #7
def __filter_duplicate_docs():
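    """Remove near-duplicate documents.

    Marks document j as a duplicate when its cosine similarity with an earlier
    document exceeds 0.8, then writes the de-duplicated doc-info CSV and
    content files.
    """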
    all_doc_contents = commonutils.read_lines_to_list(WC_SEG_DOC_CONTENT_FILE)
    cv = textvectorizer.CountVectorizer((WC_DF_FILE, 100, 2000),
                                        remove_stopwords=True)
    print(cv.n_words, 'words in vocab')
    X = cv.get_vecs(all_doc_contents)
    n_docs = len(all_doc_contents)
    print(n_docs, 'docs', X.shape)
    dup_docs = set()
    for i, x1 in enumerate(X):
        if i % 100 == 0:
            print(i)
        # print(i)

        if i in dup_docs:
            continue

        for j in range(i + 1, n_docs):
            if j in dup_docs:
                continue
            sim = cosine_similarity(x1, X[j])
            # if 0.8 < sim < 0.9:
            #     print(i, j, sim)
            if sim > 0.8:
                dup_docs.add(j)

        # if i == 5:
        #     break

    # exit()
    # WC_DOC_INFO_FILE is assumed to be the doc-info CSV that pairs with WC_DOC_INFO_NODUP_FILE
    doc_info_df = pd.read_csv(WC_DOC_INFO_FILE)
    dup_docs_list = list(dup_docs)
    dup_docs_list.sort()
    print(dup_docs_list[:30])
    df_fil = doc_info_df.drop(dup_docs_list)
    with open(WC_DOC_INFO_NODUP_FILE, 'w', encoding='utf-8',
              newline='\n') as fout:
        df_fil.to_csv(fout, index=False)

    commonutils.remove_lines(WC_DOC_CONTENT_FILE, dup_docs,
                             WC_DOC_CONTENT_NODUP_FILE)
    commonutils.remove_lines(WC_SEG_DOC_CONTENT_FILE, dup_docs,
                             WC_SEG_DOC_CONTENT_NODUP_FILE)
Example #8
def __run_with_quora():
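    """Fit a TOPDMM topic model on the Quora answers of one entity name
    (here 'DC'), print the top words of each topic, and save the vocabulary
    and topic-word distributions to per-entity files.
    """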
    name = 'DC'
    # name = 'WP'
    # name = 'Austin'
    # name = 'Mark'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    cv = textvectorizer.CountVectorizer((QUORA_DF_FILE, 50, 6000),
                                        remove_stopwords=True,
                                        words_exist=words_exist)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents, normalize=False)
    # D_codoc = utils.get_codoc_matrix(cv.vocab, contents)

    # k = 3
    n_topic_words_disp = 10
    for k in range(10, 11):
        dmm = TOPDMM(k, 100, alpha=0.01, beta=0.01)
        dmm.fit(X)
        for t in dmm.topic_word_:
            widxs = np.argpartition(
                -t, range(n_topic_words_disp))[:n_topic_words_disp]
            topic_words = [cv.vocab[i] for i in widxs]
            print(' '.join(topic_words))

        # __show_coherences(k, dmm.topic_word_, D_codoc)

        test_vocab_file = os.path.join(QUORA_DATA_DIR,
                                       '{}_vocab.txt'.format(name))
        test_topic_file = os.path.join(QUORA_DATA_DIR,
                                       '{}_topics.txt'.format(name))
        dmm.save(cv.vocab, test_vocab_file, test_topic_file)
Example #9
def __process_quora():
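    """Run truncated SVD (LSA) on the 'DC' Quora answers and print each
    component's explained variance, singular value, and top-20 words.
    """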
    name = 'DC'
    # name = 'Mark'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    cv = textvectorizer.CountVectorizer((QUORA_DF_FILE, 50, 6000),
                                        remove_stopwords=True,
                                        words_exist=words_exist)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents, normalize=True)
    print(X.shape)

    k = 10
    tsvd = TruncatedSVD(n_components=k)
    X_new = tsvd.fit_transform(X)
    for i in range(k):
        max_idxs = np.argpartition(-tsvd.components_[i], range(20))[:20]
        words = [cv.vocab[idx] for idx in max_idxs]
        print(tsvd.explained_variance_[i], tsvd.singular_values_[i])
        print(words)