Code example #1
File: topdmm.py Project: hldai/indec
def __topdmm_wc_minidocs(name, dst_vocab_file, dst_topics_file):
    # all_doc_contents = utils.read_lines_to_list(WC_MINIDOC_TEXT_SEG_NODUP_FILE)
    # name_doc_dict = utils.load_entity_name_to_minidoc_file(WC_MINIDOC_INFO_NODUP_FILE)
    all_doc_contents = utils.read_lines_to_list(
        'd:/data/indec/docs-14k-minidocs-text-seg-new.txt')
    name_doc_dict = utils.load_entity_name_to_minidoc_file(
        'd:/data/indec/docs-14k-minidocs-info-new.txt')
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    # print(max(doc_idxs), len(all_doc_contents))
    print(len(contents), 'docs')

    common_words = utils.read_lines_to_list(COMMON_CH_WORDS_FILE)

    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    extra_exclude_words = set(common_words)
    extra_exclude_words.add(name)
    # extra_exclude_words = {name}
    if name == '姜子牙':
        extra_exclude_words.add('姜')
        extra_exclude_words.add('子牙')
    if name == '夏侯惇':
        extra_exclude_words.add('夏侯')
        extra_exclude_words.add('惇')
    cv = textvectorizer.CountVectorizer(
        (WC_DF_ND_FILE, 20, 700),
        remove_stopwords=True,
        words_exist=words_exist,
        extra_exclude_words=extra_exclude_words)
    print(len(cv.vocab), 'words in vocab')
    # print('吃' in cv.vocab)
    # exit()
    X = cv.get_vecs(contents, normalize=False)
    # D_codoc = utils.get_codoc_matrix(cv.vocab, contents)

    n_topic_words_disp = 10
    print('starting training ...')
    # for k in range(10, 11):
    k = 10
    dmm = TOPDMM(k, 80, alpha=0.01, beta=0.01, n_top=-1)
    dmm.fit(X)
    for t in dmm.topic_word_:
        widxs = np.argpartition(-t,
                                range(n_topic_words_disp))[:n_topic_words_disp]
        topic_words = [cv.vocab[i] for i in widxs]
        print(' '.join(topic_words))

    dmm.save(cv.vocab, dst_vocab_file, dst_topics_file)
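
A note on the top-word selection used above (and repeated in several later examples): np.argpartition(-t, range(n))[:n] returns the indices of the n largest entries of t, ordered from largest to smallest. A minimal standalone illustration, with made-up values that are not from the project:

import numpy as np

# Toy weight vector standing in for one row of topic_word_ (values made up).
t = np.array([0.1, 0.7, 0.3, 0.9, 0.05])
n = 3
# Negating t turns "largest" into "smallest"; partitioning on range(n) puts the
# n smallest of -t (i.e. the n largest of t) at the front, in sorted order.
top_idxs = np.argpartition(-t, range(n))[:n]
print(top_idxs)     # [3 1 2]
print(t[top_idxs])  # [0.9 0.7 0.3]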
Code example #2
File: urnlda.py Project: hldai/indec
def process_quora():
    name = 'DC'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    cv = textvectorizer.CountVectorizer((QUORA_DF_FILE, 50, 6000),
                                        remove_stopwords=True,
                                        words_exist=words_exist)
    print(len(cv.vocab), 'words in vocab')
    word_idfs = [
        np.log(QUORA_NUM_TOTAL_DOCS / cv.word_cnts[w]) for w in cv.vocab
    ]

    docs = list()
    for words in docs_words:
        doc = list()
        for w in words:
            widx = cv.word_dict.get(w, -1)
            if widx > -1:
                doc.append(widx)
        docs.append(doc)

    return docs, cv.vocab, word_idfs
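
The docs returned above are lists of vocabulary indices rather than count vectors. A minimal sketch, not part of the project, of turning one such doc back into a bag-of-words count array aligned with the returned vocab:

import numpy as np

def doc_to_counts(doc, n_words):
    # `doc` is a list of word indices into the vocabulary; accumulate counts.
    counts = np.zeros(n_words, dtype=np.int32)
    for widx in doc:
        counts[widx] += 1
    return counts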
Code example #3
File: topdmm.py Project: hldai/indec
def __topdmm_wc(name, dst_vocab_file, dst_topics_file):
    all_doc_contents = utils.read_lines_to_list(WC_SEG_DOC_CONTENT_NODUP_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(WC_NAME_DOC_ND_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    print(len(contents), 'docs')

    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    extra_exclude_words = {name}
    if name == '姜子牙':
        extra_exclude_words = {'姜', '子牙'}
    cv = textvectorizer.CountVectorizer(
        (WC_DF_ND_FILE, 20, 700),
        remove_stopwords=True,
        words_exist=words_exist,
        extra_exclude_words=extra_exclude_words)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents, normalize=False)
    # D_codoc = utils.get_codoc_matrix(cv.vocab, contents)

    n_topic_words_disp = 10
    print('starting training ...')
    # for k in range(10, 11):
    k = 10
    dmm = TOPDMM(k, 100, alpha=0.01, beta=0.01, n_top=-1)
    dmm.fit(X)
    for t in dmm.topic_word_:
        widxs = np.argpartition(-t,
                                range(n_topic_words_disp))[:n_topic_words_disp]
        topic_words = [cv.vocab[i] for i in widxs]
        print(' '.join(topic_words))

    dmm.save(cv.vocab, dst_vocab_file, dst_topics_file)
Code example #4
File: dpmfs.py Project: hldai/indec
def __run_quora():
    name = 'DC'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    cv = textvectorizer.CountVectorizer((QUORA_DF_FILE, 100, 5000),
                                        remove_stopwords=True,
                                        words_exist=words_exist)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents)

    parser = argparse.ArgumentParser()
    parser.add_argument("-lamb", "--lamb", type=float)
    parser.add_argument("-N", "--N", type=int)
    args = parser.parse_args()
    lamb = 1 if args.lamb is None else args.lamb
    N = 10 if args.N is None else args.N

    dst_file = os.path.join(QUORA_DATA_DIR, 'dpmfs_z_{}.txt'.format(lamb))
    print(dst_file)

    n_docs = len(doc_idxs)
    dpmfs = DPMFS(cv.n_words, N=N, n_docs=n_docs, n_iter=1000, lamb=lamb)
    dpmfs.fit(X, dst_file, cv.vocab)
    np.savetxt(dst_file, dpmfs.z, fmt='%d')
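
The snippet above fills in defaults of 1 and 10 by hand when --lamb and --N are not given; argparse can express the same defaults directly, as in this minimal sketch (not part of the project):

import argparse

# Minimal sketch: the same lamb/N defaults via argparse's `default=` instead of
# checking for None afterwards.
parser = argparse.ArgumentParser()
parser.add_argument('-lamb', '--lamb', type=float, default=1.0)
parser.add_argument('-N', '--N', type=int, default=10)
args = parser.parse_args(['--lamb', '0.5'])
print(args.lamb, args.N)  # 0.5 10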
Code example #5
File: urnlda.py Project: hldai/indec
def __check_topics():
    k = 10
    n_top = 10

    vocab = utils.read_lines_to_list(test_vocab_file)
    n_words = len(vocab)
    print(n_words, 'words')
    nzw = np.zeros((k, n_words), np.float32)
Code example #6
def __process_quora():
    tfidf = textvectorizer.TfIdf(QUORA_DF_FILE, 10, 24600, 143479, True)
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)

    eps = 0.7
    min_samples = 3

    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict['DC']
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    __dbscan_docs(contents, tfidf, eps, min_samples, result_dir_quora)
Code example #7
def __process_wechat():
    tfidf = textvectorizer.TfIdf(WC_DF_FILE, 5, 62000, 14357)
    all_doc_contents = utils.read_lines_to_list(WC_SEG_DOC_CONTENT_FILE)

    eps = 0.55
    min_samples = 20

    name_doc_dict = utils.load_entity_name_to_doc_file(WC_NAME_DOC_FILE)
    for name, doc_idxs in name_doc_dict.items():
        print(name)
        contents = [all_doc_contents[idx] for idx in doc_idxs]
        __dbscan_docs(contents, tfidf, eps, min_samples, result_dir_wc)
        break
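
The __dbscan_docs helper is not shown on this page, so the following is only an assumption-laden sketch of the usual role of the eps and min_samples values set above: clustering tf-idf document vectors with DBSCAN under cosine distance.

import numpy as np
from sklearn.cluster import DBSCAN

def dbscan_docs_sketch(doc_vecs, eps, min_samples):
    # Sketch only (not the project's implementation): cluster document vectors
    # with DBSCAN using cosine distance; label -1 marks noise documents.
    labels = DBSCAN(eps=eps, min_samples=min_samples,
                    metric='cosine').fit_predict(doc_vecs)
    return labels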
Code example #8
File: edlda.py Project: hldai/indec
def __process_quora():
    cv = textvectorizer.CountVectorizer(QUORA_DF_FILE, 50, 10000, True)
    print(cv.n_words, 'words in vocabulary')

    name = 'DC'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    X = cv.get_vecs(contents)

    k = 10
    lda = LatentDirichletAllocation(k,
                                    learning_method='batch',
                                    doc_topic_prior=.1,
                                    topic_word_prior=0.01)
    X_new = lda.fit_transform(X)
    # for t in lda.components_:
    #     max_word_idxs = np.argpartition(-t, np.arange(10))[:10]
    #     for idx in max_word_idxs:
    #         print(cv.vocab[idx], end=' ')
    #     print()

    topic_cnts = {i: 0 for i in range(k)}
    for i, x in enumerate(X_new):
        max_topic_idxs = np.argpartition(-x, np.arange(3))[:3]
        topic_cnts[max_topic_idxs[0]] += 1
        # print(i + 1)
        # for tidx in max_topic_idxs:
        #     topic_dist = lda.components_[tidx]
        #     max_word_idxs = np.argpartition(-topic_dist, np.arange(10))[:10]
        #     topic_words = [cv.vocab[idx] for idx in max_word_idxs]
        #     print(x[tidx], ' '.join(topic_words))
        # print()
        # if i == 50:
        #     break
    for tidx, cnt in topic_cnts.items():
        print(tidx, cnt)
        max_word_idxs = np.argpartition(-lda.components_[tidx],
                                        np.arange(10))[:10]
        for idx in max_word_idxs:
            print('{}*{:.3f}'.format(cv.vocab[idx],
                                     lda.components_[tidx][idx]),
                  end=' ')
        print()
Code example #9
def __check_data():
    docs_file = 'd:/data/indec/title_content_new_entities-09-08.csv'
    names_file = 'd:/data/indec/ambig-names-from-wiki-wz-dhl.txt'
    names = utils.read_lines_to_list(names_file)

    name_docs_dict = {n: list() for n in names}
    f = open(docs_file, encoding='utf-8')
    next(f)
    for i, line in enumerate(f):
        p = line.find(',')
        title, content = line[:p], line[p + 1:]
        for name in names:
            if name in content:
                name_docs_dict[name].append(i)

        if '王刚' in content:
            print(content)
    f.close()
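
The snippet above splits each line on its first comma, which cuts in the wrong place if a quoted title itself contains a comma. A minimal sketch, not from the project and assuming the file is a properly quoted two-column CSV, of the same title/content iteration via the csv module:

import csv

def iter_title_content(docs_file):
    # Yields (doc index, title, content); the csv module handles quoting
    # instead of splitting on the first comma manually.
    with open(docs_file, encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row, as the snippet above does
        for i, row in enumerate(reader):
            yield i, row[0], row[1]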
Code example #10
File: topdmm.py Project: hldai/indec
def __run_with_quora():
    name = 'DC'
    # name = 'WP'
    # name = 'Austin'
    # name = 'Mark'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    cv = textvectorizer.CountVectorizer((QUORA_DF_FILE, 50, 6000),
                                        remove_stopwords=True,
                                        words_exist=words_exist)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents, normalize=False)
    # D_codoc = utils.get_codoc_matrix(cv.vocab, contents)

    # k = 3
    n_topic_words_disp = 10
    for k in range(10, 11):
        dmm = TOPDMM(k, 100, alpha=0.01, beta=0.01)
        dmm.fit(X)
        for t in dmm.topic_word_:
            widxs = np.argpartition(
                -t, range(n_topic_words_disp))[:n_topic_words_disp]
            topic_words = [cv.vocab[i] for i in widxs]
            print(' '.join(topic_words))

        # __show_coherences(k, dmm.topic_word_, D_codoc)

        test_vocab_file = os.path.join(QUORA_DATA_DIR,
                                       '{}_vocab.txt'.format(name))
        test_topic_file = os.path.join(QUORA_DATA_DIR,
                                       '{}_topics.txt'.format(name))
        dmm.save(cv.vocab, test_vocab_file, test_topic_file)
Code example #11
File: lsa.py Project: hldai/indec
def __process_quora():
    name = 'DC'
    # name = 'Mark'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    cv = textvectorizer.CountVectorizer((QUORA_DF_FILE, 50, 6000),
                                        remove_stopwords=True,
                                        words_exist=words_exist)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents, normalize=True)
    print(X.shape)

    k = 10
    tsvd = TruncatedSVD(n_components=k)
    X_new = tsvd.fit_transform(X)
    for i in range(k):
        max_idxs = np.argpartition(-tsvd.components_[i], range(20))[:20]
        words = [cv.vocab[idx] for idx in max_idxs]
        print(tsvd.explained_variance_[i], tsvd.singular_values_[i])
        print(words)
Code example #12
File: topicmodel.py Project: hldai/indec
def __init__(self, vocab_file, topic_file):
    self.vocab = utils.read_lines_to_list(vocab_file)
    df = pd.read_csv(topic_file, header=None)
    # DataFrame.as_matrix() has been removed from pandas; .values returns the
    # same ndarray of per-topic word weights.
    self.topics = df.values
    self.n_topics = len(self.topics)
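
This constructor loads the vocab and topics files written by the dmm.save(cv.vocab, ...) calls in the TOPDMM examples above. A minimal sketch, not part of the project, of printing the highest-weighted words of each loaded topic, assuming each row of the topics file is a word-weight vector aligned with the vocab file:

import numpy as np

def print_top_words(vocab, topics, n_top=10):
    # For each topic row, take the indices of the n_top largest weights
    # (largest first) and print the corresponding vocabulary words.
    for t in np.asarray(topics, dtype=np.float64):
        widxs = np.argpartition(-t, range(n_top))[:n_top]
        print(' '.join(vocab[i] for i in widxs))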