Code Example #1
File: distances.py  Project: taohong08/HOTT
import numpy as np

# `sparse_ot` is assumed to be defined alongside this function in distances.py.


def wmd(p, q, C, truncate=None):
    """Word mover's distance between distributions p and q with cost C."""
    if truncate is None:
        return sparse_ot(p, q, C)

    # Avoid changing p and q outside of this function
    p, q = np.copy(p), np.copy(q)

    # Zero all but the `truncate` largest entries of each distribution;
    # the remaining mass is not renormalized here.
    to_0_p_idx = np.argsort(-p)[truncate:]
    p[to_0_p_idx] = 0
    to_0_q_idx = np.argsort(-q)[truncate:]
    q[to_0_q_idx] = 0

    return sparse_ot(p, q, C)
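
A minimal, self-contained sketch of the truncation step above on a toy distribution (made-up numbers, NumPy only):

import numpy as np

p = np.array([0.05, 0.40, 0.10, 0.25, 0.15, 0.05])  # toy distribution
truncate = 3

p_trunc = np.copy(p)
p_trunc[np.argsort(-p)[truncate:]] = 0  # zero all but the 3 largest masses
print(p_trunc)  # [0.   0.4  0.   0.25 0.15 0.  ]

As in wmd itself, the surviving mass is left unnormalized before being handed to sparse_ot.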
Code Example #2
File: data.py  Project: taohong08/HOTT
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

# load_wmd_data, change_embeddings, reduce_vocab, fit_topics, and sparse_ot
# are assumed to be importable from elsewhere in the HOTT repository.


def loader(data_path,
           embeddings_path,
           p=1,
           K_lda=70,
           glove_embeddings=True,
           stemming=True,
           n_words_keep=20):
    """Load a dataset and word embeddings from the given paths."""
    # Load dataset from data_path
    vocab, embed_vocab, bow_data, y = load_wmd_data(data_path)
    y = y - 1  # shift labels to start at 0

    # Use GloVe word embeddings
    if glove_embeddings:
        vocab, embed_vocab, bow_data = change_embeddings(
            vocab, bow_data, embeddings_path)
    # Reduce vocabulary by removing short words, stop words, and stemming
    if stemming:
        vocab, embed_vocab, bow_data = reduce_vocab(bow_data,
                                                    vocab,
                                                    embed_vocab,
                                                    embed_aggregate='mean')

    # Matrix of word embeddings
    embeddings = np.array([embed_vocab[w] for w in vocab])

    topics, lda_centers, topic_proportions = fit_topics(
        bow_data, embeddings, vocab, K_lda)

    cost_embeddings = euclidean_distances(embeddings, embeddings)**p
    cost_topics = np.zeros((topics.shape[0], topics.shape[0]))

    # Keep only the top-`n_words_keep` words of each topic
    if n_words_keep is not None:
        for k in range(K_lda):
            to_0_idx = np.argsort(-topics[k])[n_words_keep:]
            topics[k][to_0_idx] = 0

    for i in range(cost_topics.shape[0]):
        for j in range(i + 1, cost_topics.shape[1]):
            cost_topics[i, j] = sparse_ot(topics[i], topics[j],
                                          cost_embeddings)
    cost_topics = cost_topics + cost_topics.T

    out = {
        'X': bow_data,
        'y': y,
        'embeddings': embeddings,
        'topics': topics,
        'proportions': topic_proportions,
        'cost_E': cost_embeddings,
        'cost_T': cost_topics
    }

    return out
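
The nested loop near the end fills only the upper triangle of the K_lda x K_lda topic-cost matrix and then mirrors it, which is valid because the transport cost is symmetric in its two arguments when the ground cost is symmetric. A standalone sketch of that pattern, with a hypothetical toy distance standing in for sparse_ot:

import numpy as np

def pairwise_symmetric(rows, dist):
    """Fill the upper triangle with dist(), then mirror it."""
    n = len(rows)
    cost = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            cost[i, j] = dist(rows[i], rows[j])
    return cost + cost.T  # the diagonal stays zero

C = pairwise_symmetric(np.random.rand(4, 8),
                       lambda a, b: np.abs(a - b).sum())  # toy L1 distance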
Code Example #3
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

# transform_dataframe, gen_data, reduce_vocab, fit_topics, and sparse_ot
# are assumed to be project-local helpers.


def load_data(df, embed_path, stemming=True, K=70, p=1, n_word_keep=20):
    data, y = transform_dataframe(df)
    y = y - 1  # shift labels to start at 0

    if stemming:
        vocab1, embed_vocab1, vocab_, vocab_count, bow_data1 = gen_data(data, embed_path)
        vocab, embed_vocab, bow_data = reduce_vocab(vocab1,
                                                    embed_vocab1,
                                                    bow_data1,
                                                    embed_aggregate='mean')
    else:
        vocab, embed_vocab, vocab_, vocab_count, bow_data = gen_data(data, embed_path)

    embeddings = np.array([embed_vocab[w] for w in vocab])

    topics, lda_centers, topic_proportions, topics_words = fit_topics(
        bow_data, embeddings, vocab, K)

    cost_embeddings = euclidean_distances(embeddings, embeddings) ** p
    cost_topics = np.zeros((topics.shape[0], topics.shape[0]))

    for k in range(K):
        to_0_idx = np.argsort(-topics[k])[n_word_keep:]
        topics[k][to_0_idx] = 0

    for i in range(cost_topics.shape[0]):
        for j in range(i + 1, cost_topics.shape[1]):
            cost_topics[i, j] = sparse_ot(topics[i], topics[j], cost_embeddings)
    cost_topics = cost_topics + cost_topics.T

    out = {
        'X': bow_data,
        'y': y,
        'text': vocab_,
        'embeddings': embeddings,
        'topics': topics,
        'proportions': topic_proportions,
        'topic_words': topics_words,
        'cost_E': cost_embeddings,
        'cost_T': cost_topics
    }

    return out
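
The word-to-word ground cost in these loaders is the Euclidean distance between embeddings raised to the power p (p=1 by default). A minimal sketch with made-up embeddings:

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

E = np.random.rand(5, 3)  # 5 hypothetical words in a 3-dimensional space
p = 1
cost_E = euclidean_distances(E, E) ** p  # (5, 5), symmetric, zero diagonal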
Code Example #4
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

# As in Example #3, transform_dataframe, gen_data, reduce_vocab, fit_topics,
# and sparse_ot are assumed to be project-local helpers.


def load_data(df,
              embed_path,
              stemming=True,
              K=70,
              p=1,
              n_word_keep=20,
              section='newDesk',
              balance=False):

    # Both label schemes follow the same call pattern, so the two
    # section branches collapse into one.
    if section in ('sectionName', 'newDesk'):
        if balance:
            data, y, lib = transform_dataframe(df, section=section, balance=True)
        else:
            data, y, lib = transform_dataframe(df, section=section)

    y = y - 1  # shift labels to start at 0

    if stemming:
        vocab1, embed_vocab1, bow_data1 = gen_data(data, embed_path)
        print("stemming")
        vocab, embed_vocab, bow_data = reduce_vocab(vocab1,
                                                    embed_vocab1,
                                                    bow_data1,
                                                    embed_aggregate='mean')
    else:
        vocab, embed_vocab, bow_data = gen_data(data, embed_path)

    embeddings = np.array([embed_vocab[w] for w in vocab])

    print("computing LDA")
    topics, lda_centers, topic_proportions, topics_words = fit_topics(
        bow_data, embeddings, vocab, K)

    print("computing distance")
    cost_embeddings = euclidean_distances(embeddings, embeddings)**p
    cost_topics = np.zeros((topics.shape[0], topics.shape[0]))

    for k in range(K):
        to_0_idx = np.argsort(-topics[k])[n_word_keep:]
        topics[k][to_0_idx] = 0

    print("computing optimal transport calculation")
    for i in range(cost_topics.shape[0]):
        for j in range(i + 1, cost_topics.shape[1]):
            cost_topics[i, j] = sparse_ot(topics[i], topics[j],
                                          cost_embeddings)
    cost_topics = cost_topics + cost_topics.T

    out = {
        'vocab': vocab,
        'X': bow_data,
        'y': y,
        'lib': lib,
        'text': data,
        'embeddings': embeddings,
        'topics': topics,
        'proportions': topic_proportions,
        'topic_words': topics_words,
        'cost_E': cost_embeddings,
        'cost_T': cost_topics
    }

    return out
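
A design note on the per-topic truncation loop shared by Examples #2 through #4: the same effect can be achieved in one vectorized step. A sketch on made-up topic weights (an alternative formulation, not the original code):

import numpy as np

rng = np.random.default_rng(0)
topics = rng.random((4, 10))  # hypothetical K=4 topics over a 10-word vocabulary
n_word_keep = 3

# Zero everything outside each row's top-`n_word_keep` entries.
drop = np.argsort(-topics, axis=1)[:, n_word_keep:]
np.put_along_axis(topics, drop, 0, axis=1)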