def main(we_file, w2i_file, use_brown=True, n_files=50):
    if use_brown:
        cc_matrix = "cc_matrix_brown.npy"
    else:
        cc_matrix = "cc_matrix_%s.npy" % n_files

    # hacky way of checking if we need to re-load the raw data or not
    # remember, only the co-occurrence matrix is needed for training
    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = [] # dummy - we won't actually use it
    else:
        if use_brown:
            keep_words = set([
                'king', 'man', 'woman',
                'france', 'paris', 'london', 'rome', 'italy', 'britain', 'england',
                'french', 'english', 'japan', 'japanese', 'chinese', 'italian',
                'australia', 'australian', 'december', 'november', 'june',
                'january', 'february', 'march', 'april', 'may', 'july', 'august',
                'september', 'october',
            ])
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=5000, keep_words=keep_words)
        else:
            sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)
        
        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(100, V, 10)
    model.fit(sentences, cc_matrix=cc_matrix, epochs=200)
    model.save(we_file)
Example #2
def main(we_file, w2i_file, use_brown=True, n_files=100):
    if use_brown:
        cc_matrix = "cc_matrix_brown.npy"
    else:
        cc_matrix = "cc_matrix_%s.npy" % n_files

    # hacky way of checking if we need to re-load the raw data or not
    # remember, only the co-occurrence matrix is needed for training
    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = [] # dummy - we won't actually use it
    else:
        if use_brown:
            keep_words = set([
                'king', 'man', 'woman',
                'france', 'paris', 'london', 'rome', 'italy', 'britain', 'england',
                'french', 'english', 'japan', 'japanese', 'chinese', 'italian',
                'australia', 'australian', 'december', 'november', 'june',
                'january', 'february', 'march', 'april', 'may', 'july', 'august',
                'september', 'october',
            ])
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=5000, keep_words=keep_words)
        else:
            sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)
        
        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(100, V, 10)

    # alternating least squares method
    model.fit(sentences, cc_matrix=cc_matrix)
    model.save(we_file)
Example #3
def main():

    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(2000)
    V = len(word2idx)
    print("Vocab size:", V)

    start_idx = word2idx['START']
    end_idx = word2idx['END']

    estimator = SoftmaxANN(K=V, start_idx=start_idx, end_idx=end_idx)
    estimator.fit(sentences, D=V, epochs=1, lr=0.1, freq=50, is_sent=True)
def main():
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    # sentences, word2idx = get_wikipedia_data(n_files=10, n_vocab=1500, by_paragraph=True)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    for j, sentence in enumerate(sentences):
        for i in sentence:
            A[i, j] += 1
    print("finished getting raw counts")

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A)
    # print "type(A):", type(A)
    # exit()
    A = A.toarray()

    idx2word = {v: k for k, v in word2idx.items()}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(V):
        try:
            plt.annotate(idx2word[i], xy=(Z[i, 0], Z[i, 1]))
        except Exception:
            print("bad string:", idx2word[i])
    plt.show()

    # create a higher-D word embedding, try word analogies
    # tsne = TSNE(n_components=3)
    # We = tsne.fit_transform(A)
    We = Z
    find_analogies('king', 'man', 'woman', We, word2idx)
    find_analogies('france', 'paris', 'london', We, word2idx)
    find_analogies('france', 'paris', 'rome', We, word2idx)
    find_analogies('paris', 'france', 'italy', We, word2idx)
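
# find_analogies() above comes from the course utilities and is not shown in this
# snippet; the function below is only a minimal sketch of the usual vector-arithmetic
# approach, assuming We has one embedding row per index in word2idx.
import numpy as np

def find_analogies_sketch(w1, w2, w3, We, word2idx):
    # solve w1 - w2 = ? - w3  =>  ? ~= w1 - w2 + w3
    idx2word = {i: w for w, i in word2idx.items()}
    v = We[word2idx[w1]] - We[word2idx[w2]] + We[word2idx[w3]]
    distances = np.linalg.norm(We - v, axis=1)  # distance from v to every embedding
    for idx in distances.argsort():             # nearest candidates first
        if idx2word[idx] not in (w1, w2, w3):   # skip the query words themselves
            print("%s - %s = %s - %s" % (w1, w2, idx2word[idx], w3))
            break
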
def main(use_brown=True):
    if use_brown:
        sentences, word2idx = get_sentences_with_word2idx_limit_vocab()
        # sentences, word2idx = get_sentences_with_word2idx()
        # sentences, word2idx = get_text8()
    else:
        sentences, word2idx = get_wikipedia_data(n_files=1, n_vocab=2000)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    V = len(word2idx)
    model = Model(50, V, 5)

    # use numpy
    # model.fit(sentences, learning_rate=1e-3, mu=0, epochs=5, num_neg_samples=5)

    # use theano
    model.fitt(sentences, learning_rate=1e-3, mu=0, epochs=5, num_neg_samples=5)

    model.save('w2v_model.npz')
Example #6
'''
TF-IDF is a common NLP tool for turning a list of text documents into a matrix
representation. Each document becomes one row of the TF-IDF matrix, and each word
is stored in a column. The vocabulary size (the number of columns) is a parameter
that should be specified; a vocabulary of the 5,000-10,000 most common words is
usually enough.

A TF-IDF vector is sparse: the number of non-zero values in the vector
representation always equals the number of unique words in the document.

During fitting, the tf-idf function finds the most common words in the corpus and
saves them to the vocabulary. A document is transformed by counting how many times
each word in the vocabulary appears in it, so the tf-idf matrix has shape
[Number_documents, Size_of_vocabulary]. Each word's weight is normalized by how
often the word appears in the corpus.

(A minimal TfidfTransformer sketch of this transform follows the code below.)
'''
### choose a data source ###
# get corpus(Brown) and word2idx
sentences, word2idx = get_sentences_with_word2idx_limit_vocab()
print('finished retrieving data')
print('vocab size:', len(word2idx), 'number of sentences:', len(sentences))

# print(sentences[15]) # peek at an arbitrary sentence -> [13, 2000, 74, 48, 1578, 26, 19, 2000, 1276, 16, 91, 1143, 160, 2000, 38, 248, 20, 102, 218, 2000, 2000, 27, 15]

## build the term document matrix, just like what CountVectorizer() does
V = len(word2idx)
N = len(sentences)
A = np.zeros((N, V))  # [Number_documents,Size_of_vocabulary]
print('N:', N, 'V:', V)

for i in range(N):
    for j in sentences[i]:
        A[i, j] += 1
print('finished building term document matrix A, A.shape:', A.shape)
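
# The docstring above describes the tf-idf step itself, but this snippet stops after
# building the raw count matrix A; the lines below are a minimal sketch of that
# remaining step, assuming the same A with shape [Number_documents, Size_of_vocabulary].
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer()
A_tfidf = transformer.fit_transform(A)  # sparse matrix, same [N, V] shape as A
print('tf-idf matrix shape:', A_tfidf.shape)
print('non-zeros in row 0 (== unique words in document 0):', A_tfidf[0].nnz)
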
from datetime import datetime

import os
import sys
sys.path.append(os.path.abspath('..'))
from rnn_class.util import get_wikipedia_data
from rnn_class.brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx

from markov import get_bigram_probs


if __name__ == '__main__':
  # load in the data
  # note: sentences are already converted to sequences of word indexes
  # note: you can limit the vocab size if you run out of memory
  sentences, word2idx = get_sentences_with_word2idx_limit_vocab(2000)
  # sentences, word2idx = get_sentences_with_word2idx()

  # vocab size
  V = len(word2idx)
  print("Vocab size:", V)

  # we will also treat beginning of sentence and end of sentence as bigrams
  # START -> first word
  # last word -> END
  start_idx = word2idx['START']
  end_idx = word2idx['END']


  # a matrix where:
  # row = last word
Example #8
def main(we_file, w2i_file, use_brown=True, n_files=100):
    if use_brown:
        cc_matrix = 'cc_matrix_brown.npy'
    else:
        cc_matrix = 'cc_matrix_%s.npy' % n_files

    # hacky way of checking if we need to re-load the raw data or not
    # remember, only the co-occurrence matrix is needed for training
    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)  # loaded back as a plain dict
        sentences = []  # dummy - we won't actually use it
    else:
        if use_brown:
            keep_words = set([
                'king',
                'man',
                'woman',
                'france',
                'paris',
                'london',
                'rome',
                'italy',
                'britain',
                'england',
                'french',
                'english',
                'japan',
                'japanese',
                'chinese',
                'italian',
                'australia',
                'australian',
                'december',
                'november',
                'june',
                'january',
                'february',
                'march',
                'april',
                'may',
                'july',
                'august',
                'september',
                'october',
            ])
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(
                n_vocab=5000, keep_words=keep_words)
        else:
            # sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)   # I don't actually use this
            pass

        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(80, V, 10)

    # ALS method
    model.fit(sentences=sentences, cc_matrix=cc_matrix, epochs=20)

    # gradient descent method (a rough sketch of the GloVe objective appears after this example)
    # model.fit(
    #     sentences,
    #     cc_matrix=cc_matrix,
    #     learning_rate=5e-4,
    #     reg=0.1,
    #     epochs=500,     # note: about 500 epochs are needed!!
    #     gd=True,
    # )

    model.save(we_file)
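
# The Glove class used in these snippets is not shown here; as a rough illustration of
# what the commented-out "gradient descent method" above optimizes, this is a minimal
# full-batch sketch of one update on the GloVe objective
#     J = sum_ij f(X_ij) * (W_i . U_j + b_i + c_j - log(X_ij + 1))^2
# The weighting f, the log(X + 1) shift, and all names below are assumptions, not the
# course's implementation.
import numpy as np

def glove_gradient_step(X, W, U, b, c, learning_rate=5e-4, reg=0.1, alpha=0.75, xmax=100):
    fX = np.minimum((X / xmax) ** alpha, 1.0)                # weighting f(X_ij), capped at 1
    delta = W @ U.T + b[:, None] + c[None, :] - np.log(X + 1)
    grad = 2 * fX * delta                                    # dJ/d(prediction), shape (V, V)
    dW = grad @ U + reg * W                                  # dJ/dW_i = sum_j grad_ij * U_j
    dU = grad.T @ W + reg * U                                # dJ/dU_j = sum_i grad_ij * W_i
    db = grad.sum(axis=1)                                    # dJ/db_i
    dc = grad.sum(axis=0)                                    # dJ/dc_j
    W -= learning_rate * dW
    U -= learning_rate * dU
    b -= learning_rate * db
    c -= learning_rate * dc
    return W, U, b, c
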
def get_bigram_probs(sentences, V, start_idx, end_idx):
    # count bigram occurrences, treating START -> first word and last word -> END
    # as bigrams too; start from ones (add-one smoothing) so no bigram has zero probability
    bigram_probs = np.ones((V, V))
    for sentence in sentences:
        for i in range(len(sentence)):
            if i == 0:
                bigram_probs[start_idx, sentence[i]] += 1
            else:
                bigram_probs[sentence[i - 1], sentence[i]] += 1
            if i == len(sentence) - 1:
                bigram_probs[sentence[i], end_idx] += 1

    # normalize the counts along the rows to get probabilities (divide each row by its sum)
    bigram_probs /= bigram_probs.sum(axis=1, keepdims=True)
    return bigram_probs


if __name__ == '__main__':
    # load in the data
    # note: sentences are already converted to sequences of word indexes
    # note: you can limit the vocab size if you run out of memory
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(
        10000
    )  # sentences: lists of word indexes like [11, 22, 33]; word2idx: a dict like {'lala': 1, ...}
    # sentences, word2idx = get_sentences_with_word2idx()

    # vocab size
    V = len(word2idx)
    print("Vocab size:", V)

    # we will also treat beginning of sentence and end of sentence as bigrams
    # START -> first word
    # last word -> END
    start_idx = word2idx['START']
    end_idx = word2idx['END']

    # a matrix where:
    # row = last word