def main(we_file, w2i_file, use_brown=True, n_files=50):
    if use_brown:
        cc_matrix = "cc_matrix_brown.npy"
    else:
        cc_matrix = "cc_matrix_%s.npy" % n_files

    # hacky way of checking if we need to re-load the raw data or not
    # remember, only the co-occurrence matrix is needed for training
    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []  # dummy - we won't actually use it
    else:
        if use_brown:
            keep_words = set([
                'king', 'man', 'woman',
                'france', 'paris', 'london', 'rome', 'italy', 'britain', 'england',
                'french', 'english', 'japan', 'japanese', 'chinese', 'italian',
                'australia', 'australian', 'december', 'november', 'june',
                'january', 'february', 'march', 'april', 'may', 'july', 'august',
                'september', 'october',
            ])
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=5000, keep_words=keep_words)
        else:
            sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)

        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(100, V, 10)
    model.fit(sentences, cc_matrix=cc_matrix, epochs=200)
    model.save(we_file)
def main(we_file, w2i_file, use_brown=True, n_files=100):
    if use_brown:
        cc_matrix = "cc_matrix_brown.npy"
    else:
        cc_matrix = "cc_matrix_%s.npy" % n_files

    # hacky way of checking if we need to re-load the raw data or not
    # remember, only the co-occurrence matrix is needed for training
    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []  # dummy - we won't actually use it
    else:
        if use_brown:
            keep_words = set([
                'king', 'man', 'woman',
                'france', 'paris', 'london', 'rome', 'italy', 'britain', 'england',
                'french', 'english', 'japan', 'japanese', 'chinese', 'italian',
                'australia', 'australian', 'december', 'november', 'june',
                'january', 'february', 'march', 'april', 'may', 'july', 'august',
                'september', 'october',
            ])
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=5000, keep_words=keep_words)
        else:
            sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)

        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(100, V, 10)

    # alternating least squares method
    model.fit(sentences, cc_matrix=cc_matrix)
    model.save(we_file)
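# ---------------------------------------------------------------------------
# For reference, a minimal sketch of the weighted least-squares objective that
# GloVe fits from the co-occurrence matrix. The names W, U, b, c, X below are
# illustrative assumptions, not the actual attributes of the Glove class used
# in the scripts above.
import numpy as np

def glove_cost(W, U, b, c, X, alpha=0.75, xmax=100):
    # f(X): down-weight very frequent co-occurrences, capped at 1
    fX = np.minimum((X / float(xmax)) ** alpha, 1.0)
    logX = np.log(X + 1)  # +1 so zero counts don't blow up; fX is 0 there anyway
    # delta[i, j] = w_i . u_j + b_i + c_j - log X_ij
    delta = W.dot(U.T) + b[:, None] + c[None, :] - logX
    return np.sum(fX * delta * delta)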
def main():
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(2000)

    V = len(word2idx)
    print("Vocab size:", V)

    start_idx = word2idx['START']
    end_idx = word2idx['END']

    estimator = SoftmaxANN(K=V, start_idx=start_idx, end_idx=end_idx)
    estimator.fit(sentences, D=V, epochs=1, lr=0.1, freq=50, is_sent=True)
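# ---------------------------------------------------------------------------
# SoftmaxANN is defined elsewhere in the repo. As a rough mental model (an
# assumption, not its actual internals), a softmax bigram estimator maps the
# previous word to a distribution over the next word via a V x V weight matrix:
import numpy as np

def softmax_bigram_predict(W, prev_idx):
    # W: shape (V, V); row = previous word, column = next word
    logits = W[prev_idx]
    e = np.exp(logits - logits.max())  # numerically stabilized softmax
    return e / e.sum()                 # probability distribution over the next word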
def main():
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    # sentences, word2idx = get_wikipedia_data(n_files=10, n_vocab=1500, by_paragraph=True)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print("finished getting raw counts")

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A)
    # print("type(A):", type(A))
    # exit()
    A = A.toarray()

    idx2word = {v: k for k, v in word2idx.items()}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(V):
        try:
            plt.annotate(idx2word[i], xy=(Z[i, 0], Z[i, 1]))
        except Exception:
            print("bad string:", idx2word[i])
    plt.show()

    # create a higher-D word embedding, try word analogies
    # tsne = TSNE(n_components=3)
    # We = tsne.fit_transform(A)
    We = Z

    find_analogies('king', 'man', 'woman', We, word2idx)
    find_analogies('france', 'paris', 'london', We, word2idx)
    find_analogies('france', 'paris', 'rome', We, word2idx)
    find_analogies('paris', 'france', 'italy', We, word2idx)
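# ---------------------------------------------------------------------------
# find_analogies is defined elsewhere in the repo; a rough sketch of the usual
# vector-arithmetic version is shown here (assumptions: Euclidean distance,
# and the three query words are excluded from the candidates).
import numpy as np

def find_analogies_sketch(w1, w2, w3, We, word2idx):
    idx2word = {v: k for k, v in word2idx.items()}
    # e.g. king - man + woman should land near queen
    v = We[word2idx[w1]] - We[word2idx[w2]] + We[word2idx[w3]]
    distances = np.linalg.norm(We - v, axis=1)
    for i in distances.argsort():
        if idx2word[i] not in (w1, w2, w3):
            print("%s - %s + %s = %s" % (w1, w2, w3, idx2word[i]))
            return idx2word[i]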
def main(use_brown=True):
    if use_brown:
        sentences, word2idx = get_sentences_with_word2idx_limit_vocab()
        # sentences, word2idx = get_sentences_with_word2idx()
        # sentences, word2idx = get_text8()
    else:
        sentences, word2idx = get_wikipedia_data(n_files=1, n_vocab=2000)

    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    V = len(word2idx)
    model = Model(50, V, 5)

    # use numpy
    # model.fit(sentences, learning_rate=1e-3, mu=0, epochs=5, num_neg_samples=5)

    # use theano
    model.fitt(sentences, learning_rate=1e-3, mu=0, epochs=5, num_neg_samples=5)

    model.save('w2v_model.npz')
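# ---------------------------------------------------------------------------
# A rough sketch of one skip-gram negative-sampling update, to show what
# num_neg_samples controls above. This is not the Model class's actual code;
# W_in / W_out and the function name are illustrative assumptions.
import numpy as np

def sgns_step(W_in, W_out, in_idx, ctx_idx, neg_indices, lr=1e-3):
    # targets: one true context word plus several sampled negative words
    targets = np.concatenate(([ctx_idx], neg_indices))
    labels = np.zeros(len(targets))
    labels[0] = 1.0
    h = W_in[in_idx]                              # input word vector, shape (D,)
    logits = W_out[targets].dot(h)
    p = 1.0 / (1.0 + np.exp(-logits))             # sigmoid
    err = p - labels                              # gradient of binary cross-entropy
    W_in[in_idx] -= lr * err.dot(W_out[targets])  # update the input vector
    W_out[targets] -= lr * np.outer(err, h)       # update the output vectors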
TF-IDF is a common NLP tool for converting a list of text documents into a matrix representation.
Each document becomes one row of the TF-IDF matrix, and each word is stored in a column.
The vocabulary size (the number of columns) is a parameter you should specify;
a vocabulary of the 5,000-10,000 most common words is usually enough.
TF-IDF vectors are sparse: the number of non-zero values in a document's vector
always equals the number of unique words in that document.
During fitting, the tf-idf function finds the most common words in the corpus and saves them to the vocabulary.
A document is transformed by counting how many times each vocabulary word appears in it,
so the tf-idf matrix has shape [number_of_documents, vocabulary_size].
Each word's weight is then normalized by the number of times it appears in the corpus.
'''

### choose a data source ###
# get corpus (Brown) and word2idx
sentences, word2idx = get_sentences_with_word2idx_limit_vocab()
print('finished retrieving data')
print('vocab size:', len(word2idx), 'number of sentences:', len(sentences))
# print(sentences[15])  # peek at an arbitrary sentence ->
#   [13, 2000, 74, 48, 1578, 26, 19, 2000, 1276, 16, 91, 1143, 160, 2000, 38, 248, 20, 102, 218, 2000, 2000, 27, 15]

# build term document matrix, just like what CountVectorizer() does
V = len(word2idx)
N = len(sentences)
A = np.zeros((N, V))  # [number_of_documents, vocabulary_size]
print('N:', N, 'V:', V)
for i in range(N):
    for j in sentences[i]:
        A[i, j] += 1
print('finished building term document matrix A, A.shape:', A.shape)
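# ---------------------------------------------------------------------------
# A minimal sketch (assuming scikit-learn is available) of applying the tf-idf
# re-weighting described in the docstring above to the raw count matrix A;
# the variable name A_tfidf is just illustrative.
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer()
A_tfidf = transformer.fit_transform(A)  # sparse matrix of shape [N, V]
print('tf-idf matrix shape:', A_tfidf.shape)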
from datetime import datetime
import os
import sys
sys.path.append(os.path.abspath('..'))

from rnn_class.util import get_wikipedia_data
from rnn_class.brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx

from markov import get_bigram_probs


if __name__ == '__main__':
    # load in the data
    # note: sentences are already converted to sequences of word indexes
    # note: you can limit the vocab size if you run out of memory
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(2000)
    # sentences, word2idx = get_sentences_with_word2idx()

    # vocab size
    V = len(word2idx)
    print("Vocab size:", V)

    # we will also treat beginning of sentence and end of sentence as bigrams
    # START -> first word
    # last word -> END
    start_idx = word2idx['START']
    end_idx = word2idx['END']

    # a matrix where:
    # row = last word
def main(we_file, w2i_file, use_brown=True, n_files=100):
    if use_brown:
        cc_matrix = 'cc_matrix_brown.npy'
    else:
        cc_matrix = 'cc_matrix_%s.npy' % n_files

    # hacky way of checking if we need to re-load the raw data or not
    # remember, only the co-occurrence matrix is needed for training
    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)  # loading it back gives a plain dict
        sentences = []  # dummy - we won't actually use it
    else:
        if use_brown:
            keep_words = set([
                'king', 'man', 'woman',
                'france', 'paris', 'london', 'rome', 'italy', 'britain', 'england',
                'french', 'english', 'japan', 'japanese', 'chinese', 'italian',
                'australia', 'australian', 'december', 'november', 'june',
                'january', 'february', 'march', 'april', 'may', 'july', 'august',
                'september', 'october',
            ])
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(
                n_vocab=5000, keep_words=keep_words)
        else:
            # sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)  # not used here
            pass

        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(80, V, 10)

    # ALS method
    model.fit(sentences=sentences, cc_matrix=cc_matrix, epochs=20)

    # gradient descent method
    # model.fit(
    #     sentences,
    #     cc_matrix=cc_matrix,
    #     learning_rate=5e-4,
    #     reg=0.1,
    #     epochs=500,  # note: it takes about 500 epochs to converge!
    #     gd=True,
    # )

    model.save(we_file)
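# ---------------------------------------------------------------------------
# To illustrate the "ALS method" comment above: a hedged sketch of one
# alternating-least-squares update for a single word vector w_i, with U, b, c
# held fixed. fX (the f(X) weights), logX, and reg are assumptions about how
# the Glove class names these quantities internally.
import numpy as np

def als_update_wi(i, U, b, c, fX, logX, reg=0.1):
    # weighted ridge regression:
    # minimize sum_j fX[i,j] * (w_i . u_j + b_i + c_j - logX[i,j])^2 + reg * ||w_i||^2
    D = U.shape[1]
    matrix = reg * np.eye(D) + (fX[i][:, None] * U).T.dot(U)
    vector = (fX[i] * (logX[i] - b[i] - c)).dot(U)
    return np.linalg.solve(matrix, vector)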
        bigram_probs[sentence[i], end_idx] += 1

    # normalize the counts along the rows to get probabilities
    # sum each row (giving a [V, 1] matrix) and divide every row by its own sum
    bigram_probs /= bigram_probs.sum(axis=1, keepdims=True)
    return bigram_probs


if __name__ == '__main__':
    # load in the data
    # note: sentences are already converted to sequences of word indexes
    # note: you can limit the vocab size if you run out of memory
    # returns a list of sentences as lists of word indexes, plus a word2idx map like {'word': index, ...}
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(10000)
    # sentences, word2idx = get_sentences_with_word2idx()

    # vocab size
    V = len(word2idx)
    print("Vocab size:", V)

    # we will also treat beginning of sentence and end of sentence as bigrams
    # START -> first word
    # last word -> END
    start_idx = word2idx['START']
    end_idx = word2idx['END']

    # a matrix where:
    # row = last word