def wikipedia():
    sentences, word2idx = get_wikipedia_data()
    print("finished retrieving data")
    print("vocab size:", len(word2idx), "number of sentences:", len(sentences))

    rnn = SimpleRNN(20, 15, len(word2idx))
    rnn.fit(sentences, learning_rate=10e-5, show_fig=True, activation=T.nnet.relu)
def main():
    # sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=2000)
    sentences, word2idx = get_wikipedia_data(n_files=50, n_vocab=2000)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    V = len(word2idx)
    model = Model(80, V, 10)
    model.fit(sentences, learning_rate=10e-4, mu=0, epochs=7)
    model.save('w2v_model.npz')
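# Hedged usage sketch (not part of the original script): once main() has run,
# the saved files can be loaded back for a quick nearest-neighbor sanity check.
# Assumption: w2v_model.npz stores its embedding matrices positionally, so the
# first array in the archive is the V x D word-embedding matrix; adjust the key
# if the Model class saves named arrays instead.
import json
import numpy as np

def nearest_neighbors(word, n=5,
                      we_file='w2v_model.npz',
                      w2i_file='w2v_word2idx.json'):
    with open(w2i_file) as f:
        word2idx = json.load(f)
    idx2word = {i: w for w, i in word2idx.items()}

    npz = np.load(we_file)
    W = npz[npz.files[0]]  # assumed: first saved array is the V x D embedding

    v = W[word2idx[word]]
    # cosine distance from the query word to every word in the vocabulary
    dist = 1 - W.dot(v) / (np.linalg.norm(W, axis=1) * np.linalg.norm(v) + 1e-10)
    for i in dist.argsort()[1:n + 1]:  # skip index 0, which is the word itself
        print(idx2word[i], dist[i])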
def train_wikipedia(we_file='word_embeddings.npy', w2i_file='wikipedia_word2idx.json', RecurrentUnit=GRU):
    # there are 32 files
    sentences, word2idx = get_wikipedia_data(n_files=1, n_vocab=2000)
    print("finished retrieving data")
    print("vocab size:", len(word2idx), "number of sentences:", len(sentences))

    rnn = RNN(30, [30], len(word2idx))
    rnn.fit(sentences, learning_rate=10e-6, epochs=10, show_fig=True, activation=T.nnet.relu)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
def train_wikipedia(we_file='word_embeddings.npy', w2i_file='wikipedia_word2idx.json', RecurrentUnit=GRU):
    # there are 32 files
    sentences, word2idx = get_wikipedia_data(n_files=1, n_vocab=2000)
    print("finished retrieving data")
    print("vocab size:", len(word2idx), "number of sentences:", len(sentences))

    rnn = RNN(30, [30], len(word2idx))
    rnn.fit(sentences, learning_rate=10e-6, epochs=10, show_fig=True, activation=T.nnet.relu)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
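# Hedged usage sketch (an assumption, not part of train_wikipedia): after training,
# the saved V x D embedding and the word2idx mapping can be used for a quick
# analogy check such as king - man + woman. This only works if all the query
# words survived the n_vocab=2000 cutoff.
import json
import numpy as np

def analogy(pos1, neg1, pos2,
            we_file='word_embeddings.npy',
            w2i_file='wikipedia_word2idx.json'):
    We = np.load(we_file)
    with open(w2i_file) as f:
        word2idx = json.load(f)
    idx2word = {i: w for w, i in word2idx.items()}

    target = We[word2idx[pos1]] - We[word2idx[neg1]] + We[word2idx[pos2]]
    # rank every word by Euclidean distance to the target vector
    distances = np.linalg.norm(We - target, axis=1)
    for i in distances.argsort()[:5]:
        print(idx2word[i], distances[i])

# e.g. analogy('king', 'man', 'woman')  # hoping 'queen' shows up near the top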
def train_wikipedia(we_file='lstm_word_embeddings.npy', w2i_file='lstm_wikipedia_word2idx.json'):
    # there are 32 files
    sentences, word2idx = get_wikipedia_data(n_files=100, n_vocab=2000)
    print("finished retrieving data")
    print("vocab size:", len(word2idx), "number of sentences:", len(sentences))

    rnn = RNN(50, [50], len(word2idx))
    # todo: next try increasing the learning rate
    rnn.fit(sentences, learning_rate=10e-6, epochs=10, show_fig=True,
            activation=T.nnet.relu, RecurrentUnit=LSTM, normalize=False)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
def train_wikipedia(we_file='word_embeddings.npy', w2i_file='wikipedia_word2idx.json', RecurrentUnit=GRU):
    sentences, word2idx = get_wikipedia_data(n_files=100, n_vocab=2000)
    print('Finished retrieving data!')
    print(f'vocab size: {len(word2idx)}, number of sentences: {len(sentences)}')

    rnn = RNN(50, [50], len(word2idx))
    rnn.fit(sentences, learning_rate=10e-6, epochs=10, show_fig=True, activation=T.nnet.relu)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
def train_wikipedia(we_file='word_embeddings.npy', w2i_file='wikipedia_word2idx.json', RecurrentUnit=GRU):
    # there are 32 files
    # note: you can pick between Wikipedia data and the Brown corpus
    # just comment one out, and uncomment the other!
    sentences, word2idx = get_wikipedia_data(n_files=98, n_vocab=2000)
    # sentences, word2idx = get_sentences_with_word2idx_limit_vocab()

    print("finished retrieving data")
    print("vocab size:", len(word2idx), "number of sentences:", len(sentences))

    rnn = RNN(30, [30], len(word2idx))
    rnn.fit(sentences, learning_rate=1e-5, epochs=10, show_fig=True, activation=T.nnet.relu)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
def main(we_file, w2i_file, n_files=50):
    cc_matrix = "cc_matrix_%s.npy" % n_files

    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []  # dummy - not needed once the co-occurrence matrix exists
    else:
        sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)
        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(80, V, 10)
    model.fit(
        sentences=sentences,
        cc_matrix=cc_matrix,
        learning_rate=3 * 10e-5,
        reg=0.01,
        epochs=2000,
        gd=True,
        use_theano=False,
    )
    model.save(we_file)
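# Hedged usage sketch: one way this entry point might be invoked. The output
# filenames here are illustrative placeholders, not names required by the script.
if __name__ == '__main__':
    main(we_file='glove_model_50.npz', w2i_file='glove_word2idx_50.json')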
def wikipedia():
    sentences, word2idx = get_wikipedia_data()
    print("finished retrieving data")
    print("vocab size:", len(word2idx), "number of sentences:", len(sentences))

    rnn = SimpleRNN(20, 15, len(word2idx))
    rnn.fit(sentences, learning_rate=10e-5, show_fig=True, activation=T.nnet.relu)
def main(we_file, w2i_file, use_brown=True, n_files=100):
    if use_brown:
        cc_matrix = "cc_matrix_brown.npy"
    else:
        cc_matrix = "cc_matrix_%s.npy" % n_files

    # hacky way of checking if we need to re-load the raw data or not
    # remember, only the co-occurrence matrix is needed for training
    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []  # dummy - we won't actually use it
    else:
        if use_brown:
            keep_words = set([
                'king', 'man', 'woman',
                'france', 'paris', 'london', 'rome', 'italy', 'britain', 'england',
                'french', 'english', 'japan', 'japanese', 'chinese', 'italian',
                'australia', 'australian',
                'december', 'november', 'june', 'january', 'february', 'march',
                'april', 'may', 'july', 'august', 'september', 'october',
            ])
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(
                n_vocab=5000, keep_words=keep_words)
        else:
            sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)

        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(100, V, 10)
    model.fit(sentences, cc_matrix=cc_matrix)  # alternating least squares method
    model.save(we_file)
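# Hedged sketch (not from the original file): how a co-occurrence matrix like the
# cached cc_matrix_*.npy could be built from `sentences` (lists of word indices),
# using a symmetric context window with counts weighted by 1/distance as in the
# GloVe paper. The Glove class presumably does this internally; its exact window
# size and weighting may differ from this illustration.
import numpy as np

def build_cc_matrix(sentences, V, context_size=10):
    X = np.zeros((V, V))
    for sentence in sentences:
        n = len(sentence)
        for i, wi in enumerate(sentence):
            start = max(0, i - context_size)
            end = min(n, i + context_size + 1)
            for j in range(start, end):
                if j == i:
                    continue
                # closer context words contribute more to the co-occurrence count
                X[wi, sentence[j]] += 1.0 / abs(i - j)
    return X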
import numpy as np
from sklearn.decomposition import TruncatedSVD, PCA, KernelPCA
from sklearn.feature_extraction.text import TfidfTransformer
from datetime import datetime
import os
import sys

# the data-loading utilities live in the course repo's rnn_class folder
initial_dir = os.getcwd()
os.chdir('../course_repo/machine_learning_examples/rnn_class')
from util import get_wikipedia_data
from brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx
# os.chdir(initial_dir)

sentences, word2idx = get_wikipedia_data(n_files=3, n_vocab=2000, by_paragraph=True)

# build term-document matrix
V = len(word2idx)
N = len(sentences)

# create raw counts
A = np.zeros((V, N))
j = 0
for sentence in sentences:
    for i in sentence:
        A[i, j] += 1
    j += 1
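# Hedged continuation sketch: the imports above suggest the raw counts are meant
# to be TF-IDF weighted and reduced with truncated SVD (LSA). A minimal version
# of that next step could look like this; the component count and the words
# printed are assumptions, not part of the original script.
transformer = TfidfTransformer()
A_tfidf = transformer.fit_transform(A.T).T  # fit over documents, keep terms as rows (V x N)

svd = TruncatedSVD(n_components=2)
Z = svd.fit_transform(A_tfidf)  # V x 2: one 2-D point per vocabulary word

idx2word = {i: w for w, i in word2idx.items()}
for i in range(10):
    print(idx2word[i], Z[i])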
def wikipedia():
    sentences, word2idx = get_wikipedia_data()
    print("finished retrieving data")
    print("vocab size:", len(word2idx), "number of sentences:", len(sentences))

    rnn = SimpleRNN(20, 15, len(word2idx))
    rnn.fit(sentences, learning_rate=10e-5, show_fig=True, activation=T.nnet.relu)