def wikipedia():
    sentences, word2idx = get_wikipedia_data()
    print("finished retrieving data")
    print("vocab size:", len(word2idx), "number of sentences:", len(sentences))
    rnn = SimpleRNN(20, 15, len(word2idx))
    rnn.fit(sentences,
            learning_rate=10e-5,
            show_fig=True,
            activation=T.nnet.relu)
Example #2
def main():
    # sentences, word2idx= get_sentences_with_word2idx_limit_vocab(n_vocab=2000)
    sentences, word2idx = get_wikipedia_data(n_files=50, n_vocab=2000)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    V = len(word2idx)
    model = Model(80, V, 10)
    model.fit(sentences, learning_rate=10e-4, mu=0, epochs=7)
    model.save('w2v_model.npz')
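As a follow-up, here is a minimal sketch (not part of the original example) of how the two saved artifacts could be inspected later; the array names inside the .npz archive depend on how Model.save() writes them, so the snippet simply lists the keys and assumes the first array is the embedding matrix.

import json
import numpy as np

# reload the token -> integer id mapping saved above
with open('w2v_word2idx.json') as f:
    word2idx = json.load(f)

# reload the saved weights; np.load on an .npz archive returns a lazy dict-like object
npz = np.load('w2v_model.npz')
print("arrays stored in the archive:", npz.files)
W = npz[npz.files[0]]  # assumption: the first array is the input embedding matrix
print("embedding matrix shape:", W.shape, "vocab size:", len(word2idx))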
Example #3
def train_wikipedia(we_file='word_embeddings.npy', w2i_file='wikipedia_word2idx.json', RecurrentUnit=GRU):
    # there are 32 files
    sentences, word2idx = get_wikipedia_data(n_files=1, n_vocab=2000)
    print "finished retrieving data"
    print "vocab size:", len(word2idx), "number of sentences:", len(sentences)
    rnn = RNN(30, [30], len(word2idx))
    rnn.fit(sentences, learning_rate=10e-6, epochs=10, show_fig=True, activation=T.nnet.relu)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
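A hypothetical way to query the embeddings written by this function, using cosine similarity over the (V x D) matrix saved to we_file; the query word 'king' is only an illustration and works only if it survived the n_vocab=2000 cut.

import json
import numpy as np

We = np.load('word_embeddings.npy')                # (V, D) word embedding matrix
with open('wikipedia_word2idx.json') as f:
    word2idx = json.load(f)
idx2word = {i: w for w, i in word2idx.items()}

def nearest_neighbors(word, k=5):
    # cosine similarity between the query row and every row of We
    v = We[word2idx[word]]
    sims = We.dot(v) / (np.linalg.norm(We, axis=1) * np.linalg.norm(v) + 1e-10)
    best = np.argsort(-sims)[1:k+1]                # index 0 is the word itself
    return [(idx2word[i], float(sims[i])) for i in best]

print(nearest_neighbors('king'))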
Example #4
def train_wikipedia(we_file='word_embeddings.npy', w2i_file='wikipedia_word2idx.json', RecurrentUnit=GRU):
    # there are 32 files
    sentences, word2idx = get_wikipedia_data(n_files=1, n_vocab=2000)
    print("finished retrieving data")
    print("vocab size:", len(word2idx), "number of sentences:", len(sentences))
    rnn = RNN(30, [30], len(word2idx))
    rnn.fit(sentences, learning_rate=10e-6, epochs=10, show_fig=True, activation=T.nnet.relu)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
def train_wikipedia(we_file='lstm_word_embeddings.npy', w2i_file='lstm_wikipedia_word2idx.json'):
    # there are 32 files
    sentences, word2idx = get_wikipedia_data(n_files=100, n_vocab=2000)
    print "finished retrieving data"
    print "vocab size:", len(word2idx), "number of sentences:", len(sentences)
    rnn = RNN(50, [50], len(word2idx))
    # todo: next try increas LR
    rnn.fit(sentences, learning_rate=10e-6, epochs=10, show_fig=True, activation=T.nnet.relu, RecurrentUnit=LSTM, normalize=False)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
Example #6
def train_wikipedia(we_file='word_embeddings.npy',
                    w2i_file='wikipedia_word2idx.json',
                    RecurrentUnit=GRU):
    sentences, word2idx = get_wikipedia_data(n_files=100, n_vocab=2000)
    print('Finished retrieving data!')
    print(
        f'vocab size: {len(word2idx)}, number of sentences: {len(sentences)}')
    rnn = RNN(50, [50], len(word2idx))
    rnn.fit(sentences,
            learning_rate=10e-6,
            epochs=10,
            show_fig=True,
            activation=T.nnet.relu)

    np.save(we_file, rnn.We.get_value())  # np.save keeps the .npy filename; np.savez would append .npz
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
Example #7
def train_wikipedia(we_file='lstm_word_embeddings.npy',
                    w2i_file='lstm_wikipedia_word2idx.json'):
    # there are 32 files
    sentences, word2idx = get_wikipedia_data(n_files=100, n_vocab=2000)
    print "finished retrieving data"
    print "vocab size:", len(word2idx), "number of sentences:", len(sentences)
    rnn = RNN(50, [50], len(word2idx))
    # todo: next try increas LR
    rnn.fit(sentences,
            learning_rate=10e-6,
            epochs=10,
            show_fig=True,
            activation=T.nnet.relu,
            RecurrentUnit=LSTM,
            normalize=False)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
Example #8
def train_wikipedia(we_file='word_embeddings.npy',
                    w2i_file='wikipedia_word2idx.json',
                    RecurrentUnit=GRU):
    # there are 32 files
    # note: you can pick between Wikipedia data and Brown corpus
    # just comment one out, and uncomment the other!
    sentences, word2idx = get_wikipedia_data(n_files=98, n_vocab=2000)
    # sentences, word2idx = get_sentences_with_word2idx_limit_vocab()

    print("finished retrieving data")
    print("vocab size:", len(word2idx), "number of sentences:", len(sentences))
    rnn = RNN(30, [30], len(word2idx))
    rnn.fit(sentences,
            learning_rate=1e-5,
            epochs=10,
            show_fig=True,
            activation=T.nnet.relu)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)
Example #9
def main(we_file, w2i_file, n_files=50):
    cc_matrix = "cc_matrix_%s.npy" % n_files

    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []
    else:
        sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)
        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(80, V, 10)
    model.fit(sentences=sentences,
              cc_matrix=cc_matrix,
              learning_rate=3 * 10e-5,
              reg=0.01,
              epoches=2000,
              gd=True,
              use_theano=False)
    model.save(we_file)
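The Glove class is expected to build and cache the co-occurrence counts itself when sentences are passed in; purely for intuition, here is a hedged standalone sketch of how such a matrix is commonly accumulated (the window size and 1/distance weighting follow the GloVe paper and are not necessarily what this particular Glove.fit does).

import numpy as np

def build_cc_matrix(sentences, V, context_sz=10):
    # X[i, j] accumulates how often word j occurs near word i,
    # weighted by 1 / distance within a symmetric context window
    X = np.zeros((V, V))
    for sentence in sentences:
        n = len(sentence)
        for pos, wi in enumerate(sentence):
            for d in range(1, context_sz + 1):
                if pos - d >= 0:
                    X[wi, sentence[pos - d]] += 1.0 / d
                if pos + d < n:
                    X[wi, sentence[pos + d]] += 1.0 / d
    return X

# usage sketch: X = build_cc_matrix(sentences, V); np.save(cc_matrix, X)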
def wikipedia():
    sentences, word2idx = get_wikipedia_data()
    print "finished retrieving data"
    print "vocab size:", len(word2idx), "number of sentences:", len(sentences)
    rnn = SimpleRNN(20, 15, len(word2idx))
    rnn.fit(sentences, learning_rate=10e-5, show_fig=True, activation=T.nnet.relu)
Example #11
def main(we_file, w2i_file, use_brown=True, n_files=100):
    if use_brown:
        cc_matrix = "cc_matrix_brown.npy"
    else:
        cc_matrix = "cc_matrix_%s.npy" % n_files

    # hacky way of checking if we need to re-load the raw data or not
    # remember, only the co-occurrence matrix is needed for training
    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []  # dummy - we won't actually use it
    else:
        if use_brown:
            keep_words = set([
                'king',
                'man',
                'woman',
                'france',
                'paris',
                'london',
                'rome',
                'italy',
                'britain',
                'england',
                'french',
                'english',
                'japan',
                'japanese',
                'chinese',
                'italian',
                'australia',
                'australian',
                'december',
                'november',
                'june',
                'january',
                'february',
                'march',
                'april',
                'may',
                'july',
                'august',
                'september',
                'october',
            ])
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(
                n_vocab=5000, keep_words=keep_words)
        else:
            sentences, word2idx = get_wikipedia_data(n_files=n_files,
                                                     n_vocab=2000)

        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(100, V, 10)

    # alternating least squares method
    model.fit(sentences, cc_matrix=cc_matrix)
    model.save(we_file)
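The keep_words list above is clearly chosen with word-analogy checks in mind; below is a hypothetical evaluation helper, assuming the saved file can be loaded back into a single (V x D) embedding matrix We (if Glove.save() writes separate input and output matrices, they are usually summed or averaged first).

import numpy as np

def analogy(pos1, neg1, pos2, We, word2idx, idx2word):
    # find the word whose vector is closest (by cosine) to v(pos1) - v(neg1) + v(pos2)
    v = We[word2idx[pos1]] - We[word2idx[neg1]] + We[word2idx[pos2]]
    sims = We.dot(v) / (np.linalg.norm(We, axis=1) * np.linalg.norm(v) + 1e-10)
    for i in np.argsort(-sims):
        w = idx2word[i]
        if w not in (pos1, neg1, pos2):
            return w

# usage sketch, after loading We and word2idx from we_file / w2i_file:
# idx2word = {i: w for w, i in word2idx.items()}
# print(analogy('paris', 'france', 'italy', We, word2idx, idx2word))  # ideally 'rome'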
Example #12
import numpy as np
from sklearn.decomposition import TruncatedSVD, PCA, KernelPCA
from sklearn.feature_extraction.text import TfidfTransformer
from datetime import datetime
import os
import sys

initial_dir = os.getcwd()
os.chdir('../course_repo/machine_learning_examples/rnn_class')

from util import get_wikipedia_data
from brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx

# os.chdir(initial_dir)

sentences, word2idx = get_wikipedia_data(n_files=3,
                                         n_vocab=2000,
                                         by_paragraph=True)

# build term document matrix:
V = len(word2idx)
N = len(sentences)

# create raw counts:
A = np.zeros((V, N))

for j, sentence in enumerate(sentences):
    for i in sentence:
        A[i, j] += 1
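The imports at the top of this example suggest the raw counts are only an intermediate step; a plausible continuation (not shown in the original snippet) would reweight A with the already-imported TfidfTransformer and reduce it with TruncatedSVD:

# TF-IDF expects (documents x terms), so transpose A, reweight, then transpose back
A_tfidf = TfidfTransformer().fit_transform(A.T).T    # (V, N), sparse

# project each word into a low-dimensional latent space (classic LSA)
svd = TruncatedSVD(n_components=50)
Z = svd.fit_transform(A_tfidf)                       # (V, 50) word representations
print("reduced word matrix shape:", Z.shape)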
def wikipedia():
    sentences, word2idx = get_wikipedia_data()