import cPickle

import numpy as np

import languagemodel as lm


def train_nn(params):
    """Train one neural LM configuration (n, dim, hdim) and pickle its result."""
    print(params)
    np.random.seed(1)  # for reproducibility

    corpus_train = lm.readCorpus("data/train.txt")
    corpus_dev = lm.readCorpus("data/dev.txt")
    corpus_test = lm.readCorpus("data/test.txt")

    # Build a common index (words to integers), mapping rare words (fewer than 5 occurrences) to index 0.
    # nwords = vocabulary size for the models that only see the indexes.
    w2index, nwords = lm.buildIndex(corpus_train + corpus_dev + corpus_test)

    # Find words that appear in the training set so we can deal with new words separately.
    count_train = np.zeros((nwords,))
    for snt in corpus_train:
        for w in snt:
            count_train[w2index[w]] += 1

    # Network model
    # print("\nNetwork model training:")
    n = params[0]     # Length of n-gram
    dim = params[1]   # Word vector dimension
    hdim = params[2]  # Hidden units
    neurallm = lm.neuralLM(dim, n, hdim, nwords)  # The network model
    ngrams = lm.ngramGen(corpus_train, w2index, n)
    ngrams2 = lm.ngramGen(corpus_dev, w2index, n)
    lrate = 0.5  # Learning rate

    # Train until the average dev-set log-likelihood stops improving.
    best_LL = float('-inf')
    it = 0
    while True:
        it += 1
        LL, N = 0.0, 0  # Average log-likelihood, number of ngrams
        for ng in ngrams:
            pr = neurallm.update(ng, lrate)
            LL += np.log(pr)
            N += 1
        # print('Train:\t{0}\tLL = {1}'.format(it, LL / N))

        # Dev set
        LL, N = 0.0, 0  # Average log-likelihood, number of ngrams
        for ng in ngrams2:
            if count_train[ng[-1]] > 0:  # for now, skip target words not seen in training
                pr = neurallm.prob(ng)
                LL += np.log(pr)
                N += 1
        if LL / N > best_LL:
            best_LL = LL / N
        else:
            break

    # Save (iterations, best dev LL) for this hyperparameter setting.
    return_result = {(params[0], params[1], params[2]): (it, best_LL)}
    with open('data/{}_{}_{}.pkl'.format(params[0], params[1], params[2]), 'wb') as f:
        f.write(cPickle.dumps(return_result))
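# ----------------------------------------------------------------------------
# Illustrative usage sketch (an assumption, not part of the original script):
# one way to sweep a small hyperparameter grid with train_nn and read back the
# (iterations, best dev log-likelihood) results it pickles under data/.
# The grid values below are placeholders chosen only for illustration.
# ----------------------------------------------------------------------------
import itertools
import cPickle


def sweep_hyperparameters():
    grid = itertools.product([2, 3], [10, 14], [20, 38])  # candidate (n, dim, hdim)
    results = {}
    for params in grid:
        train_nn(params)  # writes data/{n}_{dim}_{hdim}.pkl
        with open('data/{}_{}_{}.pkl'.format(*params), 'rb') as f:
            results.update(cPickle.loads(f.read()))
    # Pick the setting with the highest best dev log-likelihood.
    best = max(results, key=lambda k: results[k][1])
    print('Best (n, dim, hdim): {0}, dev LL = {1}'.format(best, results[best][1]))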
def train_nn():
    """Train the neural LM with fixed hyperparameters and track two test sets each epoch."""
    np.random.seed(1)  # for reproducibility

    corpus_train = lm.readCorpus("data/train.txt")
    corpus_dev = lm.readCorpus("data/dev.txt")
    corpus_test = lm.readCorpus("data/test.txt")
    test_1 = lm.readCorpus("data/test_1.txt")
    test_2 = lm.readCorpus("data/test_2.txt")

    # Build a common index (words to integers), mapping rare words (fewer than 5 occurrences) to index 0.
    # nwords = vocabulary size for the models that only see the indexes.
    w2index, nwords = lm.buildIndex(corpus_train + corpus_dev + corpus_test + test_1 + test_2)

    # Find words that appear in the training set so we can deal with new words separately.
    count_train = np.zeros((nwords,))
    for snt in corpus_train:
        for w in snt:
            count_train[w2index[w]] += 1

    # Network model
    # print("\nNetwork model training:")
    n = 3      # Length of n-gram
    dim = 14   # Word vector dimension
    hdim = 38  # Hidden units
    neurallm = lm.neuralLM(dim, n, hdim, nwords)  # The network model
    ngrams = lm.ngramGen(corpus_train, w2index, n)
    ngrams_1 = lm.ngramGen(test_1, w2index, n)
    ngrams_2 = lm.ngramGen(test_2, w2index, n)
    lrate = 0.5  # Learning rate

    for it in xrange(8):  # passes through the training data
        LL, N = 0.0, 0  # Average log-likelihood, number of ngrams
        for ng in ngrams:
            pr = neurallm.update(ng, lrate)
            LL += np.log(pr)
            N += 1
        print('Train:\t{0}\tLL = {1}'.format(it, LL / N))

        # Test set 1
        LL, N = 0.0, 0  # Average log-likelihood, number of ngrams
        for ng in ngrams_1:
            if count_train[ng[-1]] > 0:  # for now, skip target words not seen in training
                pr = neurallm.prob(ng)
                LL += np.log(pr)
                N += 1
        print('Test_1:\t{0}\tLL = {1}'.format(it, LL / N))

        # Test set 2
        LL, N = 0.0, 0  # Average log-likelihood, number of ngrams
        for ng in ngrams_2:
            if count_train[ng[-1]] > 0:  # for now, skip target words not seen in training
                pr = neurallm.prob(ng)
                LL += np.log(pr)
                N += 1
        print('Test_2:\t{0}\tLL = {1}'.format(it, LL / N))
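# ----------------------------------------------------------------------------
# Aside (not in the original script): the values printed above are average
# per-ngram natural-log likelihoods; if a perplexity figure is wanted instead,
# it is exp(-average LL). A minimal helper:
# ----------------------------------------------------------------------------
def perplexity(avg_LL):
    """Convert an average per-ngram log-likelihood (natural log) to perplexity."""
    return np.exp(-avg_LL)

# Example: an average LL of -5.0 corresponds to a perplexity of about 148.4.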
from __future__ import print_function
from __future__ import division

import numpy as np

import languagemodel as lm

np.random.seed(1)  # for reproducibility

corpus_train = lm.readCorpus("data/train.txt")
corpus_dev = lm.readCorpus("data/dev.txt")
corpus_test = lm.readCorpus("data/test.txt")

# Build a common index (words to integers), mapping rare words (fewer than 5 occurrences) to index 0.
# nwords = vocabulary size for the models that only see the indexes.
w2index, nwords = lm.buildIndex(corpus_train + corpus_dev + corpus_test)

# Find words that appear in the training set so we can deal with new words separately.
count_train = np.zeros((nwords,))
for snt in corpus_train:
    for w in snt:
        count_train[w2index[w]] += 1

# Bi-gram model as a baseline
alpha = 0.1  # add-alpha smoothing
probB = lm.bigramLM(corpus_train, w2index, nwords, alpha)
LLB, N = 0.0, 0
bi = lm.ngramGen(corpus_dev, w2index, 2)
for w in bi:
    if count_train[w[1]] > 0:  # for now, skip target words not seen in training
        LLB += np.log(probB[w[0], w[1]])
        N += 1
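# ----------------------------------------------------------------------------
# Sketch (an assumption about behavior, not languagemodel's actual code): the
# table returned by lm.bigramLM is read above as probB[i, j] = P(w_j | w_i).
# With add-alpha smoothing that is (count(i, j) + alpha) / (count(i) + alpha * V),
# where V is the vocabulary size. A minimal NumPy version of that formula:
# ----------------------------------------------------------------------------
def bigram_add_alpha(corpus, w2index, nwords, alpha):
    counts = np.zeros((nwords, nwords))
    for snt in corpus:
        idx = [w2index[w] for w in snt]
        for prev, cur in zip(idx[:-1], idx[1:]):
            counts[prev, cur] += 1
    # Every bigram gets a pseudo-count of alpha; each row then normalizes to 1.
    return (counts + alpha) / (counts.sum(axis=1, keepdims=True) + alpha * nwords)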