Example #1
	def testTrain(self):
		global model
		global embed_map
		main.save_vocab('test_dir', 'test_dir/dict.dat')
		self.assertTrue(os.path.exists('test_dir/dict.dat'))
		
		embed_map = tools.load_googlenews_vectors('word2vec.w2v', binary=True)
		train.trainer(list(main.load_corpus('test_dir')), saveto='test_dir/model', saveFreq=10, n_words=10)  # you may want to change saveFreq or n_words if you use a different test corpus
		os.rename('test_dir/model.pkl', 'test_dir/model.npz.pkl')
		self.assertTrue(os.path.exists('test_dir/model.npz'))
		self.assertTrue(os.path.exists('test_dir/model.npz.pkl'))
		
		model = tools.load_model('test_dir/model.npz', 'test_dir/dict.dat', 'word2vec.w2v', embed_map)
		X_train, y_train = main.training_set(model, ['test_dir/train.csv'])
		
		self.assertEqual(len(X_train.shape), 2)
		self.assertEqual(len(y_train.shape), 1)
		self.assertEqual(X_train.shape[0], y_train.shape[0])
		self.assertEqual(X_train.shape[1], 4800)
import tools
import eval_sick

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

if __name__ == '__main__':
    embed_map = tools.load_googlenews_vectors()
    model = tools.load_model(embed_map)

    eval_sick.evaluate(model, evaltest=True)
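
    # A minimal follow-up sketch (not in the original example): tools.encode maps
    # raw sentences to 4800-dimensional combine-skip vectors, one row per sentence.
    sentences = ['A man is playing a guitar.', 'A woman is slicing an onion.']
    vectors = tools.encode(model, sentences)  # numpy array of shape (len(sentences), 4800)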
Example #3
# parse sparse "index:value" bag-of-words features for one example
for item in parse[2].split():
    f = item.split(':')
    d[int(f[0])] = float(f[1])
bowfeats.append(d)

# model
# load entities
#labels = []
#ent = []
#with open(data_path, 'r') as f:
#    for line in f:
#        labels.append(line.split('\t')[1][:-1])
#        ent.append(line.split('\t')[0])
if not reload_vectors:
    if not reload_model:
        embed_map = tools.load_googlenews_vectors()
        model = tools.load_model(embed_map)
        with open(model_save_path, 'wb') as f:
            pkl.dump(model, f)
    else:
        model = pkl.load(open(model_save_path, 'rb'))

    # process entity names into cleaned, space-separated strings
    xp = []
    for entity in ent:
        #line = re.sub(r'[0-9]+','#'," ".join([word for word in entity.split('_') if word not in stops])).lower()
        line = " ".join([word for word in entity.split('_') if word not in stops]).lower()
        xp.append(filter(lambda x: x in string.printable, line))
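    # The loop above turns e.g. "New_York_City" into "new york city" (underscores
    # split, stopwords and non-printable characters dropped); illustrative example,
    # not in the original snippet.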

    # encode new sentences
    vectors = tools.encode(model, xp)
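
    # A minimal follow-up sketch (not in the original snippet; assumes numpy is
    # imported as np): nearest neighbours among the encoded entities by cosine
    # similarity over the 4800-dimensional skip-thought vectors.
    norms = np.sqrt((vectors ** 2).sum(axis=1))[:, None]
    sims = np.dot(vectors / norms, (vectors / norms).T)  # pairwise cosine similarity
    nearest = sims[0].argsort()[::-1][1:6]  # indices of the 5 entities closest to xp[0]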

X = []  # all sentences gathered from the input files
file_counter = len(FILES)  # FILES is assumed to be the list of input text file paths
for f in FILES:
    with open(f) as file_descriptor:
        file_content = file_descriptor.read().decode("utf-8", "ignore")
        file_content = sent_tokenize(file_content)
        for sentence in file_content:
            if sentence:
                X.append(sentence.strip())

sentence_embeddings = np.empty([file_counter, 4800])  # one 4800-dim row per file (pre-allocated, not filled in this snippet)
loc = base_path_to_directory + "dictionary.pkl"
saveto = base_path_to_directory + "toy.npz"
maxlen_w = 70
worddict, wordcount = vocab.build_dictionary(X)
vocab.save_dictionary(worddict, wordcount,
                      loc)  # loc: path where the dictionary is saved
# in train.py set (1) dictionary: path to the saved dictionary, (2) saveto: path where the model is saved, and (3) maxlen_w
train.trainer(X, dictionary=loc, saveto=saveto, maxlen_w=maxlen_w)

# In tools.py set path_to_model to saveto from train.py, path_to_dictionary to dictionary from train.py, and path_to_word2vec to the word2vec binary.
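# A minimal sketch of those settings (names from the comment above; the exact
# defaults in tools.py may differ):
#   path_to_model      = saveto from train.py, e.g. base_path_to_directory + "toy.npz"
#   path_to_dictionary = dictionary from train.py, e.g. base_path_to_directory + "dictionary.pkl"
#   path_to_word2vec   = path to the GoogleNews word2vec binary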
embed_map = tools.load_googlenews_vectors(path_to_word2vec)
model = tools.load_model(embed_map)
if not os.path.exists(SENTENCE_EMBEDDING_FOLDER):
    os.mkdir(SENTENCE_EMBEDDING_FOLDER)

for f in FILES:
    with open(f) as file_descriptor:
        file_content = sent_tokenize(file_descriptor.read())
        document_embedding = tools.encode(model, file_content, verbose=False)
        document_embedding = np.average(document_embedding, axis=0)
        file_name = f.split('/')[-1]
        np.save(SENTENCE_EMBEDDING_FOLDER + file_name[:-4], document_embedding)
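
# A minimal follow-up sketch (illustrative, not part of the original example):
# the saved .npy files can be loaded back and two documents compared by cosine
# similarity; the file names below are hypothetical.
emb_a = np.load(SENTENCE_EMBEDDING_FOLDER + 'doc_a.npy')
emb_b = np.load(SENTENCE_EMBEDDING_FOLDER + 'doc_b.npy')
cosine = np.dot(emb_a, emb_b) / (np.linalg.norm(emb_a) * np.linalg.norm(emb_b))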
Example #5
def trainer(
        X,
        dim_word=620,  # word vector dimensionality
        dim=2400,  # the number of GRU units
        encoder='gru',
        decoder='gru',
        max_epochs=5,
        dispFreq=1,
        decay_c=0.,
        grad_clip=5.,
        n_words=20000,
        maxlen_w=30,
        optimizer='adam',
        batch_size=512,
        saveto='/u/rkiros/research/semhash/models/toy.npz',
        dictionary='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl',
        saveFreq=5000,
        reload_=False,
        reload_path='output_books_full/model_ae_full_bsz_64_iter_313000.npz',
        SICK_eval=False):

    # Model options
    model_options = {}
    model_options['dim_word'] = dim_word
    model_options['dim'] = dim
    model_options['encoder'] = encoder
    model_options['decoder'] = decoder
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['decay_c'] = decay_c
    model_options['grad_clip'] = grad_clip
    model_options['n_words'] = n_words
    model_options['maxlen_w'] = maxlen_w
    model_options['optimizer'] = optimizer
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['dictionary'] = dictionary
    model_options['saveFreq'] = saveFreq
    model_options['reload_'] = reload_
    model_options['reload_path'] = reload_path

    print model_options

    # reload options
    if reload_ and os.path.exists(reload_path):
        print 'reloading...' + reload_path
        with open('%s.pkl' % reload_path, 'rb') as f:
            model_options = pkl.load(f)

        reload_idx = int(reload_path.split('_')[-1].split('.')[0])

    # load dictionary
    print 'Loading dictionary...'
    worddict = load_dictionary(dictionary)

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(reload_path):
        params = load_params(reload_path, params)

    tparams = init_tparams(params)

    trng, x, x_mask, y, y_mask, z, z_mask, \
          opt_ret, \
          cost = \
          build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask, z, z_mask]

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=False)
    print 'Done'

    # weight decay, if applicable
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=False)
    print 'Done'

    print 'Building f_grad...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads],
                                  profile=False)
    f_weight_norm = theano.function([], [(t**2).sum()
                                         for k, t in tparams.iteritems()],
                                    profile=False)

    if grad_clip > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (grad_clip**2),
                              g / tensor.sqrt(g2) * grad_clip, g))
        grads = new_grads

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)

    print 'Optimization'

    # Each sentence in the minibatch has the same length (for the encoder)
    trainX = homogeneous_data.grouper(X)
    train_iter = homogeneous_data.HomogeneousData(trainX,
                                                  batch_size=batch_size,
                                                  maxlen=maxlen_w)

    if not reload_:
        uidx = 0
    else:
        uidx = reload_idx
    lrate = 0.01
    for eidx in xrange(max_epochs):
        n_samples = 0

        print 'Epoch ', eidx

        for x, y, z in train_iter:
            n_samples += len(x)
            uidx += 1

            x, x_mask, y, y_mask, z, z_mask = homogeneous_data.prepare_data(
                x, y, z, worddict, maxlen=maxlen_w, n_words=n_words)

            if x is None:
                print 'Minibatch with zero samples under length ', maxlen_w
                uidx -= 1
                continue

            ud_start = time.time()
            cost = f_grad_shared(x, x_mask, y, y_mask, z, z_mask)
            f_update(lrate)
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                # if saveto contains a '{}' placeholder, it is filled with the update number
                saveto_iternum = saveto.format(uidx)

                params = unzip(tparams)
                numpy.savez(saveto_iternum, history_errs=[], **params)
                pkl.dump(model_options, open('%s.pkl' % saveto_iternum, 'wb'))
                print 'Done'

                if SICK_eval:
                    print "Evaluating SICK Test performance"
                    embed_map = tools.load_googlenews_vectors()
                    model = tools.load_model(path_to_model=saveto_iternum,
                                             embed_map=embed_map)
                    yhat, pr, sr, mse = eval_sick.evaluate(model,
                                                           evaltest=True)

                    del model
                    del embed_map
                    print pr, sr, mse

                    res_save_file = saveto.format('ALL').split(
                        '.')[0] + '_SICK_EVAL.txt'
                    with open(res_save_file, 'a') as rsf:
                        cur_time = strftime("%a, %d %b %Y %H:%M:%S +0000",
                                            gmtime())
                        rsf.write('\n \n {}'.format(cur_time))
                        rsf.write('\n{}, {}, {}, {}'.format(uidx, pr, sr, mse))
                    print "Done"

        print 'Seen %d samples' % n_samples
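
# A minimal usage sketch (illustrative, not part of the trainer itself): X is a
# list of raw training sentences, 'dictionary' points at the vocabulary built
# with vocab.py, and checkpoints go to 'saveto' (a '{}' placeholder, if present,
# is filled with the update number):
#
#   trainer(X, dictionary='dictionary.pkl', saveto='model_{}.npz',
#           maxlen_w=70, saveFreq=5000)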