def testTrain(self):
    global model
    global embed_map
    main.save_vocab('test_dir', 'test_dir/dict.dat')
    self.assertTrue(os.path.exists('test_dir/dict.dat'))
    embed_map = tools.load_googlenews_vectors('word2vec.w2v', binary=True)
    # You may want to change saveFreq or n_words if you use another test corpus.
    train.trainer(list(main.load_corpus('test_dir')), saveto='test_dir/model',
                  saveFreq=10, n_words=10)
    os.rename('test_dir/model.pkl', 'test_dir/model.npz.pkl')
    self.assertTrue(os.path.exists('test_dir/model.npz'))
    self.assertTrue(os.path.exists('test_dir/model.npz.pkl'))
    model = tools.load_model('test_dir/model.npz', 'test_dir/dict.dat',
                             'word2vec.w2v', embed_map)
    X_train, y_train = main.training_set(model, ['test_dir/train.csv'])
    self.assertEqual(len(X_train.shape), 2)
    self.assertEqual(len(y_train.shape), 1)
    self.assertEqual(X_train.shape[0], y_train.shape[0])
    self.assertEqual(X_train.shape[1], 4800)
import tools
import eval_sick
import warnings

warnings.filterwarnings('ignore', category=DeprecationWarning)

if __name__ == '__main__':
    embed_map = tools.load_googlenews_vectors()
    model = tools.load_model(embed_map)
    eval_sick.evaluate(model, evaltest=True)
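# --- Hedged note (illustrative; not part of the original script) ----------------
# Judging from the trainer's SICK_eval hook in train.py, eval_sick.evaluate()
# returns the predicted relatedness scores plus Pearson, Spearman, and MSE.
# Assuming that signature, the metrics can be captured explicitly:
#
#     yhat, pr, sr, mse = eval_sick.evaluate(model, evaltest=True)
#     print 'SICK test -- Pearson:', pr, 'Spearman:', sr, 'MSE:', mse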
for item in parse[2].split():
    f = item.split(':')
    d[int(f[0])] = float(f[1])
bowfeats.append(d)

# model

# load entities
#labels = []
#ent = []
#with open(data_path, 'r') as f:
#    for line in f:
#        labels.append(line.split('\t')[1][:-1])
#        ent.append(line.split('\t')[0])

if not reload_vectors:
    if not reload_model:
        embed_map = tools.load_googlenews_vectors()
        model = tools.load_model(embed_map)
        with open(model_save_path, 'w') as f:
            pkl.dump(model, f)
    else:
        model = pkl.load(open(model_save_path, 'r'))

    # process
    xp = []
    for entity in ent:
        #line = re.sub(r'[0-9]+', '#', " ".join([word for word in entity.split('_') if word not in stops])).lower()
        line = " ".join([word for word in entity.split('_') if word not in stops]).lower()
        xp.append(filter(lambda x: x in string.printable, line))

    # encode new sentences
    vectors = tools.encode(model, xp)
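    # --- Hedged sketch (illustrative addition, not in the original script) ------
    # Rank entities by cosine similarity to the first encoded entity. Assumes
    # `vectors` is a 2-D numpy array with one skip-thought vector per entry of
    # `ent`; `query_idx` is an illustrative choice, and numpy is imported here
    # only to keep the sketch self-contained.
    import numpy

    query_idx = 0
    norms = numpy.linalg.norm(vectors, axis=1) + 1e-8   # guard against zero vectors
    sims = vectors.dot(vectors[query_idx]) / (norms * norms[query_idx])
    for i in numpy.argsort(-sims)[:5]:                  # five most similar entities
        print ent[i], sims[i]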
    with open(f) as file_descriptor:
        file_content = file_descriptor.read().decode("utf-8", "ignore")
    file_content = sent_tokenize(file_content)
    for sentence in file_content:
        if sentence:
            X.append(sentence.strip())

sentence_embeddings = np.empty([file_counter, 4800])
loc = base_path_to_directory + "dictionary.pkl"
saveto = base_path_to_directory + "toy.npz"
maxlen_w = 70

worddict, wordcount = vocab.build_dictionary(X)
vocab.save_dictionary(worddict, wordcount, loc)  # loc is where the dictionary is saved
# In train.py set 1> the dictionary path, 2> saveto (where to save the model), 3> maxlen_w.
train.trainer(X, dictionary=loc, saveto=saveto, maxlen_w=maxlen_w)
# In tools.py set path_to_model to saveto from train, path_to_dictionary to dictionary from train, and path_to_word2vec.
embed_map = tools.load_googlenews_vectors(path_to_word2vec)
model = tools.load_model(embed_map)

if not os.path.exists(SENTENCE_EMBEDDING_FOLDER):
    os.mkdir(SENTENCE_EMBEDDING_FOLDER)
for f in FILES:
    with open(f) as file_descriptor:
        file_content = sent_tokenize(file_descriptor.read())
    document_embedding = tools.encode(model, file_content, verbose=False)
    document_embedding = np.average(document_embedding, axis=0)
    file_name = f.split('/')[-1]
    np.save(SENTENCE_EMBEDDING_FOLDER + file_name[:-4], document_embedding)
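# --- Hedged sketch (illustrative addition, not part of the original pipeline) ---
# Reload the averaged document vectors written by the loop above and compare the
# documents with cosine similarity. Assumes SENTENCE_EMBEDDING_FOLDER ends with a
# path separator (as the np.save call above implies) and that only this pipeline's
# .npy files live in it.
names = sorted(fn for fn in os.listdir(SENTENCE_EMBEDDING_FOLDER) if fn.endswith('.npy'))
docs = np.vstack([np.load(SENTENCE_EMBEDDING_FOLDER + fn) for fn in names])
doc_norms = np.linalg.norm(docs, axis=1) + 1e-8                  # guard against zero vectors
similarity = docs.dot(docs.T) / np.outer(doc_norms, doc_norms)   # n_docs x n_docs cosine matrix
print similarity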
def trainer(X,
            dim_word=620,  # word vector dimensionality
            dim=2400,  # the number of GRU units
            encoder='gru',
            decoder='gru',
            max_epochs=5,
            dispFreq=1,
            decay_c=0.,
            grad_clip=5.,
            n_words=20000,
            maxlen_w=30,
            optimizer='adam',
            batch_size=512,
            saveto='/u/rkiros/research/semhash/models/toy.npz',
            dictionary='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl',
            saveFreq=5000,
            reload_=False,
            reload_path='output_books_full/model_ae_full_bsz_64_iter_313000.npz',
            SICK_eval=False):

    # Model options
    model_options = {}
    model_options['dim_word'] = dim_word
    model_options['dim'] = dim
    model_options['encoder'] = encoder
    model_options['decoder'] = decoder
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['decay_c'] = decay_c
    model_options['grad_clip'] = grad_clip
    model_options['n_words'] = n_words
    model_options['maxlen_w'] = maxlen_w
    model_options['optimizer'] = optimizer
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['dictionary'] = dictionary
    model_options['saveFreq'] = saveFreq
    model_options['reload_'] = reload_
    model_options['reload_path'] = reload_path
    print model_options

    # reload options
    if reload_ and os.path.exists(reload_path):
        print 'reloading...' + reload_path
        with open('%s.pkl' % reload_path, 'rb') as f:
            model_options = pkl.load(f)
        reload_idx = int(reload_path.split('_')[-1].split('.')[0])

    # load dictionary
    print 'Loading dictionary...'
    worddict = load_dictionary(dictionary)

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(reload_path):
        params = load_params(reload_path, params)

    tparams = init_tparams(params)

    trng, x, x_mask, y, y_mask, z, z_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask, z, z_mask]

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=False)
    print 'Done'

    # weight decay, if applicable
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=False)
    print 'Done'

    print 'Building f_grad...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads], profile=False)
    f_weight_norm = theano.function([], [(t**2).sum() for k, t in tparams.iteritems()], profile=False)

    if grad_clip > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (grad_clip**2),
                                           g / tensor.sqrt(g2) * grad_clip,
                                           g))
        grads = new_grads

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)

    print 'Optimization'

    # Each sentence in the minibatch has the same length (for the encoder)
    trainX = homogeneous_data.grouper(X)
    train_iter = homogeneous_data.HomogeneousData(trainX, batch_size=batch_size, maxlen=maxlen_w)

    if not reload_:
        uidx = 0
    else:
        uidx = reload_idx
    lrate = 0.01
    for eidx in xrange(max_epochs):
        n_samples = 0

        print 'Epoch ', eidx

        for x, y, z in train_iter:
            n_samples += len(x)
            uidx += 1

            x, x_mask, y, y_mask, z, z_mask = homogeneous_data.prepare_data(
                x, y, z, worddict, maxlen=maxlen_w, n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen_w
                uidx -= 1
                continue

            ud_start = time.time()
            cost = f_grad_shared(x, x_mask, y, y_mask, z, z_mask)
            f_update(lrate)
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                saveto_iternum = saveto.format(uidx)
                params = unzip(tparams)
                numpy.savez(saveto_iternum, history_errs=[], **params)
                pkl.dump(model_options, open('%s.pkl' % saveto_iternum, 'wb'))

                print 'Done'

                if SICK_eval:
                    print "Evaluating SICK Test performance"
                    embed_map = tools.load_googlenews_vectors()
                    model = tools.load_model(path_to_model=saveto_iternum, embed_map=embed_map)
                    yhat, pr, sr, mse = eval_sick.evaluate(model, evaltest=True)
                    del (model)
                    del (embed_map)
                    print pr, sr, mse

                    res_save_file = saveto.format('ALL').split('.')[0] + '_SICK_EVAL.txt'
                    with open(res_save_file, 'a') as rsf:
                        cur_time = strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
                        rsf.write('\n \n {}'.format(cur_time))
                        rsf.write('\n{}, {}, {}, {}'.format(uidx, pr, sr, mse))
                    print "Done"

        print 'Seen %d samples' % n_samples
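# --- Hedged usage sketch (illustrative; not part of the trainer itself) ---------
# Checkpoints are written every `saveFreq` updates via saveto.format(uidx), so
# `saveto` is expected to contain a '{}' placeholder for the update number, and
# the resume index is parsed from the trailing '_<iter>.npz' of `reload_path`.
# A run can then be resumed from a checkpoint like this (paths are placeholders):
#
#     trainer(X,
#             dictionary='output/dictionary.pkl',
#             saveto='output/model_iter_{}.npz',
#             reload_=True,
#             reload_path='output/model_iter_5000.npz')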