def build_graph(self):
    args = self.args
    cost = self.all_loss
    meta_emb = self.meta_emb

    updates, lr, gnorm = create_optimization_updates(
        cost=cost,
        params=self.params,
        lr=args.learning_rate,
        method=args.learning)[:3]

    train_model = theano.function(
        inputs=[self.batch_ids, self.batch_masks],
        outputs=[cost, gnorm],
        updates=updates,
        allow_input_downcast=True)

    predict_model = theano.function(
        inputs=[self.batch_ids, self.batch_masks],
        outputs=cost,
        allow_input_downcast=True)

    embs_output = theano.function(
        inputs=[self.batch_ids],
        outputs=meta_emb.embs,
        allow_input_downcast=True)

    return train_model, predict_model, embs_output, self.params
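# A minimal, hypothetical usage sketch for the compiled functions returned by
# build_graph() above. `model`, `batches`, `ids`, and `masks` are assumed
# names (not defined in this file); ids/masks are numpy arrays shaped like
# self.batch_ids / self.batch_masks:
#
#   train_model, predict_model, embs_output, params = model.build_graph()
#   for epoch in range(n_epochs):
#       for ids, masks in batches:
#           loss, gnorm = train_model(ids, masks)    # one optimizer step
#   dev_loss = predict_model(dev_ids, dev_masks)     # forward pass only
#   embeddings = embs_output(ids)                    # meta-embeddings, no update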
def train(self, train, dev, test):
    args = self.args
    trainx, trainy = train
    batch_size = args.batch

    if dev:
        dev_batches_x, dev_batches_y = create_batches(
            range(len(dev[0])), dev[0], dev[1], batch_size)
    if test:
        test_batches_x, test_batches_y = create_batches(
            range(len(test[0])), test[0], test[1], batch_size)

    cost = self.nll_loss + self.l2_sqr

    updates, lr, gnorm = create_optimization_updates(
        cost=cost,
        params=self.params,
        lr=args.learning_rate,
        method=args.learning)[:3]

    train_model = theano.function(
        inputs=[self.x, self.y],
        outputs=[cost, gnorm],
        updates=updates,
        allow_input_downcast=True)

    eval_acc = theano.function(
        inputs=[self.x],
        outputs=self.pred,
        allow_input_downcast=True)

    unchanged = 0
    best_dev = 0.0
    dropout_prob = np.float64(args.dropout_rate).astype(theano.config.floatX)

    start_time = time.time()
    eval_period = args.eval_period

    perm = range(len(trainx))

    say(str(["%.2f" % np.linalg.norm(x.get_value(borrow=True))
             for x in self.params]) + "\n")

    for epoch in xrange(args.max_epochs):
        unchanged += 1
        if unchanged > 20:
            return

        train_loss = 0.0
        random.shuffle(perm)
        batches_x, batches_y = create_batches(perm, trainx, trainy, batch_size)

        N = len(batches_x)
        for i in xrange(N):
            if i % 100 == 0:
                sys.stdout.write("\r%d" % i)
                sys.stdout.flush()

            x = batches_x[i]
            y = batches_y[i]
            va, grad_norm = train_model(x, y)
            train_loss += va

            # debug
            if math.isnan(va):
                print ""
                print i - 1, i
                print x
                print y
                return

            if (i == N - 1) or (eval_period > 0 and (i + 1) % eval_period == 0):
                self.dropout.set_value(0.0)

                say("\n")
                say("Epoch %.1f\tloss=%.4f\t|g|=%s [%.2fm]\n" % (
                    epoch + (i + 1) / (N + 0.0),
                    train_loss / (i + 1),
                    float(grad_norm),
                    (time.time() - start_time) / 60.0))
                say(str(["%.2f" % np.linalg.norm(x.get_value(borrow=True))
                         for x in self.params]) + "\n")

                if dev:
                    preds = [eval_acc(x) for x in dev_batches_x]
                    nowf_dev = self.eval_accuracy(preds, dev_batches_y)
                    if nowf_dev > best_dev:
                        unchanged = 0
                        best_dev = nowf_dev
                        if args.save:
                            self.save_model(args.save, args)

                    say("\tdev accuracy=%.4f\tbest=%.4f\n" % (nowf_dev, best_dev))
                    if args.test and nowf_dev == best_dev:
                        preds = [eval_acc(x) for x in test_batches_x]
                        nowf_test = self.eval_accuracy(preds, test_batches_y)
                        say("\ttest accuracy=%.4f\n" % (nowf_test,))

                    if best_dev > nowf_dev + 0.05:
                        return

                self.dropout.set_value(dropout_prob)
                start_time = time.time()
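# Most train() variants in this file follow the early-stopping pattern used
# above: `unchanged` counts epochs (or evaluation points) since the last dev
# improvement and training stops once it exceeds a per-model patience, and
# dropout is set to 0 for every evaluation and restored to its training value
# afterwards, so the stochastic mask only applies to training batches.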
def train(self, ids_corpus, train, dev=None, test=None, heldout=None):
    args = self.args
    dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)
    batch_size = args.batch_size
    padding_id = self.padding_id
    bos_id = self.bos_id
    eos_id = self.eos_id

    #train_batches = myio.create_batches(ids_corpus, train, batch_size, padding_id, args.loss)

    updates, lr, gnorm = create_optimization_updates(
        cost=self.cost,
        params=self.params,
        lr=args.learning_rate,
        method=args.learning)[:3]

    train_func = theano.function(
        inputs=[self.idxs, self.idys],
        outputs=[self.cost, self.loss, gnorm],
        updates=updates)

    eval_func = theano.function(
        inputs=[self.idxs],
        #outputs = self.scores2
        outputs=self.scores)

    nll_func = theano.function(
        inputs=[self.idxs, self.idys],
        outputs=[self.nll, self.mask])

    say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

    result_table = PrettyTable(
        ["Epoch", "dev MAP", "dev MRR", "dev P@1", "dev P@5"] +
        ["tst MAP", "tst MRR", "tst P@1", "tst P@5"])

    unchanged = 0
    best_dev = -1
    dev_MAP = dev_MRR = dev_P1 = dev_P5 = 0
    test_MAP = test_MRR = test_P1 = test_P5 = 0
    heldout_PPL = -1

    start_time = 0
    max_epoch = args.max_epoch
    for epoch in xrange(max_epoch):
        unchanged += 1
        if unchanged > 8:
            break

        start_time = time.time()

        train_batches = myio.create_batches(ids_corpus, train, batch_size,
                                            padding_id, bos_id, eos_id,
                                            auto_encode=True)
        N = len(train_batches)

        train_cost = 0.0
        train_loss = 0.0
        train_loss2 = 0.0
        for i in xrange(N):
            # get current batch
            t1, b1, t2 = train_batches[i]

            if args.use_title:
                idxs, idys = myio.create_one_batch(t1, t2, padding_id)
                cur_cost, cur_loss, grad_norm = train_func(idxs, idys)
                train_cost += cur_cost
                train_loss += cur_loss
                train_loss2 += cur_loss / idys.shape[0]

            if args.use_body:
                idxs, idys = myio.create_one_batch(b1, t2, padding_id)
                cur_cost, cur_loss, grad_norm = train_func(idxs, idys)
                train_cost += cur_cost
                train_loss += cur_loss
                train_loss2 += cur_loss / idys.shape[0]

            if i % 10 == 0:
                say("\r{}/{}".format(i, N))

            if i == N - 1:
                self.dropout.set_value(0.0)

                if dev is not None:
                    dev_MAP, dev_MRR, dev_P1, dev_P5 = self.evaluate(dev, eval_func)
                if test is not None:
                    test_MAP, test_MRR, test_P1, test_P5 = self.evaluate(test, eval_func)
                if heldout is not None:
                    heldout_PPL = self.evaluate_perplexity(heldout, nll_func)

                if dev_MRR > best_dev:
                    unchanged = 0
                    best_dev = dev_MRR
                    result_table.add_row(
                        [epoch] +
                        ["%.2f" % x for x in
                         [dev_MAP, dev_MRR, dev_P1, dev_P5] +
                         [test_MAP, test_MRR, test_P1, test_P5]])
                    if args.model:
                        self.save_model(args.model + ".pkl.gz")

                dropout_p = np.float64(args.dropout).astype(theano.config.floatX)
                self.dropout.set_value(dropout_p)

                say("\r\n\n")
                say(("Epoch {}\tcost={:.3f}\tloss={:.3f} {:.3f}\t"
                     + "\tMRR={:.2f},{:.2f}\tPPL={:.1f}\t|g|={:.3f}\t[{:.3f}m]\n").format(
                        epoch,
                        train_cost / (i + 1),
                        train_loss / (i + 1),
                        train_loss2 / (i + 1),
                        dev_MRR,
                        best_dev,
                        heldout_PPL,
                        float(grad_norm),
                        (time.time() - start_time) / 60.0))
                say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

        say("\n")
        say("{}".format(result_table))
        say("\n")
def train(self, ids_corpus, train, dev=None, test=None):
    # `args` was referenced here without being bound; following the other
    # train() variants in this file, assume it comes from self.args.
    args = self.args
    dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)
    batch_size = args.batch_size
    padding_id = self.padding_id

    #train_batches = myio.create_batches(ids_corpus, train, batch_size, padding_id)

    updates, lr, gnorm = create_optimization_updates(
        cost=self.cost,
        params=self.params,
        lr=args.learning_rate,
        method=args.learning)[:3]

    train_func = theano.function(
        inputs=[self.idts, self.idbs, self.idps],
        outputs=[self.cost, self.loss, gnorm],
        updates=updates)

    eval_func = theano.function(
        inputs=[self.idts, self.idbs],
        outputs=self.scores,
        on_unused_input='ignore')

    say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

    result_table = PrettyTable(
        ["Epoch", "dev MAP", "dev MRR", "dev P@1", "dev P@5"] +
        ["tst MAP", "tst MRR", "tst P@1", "tst P@5"])

    unchanged = 0
    best_dev = -1
    dev_MAP = dev_MRR = dev_P1 = dev_P5 = 0
    test_MAP = test_MRR = test_P1 = test_P5 = 0

    start_time = 0
    max_epoch = args.max_epoch
    for epoch in xrange(max_epoch):
        unchanged += 1
        if unchanged > 15:
            break

        start_time = time.time()

        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(ids_corpus, train, batch_size,
                                            padding_id,
                                            pad_left=not args.average)
        N = len(train_batches)

        train_loss = 0.0
        train_cost = 0.0
        for i in xrange(N):
            # get current batch
            idts, idbs, idps = train_batches[i]

            cur_cost, cur_loss, grad_norm = train_func(idts, idbs, idps)
            train_loss += cur_loss
            train_cost += cur_cost

            if i % 10 == 0:
                say("\r{}/{}".format(i, N))

            if i == N - 1:
                self.dropout.set_value(0.0)

                if dev is not None:
                    dev_MAP, dev_MRR, dev_P1, dev_P5 = self.evaluate(dev, eval_func)
                if test is not None:
                    test_MAP, test_MRR, test_P1, test_P5 = self.evaluate(test, eval_func)

                if dev_MRR > best_dev:
                    unchanged = 0
                    best_dev = dev_MRR
                    result_table.add_row(
                        [epoch] +
                        ["%.2f" % x for x in
                         [dev_MAP, dev_MRR, dev_P1, dev_P5] +
                         [test_MAP, test_MRR, test_P1, test_P5]])
                    if args.save_model:
                        self.save_model(args.save_model)

                dropout_p = np.float64(args.dropout).astype(theano.config.floatX)
                self.dropout.set_value(dropout_p)

                say("\r\n\n")
                say(("Epoch {}\tcost={:.3f}\tloss={:.3f}"
                     + "\tMRR={:.2f},{:.2f}\t|g|={:.3f}\t[{:.3f}m]\n").format(
                        epoch,
                        train_cost / (i + 1),
                        train_loss / (i + 1),
                        dev_MRR,
                        best_dev,
                        float(grad_norm),
                        (time.time() - start_time) / 60.0))
                say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

        say("\n")
        say("{}".format(result_table))
        say("\n")
def train(self, args, train, dev, test=None):
    embedding_layer = self.layers[-2]

    dropout_prob = np.float64(args["dropout"]).astype(theano.config.floatX)
    rnn_dropout_prob = np.float64(args["rnn_dropout"]).astype(theano.config.floatX)
    batch_size = args["batch_size"]
    unroll_size = args["unroll_size"]

    train = create_batches(train, embedding_layer.map_to_ids, batch_size)
    dev = create_batches(dev, embedding_layer.map_to_ids, 1)
    if test is not None:
        test = create_batches(test, embedding_layer.map_to_ids, 1)

    cost = T.sum(self.nll) / self.idxs.shape[1]

    updates, lr, gnorm = create_optimization_updates(
        cost=cost,
        params=self.params,
        lr=args["learning_rate"],
        eps=args["eps"],
        method=args["learning"])[:3]

    train_func = theano.function(
        inputs=[self.idxs, self.idys] + self.init_state,
        outputs=[cost, gnorm] + self.last_state,
        updates=updates)

    eval_func = theano.function(
        inputs=[self.idxs, self.idys] + self.init_state,
        outputs=[self.nll] + self.last_state)

    N = (len(train[0]) - 1) / unroll_size + 1
    say(" train: {} tokens, {} mini-batches\n".format(
        len(train[0].ravel()), N))
    say(" dev: {} tokens\n".format(len(dev[0].ravel())))
    say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

    decay_epoch = args["lr_decay_epoch"]
    decay_rate = args["lr_decay"]
    lr_0 = args["learning_rate"]
    iter_cnt = 0

    depth = args["depth"]
    unchanged = 0
    best_dev = 1e+10
    start_time = 0
    max_epoch = args["max_epoch"]
    for epoch in xrange(max_epoch):
        unchanged += 1
        if unchanged > 20:
            break

        if decay_epoch > 0 and epoch >= decay_epoch:
            lr.set_value(np.float32(lr.get_value() * decay_rate))

        start_time = time.time()

        prev_state = [np.zeros((batch_size, self.n_d), dtype=theano.config.floatX)
                      for i in xrange(depth * 2)]

        train_loss = 0.0
        for i in xrange(N):
            # get current batch
            x = train[0][i * unroll_size:(i + 1) * unroll_size]
            y = train[1][i * unroll_size:(i + 1) * unroll_size]
            iter_cnt += 1

            ret = train_func(x, y, *prev_state)
            cur_loss, grad_norm, prev_state = ret[0], ret[1], ret[2:]
            train_loss += cur_loss / len(x)

            if i % 10 == 0:
                say("\r{}".format(i))

            if i == N - 1:
                self.dropout.set_value(0.0)
                self.rnn_dropout.set_value(0.0)
                dev_preds = self.evaluate(eval_func, dev, 1, unroll_size)
                dev_loss = evaluate_average(predictions=dev_preds, masks=None)
                dev_ppl = np.exp(dev_loss)
                self.dropout.set_value(dropout_prob)
                self.rnn_dropout.set_value(rnn_dropout_prob)

                say("\r\n")
                say(("Epoch={} lr={:.4f} train_loss={:.3f} train_ppl={:.1f} "
                     + "dev_loss={:.3f} dev_ppl={:.1f}\t|g|={:.3f}\t[{:.1f}m]\n").format(
                        epoch,
                        float(lr.get_value(borrow=True)),
                        train_loss / N,
                        np.exp(train_loss / N),
                        dev_loss,
                        dev_ppl,
                        float(grad_norm),
                        (time.time() - start_time) / 60.0))
                say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

                if dev_ppl < best_dev:
                    best_dev = dev_ppl
                    if test is None:
                        continue
                    self.dropout.set_value(0.0)
                    self.rnn_dropout.set_value(0.0)
                    test_preds = self.evaluate(eval_func, test, 1, unroll_size)
                    test_loss = evaluate_average(predictions=test_preds, masks=None)
                    test_ppl = np.exp(test_loss)
                    self.dropout.set_value(dropout_prob)
                    self.rnn_dropout.set_value(rnn_dropout_prob)
                    say("\tbest_dev={:.1f} test_loss={:.3f} test_ppl={:.1f}\n".format(
                        best_dev, test_loss, test_ppl))

                if best_dev < 200:
                    unchanged = 0

    say("\n")
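# The language-model train() above performs truncated BPTT: the corpus is
# sliced into `unroll_size` chunks, and the hidden state emitted by
# train_func (ret[2:], a list of (batch_size, n_d) matrices, two per layer,
# hence the depth * 2 zero matrices at the start of each epoch; the exact
# split depends on the recurrent cell) is fed back as the initial state of
# the next chunk. Gradients are therefore truncated at chunk boundaries
# while the forward state persists across the whole epoch.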
def train(self, train, dev, test, rationale_data, trained_max_epochs=None):
    args = self.args
    args.trained_max_epochs = self.trained_max_epochs = trained_max_epochs
    dropout = self.dropout
    padding_id = self.embedding_layer.vocab_map["<padding>"]

    if dev is not None:
        dev_batches_x, dev_batches_y = myio.create_batches(
            dev[0], dev[1], args.batch, padding_id)
    if test is not None:
        test_batches_x, test_batches_y = myio.create_batches(
            test[0], test[1], args.batch, padding_id)
    if rationale_data is not None:
        valid_batches_x, valid_batches_y = myio.create_batches(
            [u["xids"] for u in rationale_data],
            [u["y"] for u in rationale_data],
            args.batch, padding_id, sort=False)

    start_time = time.time()
    train_batches_x, train_batches_y = myio.create_batches(
        train[0], train[1], args.batch, padding_id)
    say("{:.2f}s to create training batches\n\n".format(time.time() - start_time))

    updates_e, lr_e, gnorm_e = create_optimization_updates(
        cost=self.encoder.cost_e,
        params=self.encoder.params,
        method=args.learning,
        beta1=args.beta1,
        beta2=args.beta2,
        lr=args.learning_rate)[:3]

    updates_g, lr_g, gnorm_g = create_optimization_updates(
        cost=self.encoder.cost_g,
        params=self.generator.params,
        method=args.learning,
        beta1=args.beta1,
        beta2=args.beta2,
        lr=args.learning_rate)[:3]

    sample_generator = theano.function(
        inputs=[self.x],
        outputs=self.z,
        #updates = self.generator.sample_updates
    )

    get_loss_and_pred = theano.function(
        inputs=[self.x, self.y],
        outputs=[self.encoder.loss_vec, self.encoder.preds, self.z],
        #updates = self.generator.sample_updates
    )

    eval_generator = theano.function(
        inputs=[self.x, self.y],
        outputs=[self.z, self.encoder.obj, self.encoder.loss,
                 self.encoder.pred_diff],
        #updates = self.generator.sample_updates
    )

    sample_encoder = theano.function(
        inputs=[self.x, self.y, self.z],
        outputs=[self.encoder.obj, self.encoder.loss,
                 self.encoder.pred_diff],
        #updates = self.generator.sample_updates
    )

    train_generator = theano.function(
        inputs=[self.x, self.y],
        outputs=[self.encoder.obj, self.encoder.loss,
                 self.encoder.sparsity_cost, self.z, self.word_embs,
                 gnorm_e, gnorm_g],
        updates=updates_e.items() + updates_g.items()
        #+ self.generator.sample_updates,
    )

    eval_period = args.eval_period
    unchanged = 0
    best_dev = 1e+2
    best_dev_e = 1e+2
    last_train_avg_cost = None
    last_dev_avg_cost = None
    tolerance = 0.10 + 1e-3
    dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)

    for epoch_ in xrange(args.max_epochs - 50):  # -50 when max_epochs = 100 given
        #print(" max epochs in train func: ", args.max_epochs)
        epoch = args.trained_max_epochs + epoch_
        unchanged += 1
        if unchanged > 25:
            print 'dev set increases more than 25 times after the best dev found'
            #return

        train_batches_x, train_batches_y = myio.create_batches(
            train[0], train[1], args.batch, padding_id)

        more = True
        if args.decay_lr:
            param_bak = [p.get_value(borrow=False) for p in self.params]

        start_train_generate = time.time()
        more_counter = 0
        while more:
            processed = 0
            train_cost = 0.0
            train_loss = 0.0
            train_sparsity_cost = 0.0
            p1 = 0.0
            start_time = time.time()

            N = len(train_batches_x)
            #print(" beginning : ", train_cost)
            for i in xrange(N):
                if (i + 1) % 100 == 0:
                    say("\r{}/{} {:.2f} ".format(i + 1, N, p1 / (i + 1)))

                bx, by = train_batches_x[i], train_batches_y[i]
                mask = bx != padding_id

                start_train_time = time.time()
                cost, loss, sparsity_cost, bz, emb, gl2_e, gl2_g = train_generator(bx, by)
                #print('gl2_g: ', gl2_g)

                k = len(by)
                processed += k
                train_cost += cost
                train_loss += loss
                train_sparsity_cost += sparsity_cost
                p1 += np.sum(bz * mask) / (np.sum(mask) + 1e-8)

            cur_train_avg_cost = train_cost / N
            #print(" end : ", cur_train_avg_cost)
            say("train generate time: {} \n".format(time.time() - start_train_generate))

            if dev:
                self.dropout.set_value(0.0)
                start_dev_time = time.time()
                dev_obj, dev_loss, dev_diff, dev_p1 = self.evaluate_data(
                    dev_batches_x, dev_batches_y, eval_generator, sampling=True)
                self.dropout.set_value(dropout_prob)
                say("dev evaluate data time: {} \n".format(time.time() - start_dev_time))
                cur_dev_avg_cost = dev_obj

            more = False
            if args.decay_lr and last_train_avg_cost is not None:
                if cur_train_avg_cost > last_train_avg_cost * (1 + tolerance):
                    more = True
                    say("\nTrain cost {} --> {}\n".format(
                        last_train_avg_cost, cur_train_avg_cost))
                if dev and cur_dev_avg_cost > last_dev_avg_cost * (1 + tolerance):
                    more = True
                    say("\nDev cost {} --> {}\n".format(
                        last_dev_avg_cost, cur_dev_avg_cost))

            if more:
                more_counter += 1
                if more_counter < 20:
                    more = False

            if more:
                more_counter = 0
                lr_val = lr_g.get_value() * 0.5
                lr_val = np.float64(lr_val).astype(theano.config.floatX)
                lr_g.set_value(lr_val)
                lr_e.set_value(lr_val)
                say("Decrease learning rate to {} at epoch {}\n".format(
                    float(lr_val), epoch_ + 1))
                for p, v in zip(self.params, param_bak):
                    #print('param restoring: ', p, v)
                    p.set_value(v)
                continue

        last_train_avg_cost = cur_train_avg_cost
        if dev:
            last_dev_avg_cost = cur_dev_avg_cost

        say("\n")
        say(("Generator Epoch {:.2f} costg={:.4f} scost={:.4f} lossg={:.4f} "
             + "p[1]={:.2f} |g|={:.4f} {:.4f}\t[{:.2f}m / {:.2f}m]\n").format(
                epoch + (i + 1.0) / N,
                train_cost / N,
                train_sparsity_cost / N,
                train_loss / N,
                p1 / N,
                float(gl2_e),
                float(gl2_g),
                (time.time() - start_time) / 60.0,
                (time.time() - start_time) / 60.0 / (i + 1) * N))
        say("\t" + str(["{:.2f}".format(np.linalg.norm(x.get_value(borrow=True)))
                        for x in self.encoder.params]) + "\n")
        say("\t" + str(["{:.2f}".format(np.linalg.norm(x.get_value(borrow=True)))
                        for x in self.generator.params]) + "\n")
        # NOTE: total_encode_time / total_generate_time are not defined in
        # this function; they are presumably accumulated elsewhere.
        say("total encode time = {} total generator time = {} \n".format(
            total_encode_time, total_generate_time))

        if epoch_ % args.save_every == 0:  #and epoch_ > 0:
            print 'saving model after epoch -', epoch_ + 1, ' file name: ', args.save_model + str(epoch_)
            self.save_model(args.save_model + str(epoch_), args)

        if dev:
            if dev_obj < best_dev:
                best_dev = dev_obj
                unchanged = 0
                if args.dump and rationale_data:
                    self.dump_rationales(args.dump, valid_batches_x,
                                         valid_batches_y, get_loss_and_pred,
                                         sample_generator)
                if args.save_model:
                    print 'saving best model after epoch -', epoch_ + 1, ' file name: ', args.save_model
                    self.save_model(args.save_model, args)

            say(("\tsampling devg={:.4f} mseg={:.4f} avg_diffg={:.4f}"
                 + " p[1]g={:.2f} best_dev={:.4f}\n").format(
                    dev_obj, dev_loss, dev_diff, dev_p1, best_dev))

            if rationale_data is not None:
                self.dropout.set_value(0.0)
                start_rational_time = time.time()
                #r_mse, r_p1, r_prec1, r_prec2 = self.evaluate_rationale(
                #    rationale_data, valid_batches_x,
                #    valid_batches_y, eval_generator)
                r_mse, r_p1, r_prec1, r_prec2, gen_time, enc_time, prec_cal_time = \
                    self.evaluate_rationale(rationale_data, valid_batches_x,
                                            valid_batches_y, sample_generator,
                                            sample_encoder, eval_generator)
                self.dropout.set_value(dropout_prob)

                say(("\trationale mser={:.4f} p[1]r={:.2f} prec1={:.4f}"
                     + " prec2={:.4f} time needed for rational={}\n").format(
                        r_mse, r_p1, r_prec1, r_prec2,
                        time.time() - start_rational_time))
def train(self, train, dev, test):
    args = self.args
    x_train, y_train = train
    batch = args.batch
    test_batch = args.test_batch
    score_scale = args.score_scale

    if dev:
        x_dev_batches, y_dev_batches, ay_dev_batches, ayy_dev_batches, \
            ay_mask_dev_batches, w_mask_dev_batches, w_len_dev_batches, \
            sent_maxlen_dev_batches, sent_num_dev_batches = create_batches(
                range(len(dev[0])), dev[0], dev[1], test_batch, score_scale)
    if test:
        x_test_batches, y_test_batches, ay_test_batches, ayy_test_batches, \
            ay_mask_test_batches, w_mask_test_batches, w_len_test_batches, \
            sent_maxlen_test_batches, sent_num_test_batches = create_batches(
                range(len(test[0])), test[0], test[1], test_batch, score_scale)

    cost = self.l2_sqr + self.nll_loss_ay

    print 'Building graph...'
    updates, lr, gnorm = create_optimization_updates(
        cost=cost,
        params=self.params,
        lr=args.learning_rate,
        method=args.learning)[:3]

    train_model = theano.function(
        inputs=[self.x, self.y, self.ay, self.aay, self.ay_mask,
                self.w_masks, self.w_lens, self.s_maxlen, self.s_num],
        outputs=[cost, gnorm],
        updates=updates,
        allow_input_downcast=True)

    eval_acc = theano.function(
        inputs=[self.x, self.w_masks, self.w_lens, self.s_maxlen, self.s_num],
        outputs=[self.pred_ay],  #, self.output],
        allow_input_downcast=True)

    unchanged = 0
    best_dev_result = 0.0
    dropout_rate = np.float64(args.dropout_rate).astype(theano.config.floatX)

    start_time = time.time()
    eval_period = args.eval_period

    perm = range(len(x_train))

    say(str(["%.2f" % np.linalg.norm(x.get_value(borrow=True))
             for x in self.params]) + "\n")

    if args.load:
        self.dropout.set_value(0.0)
        preds = [eval_acc(x, wm, wl, sm, sn)
                 for x, wm, wl, sm, sn in zip(
                     x_dev_batches, w_mask_dev_batches, w_len_dev_batches,
                     sent_maxlen_dev_batches, sent_num_dev_batches)]
        ay_pred = [pred[0] for pred in preds]
        results = self.eval_accuracy(ay_pred, ay_dev_batches)
        best_dev_result = results[1]
        say("\tDEV RMSE/BEST_ACCURACY/ACCURACY=%.4f_%.4f_%.4f\n" % (
            results[0], best_dev_result, results[1]))

        preds = [eval_acc(x, wm, wl, sm, sn)
                 for x, wm, wl, sm, sn in zip(
                     x_test_batches, w_mask_test_batches, w_len_test_batches,
                     sent_maxlen_test_batches, sent_num_test_batches)]
        ay_pred = [pred[0] for pred in preds]
        results = self.eval_accuracy(ay_pred, ay_test_batches)
        say("\tTEST RMSE/ACCURACY=%.4f_%.4f\n" % (results[0], results[1],))

    for epoch in xrange(args.max_epochs):
        self.dropout.set_value(dropout_rate)
        unchanged += 1
        if unchanged > 20:
            return

        train_loss = 0.0
        random.shuffle(perm)
        x_batches, y_batches, ay_batches, aay_batches, ay_mask_batches, \
            w_mask_batches, w_len_batches, sent_maxlen_batches, \
            sent_num_batches = create_batches(perm, x_train, y_train,
                                              batch, score_scale)

        N = len(x_batches)
        for i in xrange(N):
            if (i + 1) % 100 == 0:
                sys.stdout.write("\r%d" % i)
                sys.stdout.flush()

            x = x_batches[i]
            y = y_batches[i]
            va, grad_norm = train_model(x, y, ay_batches[i], aay_batches[i],
                                        ay_mask_batches[i], w_mask_batches[i],
                                        w_len_batches[i],
                                        sent_maxlen_batches[i],
                                        sent_num_batches[i])
            train_loss += va

            # debug
            if math.isnan(va):
                return

            if (i == N - 1) or (eval_period > 0 and (i + 1) % eval_period == 0):
                self.dropout.set_value(0.0)

                say("\n")
                say("Epoch %.3f\tloss=%.4f\t|g|=%s [%.2fm]\n" % (
                    epoch + (i + 1) / (N + 0.0),
                    train_loss / (i + 1),
                    float(grad_norm),
                    (time.time() - start_time) / 60.0))
                say(str(["%.2f" % np.linalg.norm(x.get_value(borrow=True))
                         for x in self.params]) + "\n")

                if dev:
                    preds = [eval_acc(x, wm, wl, sm, sn)
                             for x, wm, wl, sm, sn in zip(
                                 x_dev_batches, w_mask_dev_batches,
                                 w_len_dev_batches, sent_maxlen_dev_batches,
                                 sent_num_dev_batches)]
                    ay_pred = [pred[0] for pred in preds]
                    results = self.eval_accuracy(ay_pred, ay_dev_batches)
                    say("\tDEV RMSE/BEST_ACCURACY/ACCURACY=%.4f_%.4f_%.4f\n" % (
                        results[0], best_dev_result, results[1]))

                    if results[1] > best_dev_result:
                        unchanged = 0
                        best_dev_result = results[1]
                        if args.save:
                            self.save_model(args.save, args)

                        preds = [eval_acc(x, wm, wl, sm, sn)
                                 for x, wm, wl, sm, sn in zip(
                                     x_test_batches, w_mask_test_batches,
                                     w_len_test_batches,
                                     sent_maxlen_test_batches,
                                     sent_num_test_batches)]
                        ay_pred = [pred[0] for pred in preds]
                        results_test = self.eval_accuracy(ay_pred, ay_test_batches)
                        say("\tTEST RMSE/ACCURACY=%.4f_%.4f\n" % (
                            results_test[0], results_test[1]))

                    if best_dev_result > results[0] + 0.2:
                        return

                self.dropout.set_value(dropout_rate)
                start_time = time.time()
def train(self, args, train, dev, test=None):
    embedding_layer = self.layers[0]

    dropout_prob = np.float64(args["dropout"]).astype(theano.config.floatX)
    batch_size = args["batch_size"]
    unroll_size = args["unroll_size"]

    train = create_batches(train, embedding_layer.map_to_ids, batch_size)
    dev = create_batches(dev, embedding_layer.map_to_ids, batch_size)
    if test is not None:
        test = create_batches(test, embedding_layer.map_to_ids, batch_size)

    cost = T.sum(self.nll) / self.idxs.shape[1]

    updates, lr, gnorm = create_optimization_updates(
        cost=cost,
        params=self.params,
        lr=args["learning_rate"],
        beta1=args["beta1"],
        beta2=args["beta2"],
        rho=args["rho"],
        momentum=args["momentum"],
        gamma=args["gamma"],
        eps=args["eps"],
        method=args["learning"])[:3]

    #if args["learning"] == "adadelta":
    #    lr.set_value(args["learning_rate"])

    train_func = theano.function(
        inputs=[self.idxs, self.idys, self.init_state],
        outputs=[cost, self.last_state, gnorm],
        updates=updates)

    eval_func = theano.function(
        inputs=[self.idxs, self.idys, self.init_state],
        outputs=[self.nll, self.last_state])

    N = (len(train[0]) - 1) / unroll_size + 1
    say(" train: {} tokens, {} mini-batches\n".format(
        len(train[0].ravel()), N))
    say(" dev: {} tokens\n".format(len(dev[0].ravel())))
    say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

    decay_lr = args["decay_lr"] and args["learning"].lower() != "adadelta" and \
        args["learning"].lower() != "adagrad"
    lr_0 = args["learning_rate"]
    iter_cnt = 0

    unchanged = 0
    best_dev = 1e+10
    start_time = 0
    max_epoch = args["max_epoch"]
    for epoch in xrange(max_epoch):
        if unchanged > 5:
            break
        start_time = time.time()

        prev_state = np.zeros((batch_size, self.n_d * 2),
                              dtype=theano.config.floatX)

        train_loss = 0.0
        for i in xrange(N):
            # get current batch
            x = train[0][i * unroll_size:(i + 1) * unroll_size]
            y = train[1][i * unroll_size:(i + 1) * unroll_size]
            iter_cnt += 1
            if decay_lr:
                lr.set_value(np.float32(lr_0 / iter_cnt ** 0.5))

            cur_loss, prev_state, grad_norm = train_func(x, y, prev_state)
            train_loss += cur_loss / len(x)
            if math.isnan(cur_loss) or math.isnan(grad_norm):
                say("\nNaN !!\n")
                return

            if i % 10 == 0:
                say("\r{}".format(i))

            if i == N - 1:
                self.dropout.set_value(0.0)
                dev_preds = self.evaluate(eval_func, dev, batch_size, unroll_size)
                dev_loss = evaluate_average(predictions=dev_preds, masks=None)
                dev_ppl = np.exp(dev_loss)
                self.dropout.set_value(dropout_prob)

                say("\r\n")
                say(("Epoch={} lr={:.3f} train_loss={:.3f} train_ppl={:.1f} "
                     + "dev_loss={:.3f} dev_ppl={:.1f}\t|g|={:.3f}\t[{:.1f}m]\n").format(
                        epoch,
                        float(lr.get_value(borrow=True)),
                        train_loss / N,
                        np.exp(train_loss / N),
                        dev_loss,
                        dev_ppl,
                        float(grad_norm),
                        (time.time() - start_time) / 60.0))
                say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

                # halve the learning rate
                #if args["learning"] == "sgd" and dev_ppl > best_dev-1:
                #    lr.set_value(np.max([lr.get_value()/2.0, np.float32(0.0001)]))

                if dev_ppl < best_dev:
                    best_dev = dev_ppl
                    if test is None:
                        continue
                    self.dropout.set_value(0.0)
                    test_preds = self.evaluate(eval_func, test, batch_size,
                                               unroll_size)
                    test_loss = evaluate_average(predictions=test_preds,
                                                 masks=None)
                    test_ppl = np.exp(test_loss)
                    self.dropout.set_value(dropout_prob)
                    say("\tbest_dev={:.1f} test_loss={:.3f} test_ppl={:.1f}\n".format(
                        best_dev, test_loss, test_ppl))

                if best_dev > 200:
                    unchanged += 1

    say("\n")
def train(self):
    args = self.args
    train_x, train_y = self.train_set
    dev_x, dev_y = self.dev_set
    test_x, test_y = self.test_set

    updates, lr, gnorm = create_optimization_updates(
        cost=self.cost,
        params=self.params,
        lr=args.learning_rate,
        rho=args.rho,
        beta1=args.beta1,
        beta2=args.beta2,
        momentum=args.momentum,
        gamma=args.gamma,
        method=args.learning)[:3]

    batch = args.batch
    index = self.index
    x = self.x
    y = self.y

    train_func = theano.function(
        inputs=[index],
        outputs=[self.cost, gnorm],
        givens={
            x: train_x[index * batch:(index + 1) * batch],
            y: train_y[index * batch:(index + 1) * batch]
        },
        updates=updates)

    dev_func = theano.function(
        inputs=[index],
        outputs=[self.err, self.loss],
        givens={
            x: dev_x[index * batch:(index + 1) * batch],
            y: dev_y[index * batch:(index + 1) * batch]
        })

    test_func = theano.function(
        inputs=[index],
        outputs=[self.err, self.loss],
        givens={
            x: test_x[index * batch:(index + 1) * batch],
            y: test_y[index * batch:(index + 1) * batch]
        })

    decay_lr = args.decay_lr and args.learning.lower() != "adadelta" and \
        args.learning.lower() != "adagrad"
    lr_0 = args.learning_rate
    iter_cnt = 0

    N = train_x.get_value(borrow=True).shape[0]
    num_batches = (N - 1) / batch + 1
    processed = 0
    period = args.eval_period

    best_dev_err = 1.0
    max_epochs = args.max_epochs
    for epoch in xrange(max_epochs):
        start_time = time.time()
        tot_cost = 0
        for i in xrange(num_batches):
            iter_cnt += 1
            if decay_lr:
                lr.set_value(np.float32(lr_0 / iter_cnt ** 0.5))
            cost, grad_norm = train_func(i)
            tot_cost += cost
            if math.isnan(cost):
                say("NaN !!\n")
                return

            ed = min(N, (i + 1) * batch)
            prev = processed / period
            processed += ed - i * batch
            if (i == num_batches - 1) or (processed / period > prev):
                say("Epoch={:.1f} Sample={} cost={:.4f} |g|={:.2f}\t[{:.1f}m]\n".format(
                    epoch + (i + 1.0) / num_batches,
                    processed,
                    tot_cost / (i + 1),
                    float(grad_norm),
                    (time.time() - start_time) / 60.0))

                dev_err, dev_loss = self.evaluate(dev_func, dev_x)
                best_dev_err = min(best_dev_err, dev_err)
                say("\tdev_err={:.4f} dev_loss={:.4f} best_dev={:.4f}\n".format(
                    dev_err, dev_loss, best_dev_err))

                if dev_err == best_dev_err:
                    test_err, test_loss = self.evaluate(test_func, test_x)
                    say("\ttest_err={:.4f} test_loss={:.4f}\n".format(
                        test_err, test_loss))
                say("\n")
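# Unlike the other train() variants, the function above never feeds numpy
# arrays at call time: train_x/dev_x/test_x are Theano shared variables, and
# `givens` rewires x/y to a slice selected by the integer `index` input. The
# data therefore stays resident on the device, and each minibatch is fetched
# by index arithmetic instead of a host-to-device transfer per call.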
def train(self, ids_corpus, train, dev=None, test=None):
    # `args` was referenced here without being bound; as in the other
    # train() variants in this file, assume it comes from self.args.
    args = self.args
    dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)
    batch_size = args.batch_size
    padding_id = self.padding_id

    #train_batches = myio.create_batches(ids_corpus, train, batch_size, padding_id)

    if dev is not None:
        dev, dev_raw = dev
    if test is not None:
        test, test_raw = test

    if args.joint:
        updates_e, lr_e, gnorm_e = create_optimization_updates(
            cost=self.encoder.cost_e,  #self.encoder.cost,
            params=self.encoder.params,
            lr=args.learning_rate * 0.1,
            method=args.learning)[:3]
    else:
        updates_e = {}

    updates_g, lr_g, gnorm_g = create_optimization_updates(
        cost=self.encoder.cost_g,
        params=self.generator.params,
        lr=args.learning_rate,
        method=args.learning)[:3]

    train_func = theano.function(
        inputs=[self.x, self.triples, self.pairs],
        outputs=[self.encoder.obj, self.encoder.loss,
                 self.encoder.sparsity_cost, self.generator.p1, gnorm_g],
        # updates = updates_g.items() + updates_e.items() + self.generator.sample_updates,
        updates=collections.OrderedDict(
            list(updates_g.items()) + list(updates_e.items()) +
            list(self.generator.sample_updates.items())),
        #no_default_updates = True,
        on_unused_input="ignore")

    eval_func = theano.function(
        inputs=[self.x],
        outputs=self.encoder.scores)

    eval_func2 = theano.function(
        inputs=[self.x],
        outputs=[self.encoder.scores_z, self.generator.p1, self.z],
        updates=self.generator.sample_updates,
        #no_default_updates = True
    )

    say("\tp_norm: {}\n".format(self.get_pnorm_stat(self.encoder.params)))
    say("\tp_norm: {}\n".format(self.get_pnorm_stat(self.generator.params)))

    result_table = PrettyTable(
        ["Epoch", "dev MAP", "dev MRR", "dev P@1", "dev P@5"] +
        ["tst MAP", "tst MRR", "tst P@1", "tst P@5"])

    last_train_avg_cost = None
    tolerance = 0.5 + 1e-3
    unchanged = 0
    best_dev = -1
    dev_MAP = dev_MRR = dev_P1 = dev_P5 = 0
    test_MAP = test_MRR = test_P1 = test_P5 = 0

    start_time = 0
    max_epoch = args.max_epoch
    for epoch in range(max_epoch):
        unchanged += 1
        if unchanged > 20:
            break

        start_time = time.time()

        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(ids_corpus, train, batch_size,
                                            padding_id,
                                            pad_left=not args.average,
                                            merge=args.merge)
        N = len(train_batches)

        more = True
        param_bak = [p.get_value(borrow=False) for p in self.params]

        while more:
            train_loss = 0.0
            train_cost = 0.0
            train_scost = 0.0
            train_p1 = 0.0
            for i in range(N):
                # get current batch
                idts, triples, pairs = train_batches[i]

                cur_cost, cur_loss, cur_scost, cur_p1, gnormg = train_func(
                    idts, triples, pairs)
                train_loss += cur_loss
                train_cost += cur_cost
                train_scost += cur_scost
                train_p1 += cur_p1

                if i % 10 == 0:
                    say("\r{}/{} {:.3f}".format(i, N, train_p1 / (i + 1)))

            cur_train_avg_cost = train_cost / N

            more = False
            if last_train_avg_cost is not None:
                if cur_train_avg_cost > last_train_avg_cost * (1 + tolerance):
                    more = True
                    say("\nTrain cost {} --> {}\n".format(
                        last_train_avg_cost, cur_train_avg_cost))

            if more:
                lr_val = lr_g.get_value() * 0.5
                if lr_val < 1e-5:
                    return
                lr_val = np.float64(lr_val).astype(theano.config.floatX)
                lr_g.set_value(lr_val)
                lr_e.set_value(lr_val)
                say("Decrease learning rate to {}\n".format(float(lr_val)))
                for p, v in zip(self.params, param_bak):
                    p.set_value(v)
                continue

        last_train_avg_cost = cur_train_avg_cost
        say("\r\n\n")
        say(("Epoch {} cost={:.3f} loss={:.3f} scost={:.3f}"
             + " P[1]={:.3f} |g|={:.3f}\t[{:.3f}m]\n").format(
                epoch,
                train_cost / N,
                train_loss / N,
                train_scost / N,
                train_p1 / N,
                float(gnormg),
                (time.time() - start_time) / 60.0))
        say("\tp_norm: {}\n".format(self.get_pnorm_stat(self.encoder.params)))
        say("\tp_norm: {}\n".format(self.get_pnorm_stat(self.generator.params)))

        self.dropout.set_value(0.0)

        if dev is not None:
            full_MAP, full_MRR, full_P1, full_P5 = self.evaluate(dev, eval_func)
            dev_MAP, dev_MRR, dev_P1, dev_P5, dev_PZ1, dev_PT = self.evaluate_z(
                dev, dev_raw, ids_corpus, eval_func2)
        if test is not None:
            test_MAP, test_MRR, test_P1, test_P5, test_PZ1, test_PT = \
                self.evaluate_z(test, test_raw, ids_corpus, eval_func2)

        if dev_MAP > best_dev:
            best_dev = dev_MAP
            unchanged = 0

        say("\n")
        say("  fMAP={:.2f} fMRR={:.2f} fP1={:.2f} fP5={:.2f}\n".format(
            full_MAP, full_MRR, full_P1, full_P5))
        say("\n")
        say(("  dMAP={:.2f} dMRR={:.2f} dP1={:.2f} dP5={:.2f}"
             + " dP[1]={:.3f} d%T={:.3f} best_dev={:.2f}\n").format(
                dev_MAP, dev_MRR, dev_P1, dev_P5, dev_PZ1, dev_PT, best_dev))

        result_table.add_row(
            [epoch] +
            ["%.2f" % x for x in
             [dev_MAP, dev_MRR, dev_P1, dev_P5] +
             [test_MAP, test_MRR, test_P1, test_P5]])

        if unchanged == 0:
            say("\n")
            say(("  tMAP={:.2f} tMRR={:.2f} tP1={:.2f} tP5={:.2f}"
                 + " tP[1]={:.3f} t%T={:.3f}\n").format(
                    test_MAP, test_MRR, test_P1, test_P5, test_PZ1, test_PT))
            if args.dump_rationale:
                self.evaluate_z(dev + test, dev_raw + test_raw, ids_corpus,
                                eval_func2, args.dump_rationale)

        #if args.save_model:
        #    self.save_model(args.save_model)

        dropout_p = np.float64(args.dropout).astype(theano.config.floatX)
        self.dropout.set_value(dropout_p)

        say("\n")
        say("{}".format(result_table))
        say("\n")

        if train_p1 / N <= 1e-4 or train_p1 / N + 1e-4 >= 1.0:
            break
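# The `while more:` loop above implements a halve-and-rollback schedule: if
# the epoch's average training cost exceeds the previous epoch's by more than
# `tolerance`, the learning rate of both optimizers is halved, the parameters
# are restored from the `param_bak` snapshot, and the epoch is re-run;
# training aborts entirely once the rate decays below 1e-5.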
def train(self, train, dev, test):
    args = self.args
    trainx, trainy, trainu = train
    batch_size = args.batch

    if dev:
        dev_batches_x, dev_batches_y, dev_batches_u, dev_batches_w_masks, \
            dev_batches_w_lens, dev_batches_sent_maxlen, dev_batches_sent_num = \
            create_batches_doc(range(len(dev[0])), dev[0], dev[1], dev[2], 128)
    if test:
        test_batches_x, test_batches_y, test_batches_u, test_batches_w_masks, \
            test_batches_w_lens, test_batches_sent_maxlen, test_batches_sent_num = \
            create_batches_doc(range(len(test[0])), test[0], test[1], test[2], 128)

    cost = self.nll_loss + self.l2_sqr

    updates, lr, gnorm = create_optimization_updates(
        cost=cost,
        params=self.params,
        lr=args.learning_rate,
        method=args.learning)[:3]

    train_model = theano.function(
        inputs=[self.x, self.y, self.usr, self.w_masks, self.w_lens,
                self.s_ml, self.s_num],
        outputs=[cost, gnorm],
        updates=updates,
        allow_input_downcast=True)

    eval_acc = theano.function(
        inputs=[self.x, self.usr, self.w_masks, self.w_lens,
                self.s_ml, self.s_num],
        outputs=self.pred,
        allow_input_downcast=True)

    unchanged = 0
    best_dev = 0.0
    dropout_prob = np.float64(args.dropout_rate).astype(theano.config.floatX)

    start_time = time.time()
    eval_period = args.eval_period

    perm = range(len(trainx))

    say(str(["%.2f" % np.linalg.norm(x.get_value(borrow=True))
             for x in self.params]) + "\n")

    if args.load:
        self.dropout.set_value(0.0)
        preds = [eval_acc(x, u, wm, wl, sm, sn)
                 for x, u, wm, wl, sm, sn in zip(
                     dev_batches_x, dev_batches_u, dev_batches_w_masks,
                     dev_batches_w_lens, dev_batches_sent_maxlen,
                     dev_batches_sent_num)]
        best_dev = self.eval_accuracy(preds, dev_batches_y)
        mse_dev = self.eval_accuracy_mse(preds, dev_batches_y)
        say("\tdev mse = %.4f\taccuracy=%.4f\tbest=%.4f\n" % (
            mse_dev, best_dev, best_dev))

        preds = [eval_acc(x, u, wm, wl, sm, sn)
                 for x, u, wm, wl, sm, sn in zip(
                     test_batches_x, test_batches_u, test_batches_w_masks,
                     test_batches_w_lens, test_batches_sent_maxlen,
                     test_batches_sent_num)]
        nowf_test = self.eval_accuracy(preds, test_batches_y)
        mse_test = self.eval_accuracy_mse(preds, test_batches_y)
        say("\tdev mse = %.4f\ttest accuracy=%.4f\n" % (mse_test, nowf_test))

    test_a = 0.0
    for epoch in xrange(args.max_epochs):
        unchanged += 1
        if unchanged > 20:
            return

        train_loss = 0.0
        random.shuffle(perm)
        batches_x, batches_y, batches_u, batches_w_masks, batches_w_lens, \
            batches_sent_maxlen, batches_sent_num = create_batches_doc(
                perm, trainx, trainy, trainu, batch_size)

        N = len(batches_x)
        for i in xrange(N):
            self.dropout.set_value(dropout_prob)
            if (i + 1) % 100 == 0:
                sys.stdout.write("\r%d" % i)
                sys.stdout.flush()

            x = batches_x[i]
            y = batches_y[i]
            va, grad_norm = train_model(x, y, batches_u[i], batches_w_masks[i],
                                        batches_w_lens[i],
                                        batches_sent_maxlen[i],
                                        batches_sent_num[i])
            train_loss += va

            # debug
            if math.isnan(va):
                print()
                print(i - 1, i)
                print(x)
                print(y)
                #print(batches_w_masks[i])
                #print(batches_w_lens[i])
                print(batches_sent_maxlen[i])
                print(batches_sent_num[i])
                return

            if (i == N - 1) or (eval_period > 0 and (i + 1) % eval_period == 0):
                self.dropout.set_value(0.0)

                say("\n")
                say("Epoch %.3f\tloss=%.4f\t|g|=%s [%.2fm]\n" % (
                    epoch + (i + 1) / (N + 0.0),
                    train_loss / (i + 1),
                    float(grad_norm),
                    (time.time() - start_time) / 60.0))
                say(str(["%.2f" % np.linalg.norm(x.get_value(borrow=True))
                         for x in self.params]) + "\n")

                if dev:
                    preds = [eval_acc(x, u, wm, wl, sm, sn)
                             for x, u, wm, wl, sm, sn in zip(
                                 dev_batches_x, dev_batches_u,
                                 dev_batches_w_masks, dev_batches_w_lens,
                                 dev_batches_sent_maxlen,
                                 dev_batches_sent_num)]
                    nowf_dev = self.eval_accuracy(preds, dev_batches_y)
                    if nowf_dev > best_dev:
                        unchanged = 0
                        best_dev = nowf_dev
                        if args.save:
                            self.save_model(args.save, args)

                    say("\tdev accuracy=%.4f\tbest=%.4f\n" % (nowf_dev, best_dev))
                    say("\ttest current_accuracy=%.4f\n" % (test_a))
                    if args.test and nowf_dev == best_dev:
                        preds = [eval_acc(x, u, wm, wl, sm, sn)
                                 for x, u, wm, wl, sm, sn in zip(
                                     test_batches_x, test_batches_u,
                                     test_batches_w_masks, test_batches_w_lens,
                                     test_batches_sent_maxlen,
                                     test_batches_sent_num)]
                        nowf_test = self.eval_accuracy(preds, test_batches_y)
                        say("\ttest accuracy=%.4f\n" % (nowf_test,))
                        test_a = nowf_test

                    if best_dev > nowf_dev + 0.5:
                        return

                self.dropout.set_value(dropout_prob)
                start_time = time.time()
def train(self, train, dev, test, rationale_data):
    args = self.args
    dropout = self.dropout
    padding_id = self.embedding_layer.vocab_map["<padding>"]

    if dev is not None:
        dev_batches_x, dev_batches_y = myio.create_batches(
            dev[0], dev[1], args.batch, padding_id)
    if test is not None:
        test_batches_x, test_batches_y = myio.create_batches(
            test[0], test[1], args.batch, padding_id)
    if rationale_data is not None:
        valid_batches_x, valid_batches_y = myio.create_batches(
            [u["xids"] for u in rationale_data],
            [u["y"] for u in rationale_data],
            args.batch, padding_id, sort=False)

    start_time = time.time()
    train_batches_x, train_batches_y = myio.create_batches(
        train[0], train[1], args.batch, padding_id)
    say("{:.2f}s to create training batches\n\n".format(time.time() - start_time))

    updates_e, lr_e, gnorm_e = create_optimization_updates(
        cost=self.generator.cost_e,
        params=self.encoder.params,
        method=args.learning,
        lr=args.learning_rate)[:3]

    updates_g, lr_g, gnorm_g = create_optimization_updates(
        cost=self.generator.cost,
        params=self.generator.params,
        method=args.learning,
        lr=args.learning_rate)[:3]

    sample_generator = theano.function(
        inputs=[self.x],
        outputs=self.z_pred,
        #updates = self.generator.sample_updates
        #allow_input_downcast = True
    )

    get_loss_and_pred = theano.function(
        inputs=[self.x, self.z, self.y],
        outputs=[self.generator.loss_vec, self.encoder.preds])

    eval_generator = theano.function(
        inputs=[self.x, self.y],
        outputs=[self.z, self.generator.obj, self.generator.loss,
                 self.encoder.pred_diff],
        givens={self.z: self.generator.z_pred},
        #updates = self.generator.sample_updates,
        #no_default_updates = True
    )

    train_generator = theano.function(
        inputs=[self.x, self.y],
        outputs=[self.generator.obj, self.generator.loss,
                 self.generator.sparsity_cost, self.z, gnorm_g, gnorm_e],
        givens={self.z: self.generator.z_pred},
        #updates = updates_g,
        updates=updates_g.items() + updates_e.items()
        #+ self.generator.sample_updates,
        #no_default_updates = True
    )

    eval_period = args.eval_period
    unchanged = 0
    best_dev = 1e+2
    best_dev_e = 1e+2
    dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)

    for epoch in xrange(args.max_epochs):
        unchanged += 1
        if unchanged > 10:
            return

        train_batches_x, train_batches_y = myio.create_batches(
            train[0], train[1], args.batch, padding_id)

        processed = 0
        train_cost = 0.0
        train_loss = 0.0
        train_sparsity_cost = 0.0
        p1 = 0.0
        start_time = time.time()

        N = len(train_batches_x)
        for i in xrange(N):
            if (i + 1) % 100 == 0:
                say("\r{}/{} ".format(i + 1, N))

            bx, by = train_batches_x[i], train_batches_y[i]
            mask = bx != padding_id

            cost, loss, sparsity_cost, bz, gl2_g, gl2_e = train_generator(bx, by)

            k = len(by)
            processed += k
            train_cost += cost
            train_loss += loss
            train_sparsity_cost += sparsity_cost
            p1 += np.sum(bz * mask) / (np.sum(mask) + 1e-8)

            if (i == N - 1) or (eval_period > 0 and
                    processed / eval_period > (processed - k) / eval_period):
                say("\n")
                say(("Generator Epoch {:.2f} costg={:.4f} scost={:.4f} lossg={:.4f} "
                     + "p[1]={:.2f} |g|={:.4f} {:.4f}\t[{:.2f}m / {:.2f}m]\n").format(
                        epoch + (i + 1.0) / N,
                        train_cost / (i + 1),
                        train_sparsity_cost / (i + 1),
                        train_loss / (i + 1),
                        p1 / (i + 1),
                        float(gl2_g),
                        float(gl2_e),
                        (time.time() - start_time) / 60.0,
                        (time.time() - start_time) / 60.0 / (i + 1) * N))
                say("\t" + str(["{:.1f}".format(np.linalg.norm(x.get_value(borrow=True)))
                                for x in self.encoder.params]) + "\n")
                say("\t" + str(["{:.1f}".format(np.linalg.norm(x.get_value(borrow=True)))
                                for x in self.generator.params]) + "\n")

                if dev:
                    self.dropout.set_value(0.0)
                    dev_obj, dev_loss, dev_diff, dev_p1 = self.evaluate_data(
                        dev_batches_x, dev_batches_y, eval_generator,
                        sampling=True)

                    if dev_obj < best_dev:
                        best_dev = dev_obj
                        unchanged = 0
                        if args.dump and rationale_data:
                            self.dump_rationales(args.dump, valid_batches_x,
                                                 valid_batches_y,
                                                 get_loss_and_pred,
                                                 sample_generator)
                        if args.save_model:
                            self.save_model(args.save_model, args)

                    say(("\tsampling devg={:.4f} mseg={:.4f} avg_diffg={:.4f}"
                         + " p[1]g={:.2f} best_dev={:.4f}\n").format(
                            dev_obj, dev_loss, dev_diff, dev_p1, best_dev))

                    if rationale_data is not None:
                        r_mse, r_p1, r_prec1, r_prec2 = self.evaluate_rationale(
                            rationale_data, valid_batches_x, valid_batches_y,
                            eval_generator)
                        say(("\trationale mser={:.4f} p[1]r={:.2f} prec1={:.4f}"
                             + " prec2={:.4f}\n").format(
                                r_mse, r_p1, r_prec1, r_prec2))

                    self.dropout.set_value(dropout_prob)
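# Note on the rationale model above: self.z is bound to the generator's
# sampled mask through givens={self.z: self.generator.z_pred}, so a single
# train_generator call samples rationales and applies both update sets
# (updates_g for the generator, updates_e for the encoder) in one compiled
# step. The running statistic p1 tracks the fraction of non-padding tokens
# that the sampled mask selects.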
def train(self, train, dev, test):
    args = self.args
    dropout = self.dropout
    padding_id = self.embedding_layer.vocab_map["<padding>"]

    if dev is not None:
        dev_batches_x, dev_batches_y = myio.create_batches(
            dev[0], dev[1], args.batch, padding_id)
    if test is not None:
        test_batches_x, test_batches_y = myio.create_batches(
            test[0], test[1], args.batch, padding_id)

    start_time = time.time()
    train_batches_x, train_batches_y = myio.create_batches(
        train[0], train[1], args.batch, padding_id)
    say("{:.2f}s to create training batches\n\n".format(time.time() - start_time))

    updates_e, lr_e, gnorm_e = create_optimization_updates(
        cost=self.encoder.cost_e,
        params=self.encoder.params,
        method=args.learning,
        lr=args.learning_rate)[:3]

    updates_g, lr_g, gnorm_g = create_optimization_updates(
        cost=self.encoder.cost_g,
        params=self.generator.params,
        method=args.learning,
        lr=args.learning_rate)[:3]

    sample_generator = theano.function(
        inputs=[self.x],
        outputs=self.z)

    get_loss_and_pred = theano.function(
        inputs=[self.x, self.y],
        outputs=[self.encoder.loss_vec, self.encoder.preds, self.z])

    train_generator = theano.function(
        inputs=[self.x, self.y],
        outputs=[self.encoder.obj, self.encoder.loss,
                 self.encoder.sparsity_cost, self.z, gnorm_e, gnorm_g],
        updates=updates_e.items() + updates_g.items())

    eval_func = theano.function(
        inputs=[self.x, self.y],
        outputs=[self.z, self.encoder.obj, self.true_pos, self.tot_pos,
                 self.tot_true])

    eval_period = args.eval_period
    unchanged = 0
    best_dev = 1e+2
    best_dev_e = 1e+2
    last_train_avg_cost = None
    last_dev_avg_cost = None
    tolerance = 0.10 + 1e-3
    dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)

    for epoch in xrange(args.max_epochs):
        unchanged += 1
        if unchanged > 50:
            return

        train_batches_x, train_batches_y = myio.create_batches(
            train[0], train[1], args.batch, padding_id)

        more = True
        if args.decay_lr:
            param_bak = [p.get_value(borrow=False) for p in self.params]

        while more:
            processed = 0
            train_cost = 0.0
            train_loss = 0.0
            train_sparsity_cost = 0.0
            p1 = 0.0
            start_time = time.time()

            N = len(train_batches_x)
            for i in xrange(N):
                if (i + 1) % 100 == 0:
                    say("\r{}/{} {:.2f} ".format(i + 1, N, p1 / (i + 1)))

                bx, by = train_batches_x[i], train_batches_y[i]
                mask = bx != padding_id

                cost, loss, sparsity_cost, bz, gl2_e, gl2_g = train_generator(bx, by)

                k = len(by)
                processed += k
                train_cost += cost
                train_loss += loss
                train_sparsity_cost += sparsity_cost
                p1 += np.sum(bz * mask) / (np.sum(mask) + 1e-8)

            cur_train_avg_cost = train_cost / N

            if dev:
                self.dropout.set_value(0.0)
                dev_obj, dev_prec, dev_recall, dev_f1, dev_p1 = self.evaluate_data(
                    dev_batches_x, dev_batches_y, eval_func)
                self.dropout.set_value(dropout_prob)
                cur_dev_avg_cost = dev_obj

            more = False
            if args.decay_lr and last_train_avg_cost is not None:
                if cur_train_avg_cost > last_train_avg_cost * (1 + tolerance):
                    more = True
                    say("\nTrain cost {} --> {}\n".format(
                        last_train_avg_cost, cur_train_avg_cost))
                if dev and cur_dev_avg_cost > last_dev_avg_cost * (1 + tolerance):
                    more = True
                    say("\nDev cost {} --> {}\n".format(
                        last_dev_avg_cost, cur_dev_avg_cost))

            if more:
                lr_val = lr_g.get_value() * 0.5
                lr_val = np.float64(lr_val).astype(theano.config.floatX)
                lr_g.set_value(lr_val)
                lr_e.set_value(lr_val)
                say("Decrease learning rate to {}\n".format(float(lr_val)))
                for p, v in zip(self.params, param_bak):
                    p.set_value(v)
                continue

        last_train_avg_cost = cur_train_avg_cost
        if dev:
            last_dev_avg_cost = cur_dev_avg_cost

        say("\n")
        say(("Generator Epoch {:.2f} costg={:.4f} scost={:.4f} lossg={:.4f} "
             + "p[1]={:.3f} |g|={:.4f} {:.4f}\t[{:.2f}m / {:.2f}m]\n").format(
                epoch + (i + 1.0) / N,
                train_cost / N,
                train_sparsity_cost / N,
                train_loss / N,
                p1 / N,
                float(gl2_e),
                float(gl2_g),
                (time.time() - start_time) / 60.0,
                (time.time() - start_time) / 60.0 / (i + 1) * N))
        say("\t" + str(["{:.2f}".format(np.linalg.norm(x.get_value(borrow=True)))
                        for x in self.encoder.params]) + "\n")
        say("\t" + str(["{:.2f}".format(np.linalg.norm(x.get_value(borrow=True)))
                        for x in self.generator.params]) + "\n")

        if dev:
            if dev_obj < best_dev:
                best_dev = dev_obj
                unchanged = 0
                if args.dump and test:
                    self.dump_rationales(args.dump, test_batches_x,
                                         test_batches_y, get_loss_and_pred,
                                         sample_generator)

            say(("\tdevg={:.4f} f1g={:.4f} preg={:.4f} recg={:.4f}"
                 + " p[1]g={:.3f} best_dev={:.4f}\n").format(
                    dev_obj, dev_f1, dev_prec, dev_recall, dev_p1, best_dev))

            if test is not None:
                self.dropout.set_value(0.0)
                test_obj, test_prec, test_recall, test_f1, test_p1 = \
                    self.evaluate_data(test_batches_x, test_batches_y, eval_func)
                self.dropout.set_value(dropout_prob)

                say(("\ttestt={:.4f} f1t={:.4f} pret={:.4f} rect={:.4f}"
                     + " p[1]t={:.3f}\n").format(
                        test_obj, test_f1, test_prec, test_recall, test_p1))
def train(self, source_train, target_train, source_ul, target_ul, dev, test):
    args = self.args
    n_domain = 2
    padding_id = self.padding_id

    start_time = time.time()

    if source_train is not None:
        s_train_batches, source_train = io_util.create_batches(
                source_train, args.batch, padding_id)
        for b in s_train_batches:
            b.append(self.get_domain_ids(domain_id=0, n_domain=n_domain,
                                         batch=len(b[1])))

    if target_train is not None:
        t_train_batches, target_train = io_util.create_batches(
                target_train, args.batch, padding_id)
        for b in t_train_batches:
            b.append(self.get_domain_ids(domain_id=1, n_domain=n_domain,
                                         batch=len(b[1])))

    if dev is not None:
        dev_batches, dev = io_util.create_batches(
                dev, args.batch, padding_id
            )
        for b in dev_batches:
            b.append(self.get_domain_ids(domain_id=0, n_domain=n_domain,
                                         batch=len(b[1])))
        tot = 0
        for b in dev_batches:
            tot += len(b[0].T)
        print "dev size:", tot, len(dev)

    if test is not None:
        test_batches, test = io_util.create_batches(
                test, args.batch, padding_id
            )
        for b in test_batches:
            b.append(self.get_domain_ids(domain_id=1, n_domain=n_domain,
                                         batch=len(b[1])))
        tot = 0
        for b in test_batches:
            tot += len(b[0].T)
        print "test size:", tot, len(test)

    print 'load source unlabeled data'
    s_ul_batches, source_ul = io_util.create_batches(
            source_ul, args.batch, padding_id, label=False)
    for b in s_ul_batches:
        b.append(self.get_domain_ids(domain_id=0, n_domain=n_domain,
                                     batch=len(b[1])))

    print 'load target unlabeled data'
    t_ul_batches, target_ul = io_util.create_batches(
            target_ul, args.batch, padding_id, label=False)
    for b in t_ul_batches:
        b.append(self.get_domain_ids(domain_id=1, n_domain=n_domain,
                                     batch=len(b[1])))

    say("{:.2f}s to create training batches\n\n".format(
            time.time()-start_time
        ))

    # the domain discriminator and the rest of the model are updated by
    # two separate optimizers
    dom_updates, dom_lr, dom_gnorm = create_optimization_updates(
            cost = self.dom_cost,
            params = self.dom_params,
            method = args.learning,
            lr = args.learning_rate,
            gsums = self.dom_accums[0],
            xsums = self.dom_accums[1],
        )[:3]
    other_updates, other_lr, other_gnorm = create_optimization_updates(
            cost = self.other_cost_except_dom,
            params = self.other_params_except_dom,
            method = args.learning,
            lr = args.learning_rate,
            gsums = self.other_accums_except_dom[0],
            xsums = self.other_accums_except_dom[1],
        )[:3]
    BNupdates = self.cnn_layer.get_updates()

    train_func = theano.function(
            inputs = [ self.s_idxs, self.t_idxs, self.s_idys, self.t_idys,
                       self.s_gold_rels, self.t_gold_rels,
                       self.s_dom_ids, self.t_dom_ids,
                       self.s_has_lab, self.t_has_lab ],
            outputs = [ self.dom_cost, self.other_cost_except_dom,
                        dom_gnorm, other_gnorm,
                        self.s_lab_loss, self.t_lab_loss,
                        self.s_rel_loss, self.t_rel_loss,
                        self.s_dom_loss, self.t_dom_loss,
                        self.s_adv_loss, self.t_adv_loss, self.trans_reg,
                        self.s_recon_loss, self.t_recon_loss ],
            updates = dom_updates.items() + other_updates.items() + BNupdates,
        )

    s_get_loss_and_pred = theano.function(
            inputs = [ self.s_idxs, self.s_idys, self.s_gold_rels, self.s_dom_ids ],
            outputs = [ self.s_lab_prob, self.s_lab_loss, self.s_rel_loss,
                        self.s_dom_loss, self.s_adv_loss, self.s_recon_loss ]
        )
    t_get_loss_and_pred = theano.function(
            inputs = [ self.t_idxs, self.t_idys, self.t_gold_rels, self.t_dom_ids ],
            outputs = [ self.t_lab_prob, self.t_lab_loss, self.t_rel_loss,
                        self.t_dom_loss, self.t_adv_loss, self.t_recon_loss ]
        )

    unchanged = 0
    best_dev = 0
    dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)

    s_ul_batch_ptr = 0
    t_ul_batch_ptr = 0
    s_train_ptr = 0
    t_train_ptr = 0
    test_ptr = 0

    print 'Training'
    say("\t"+str([ "{:.1f}".format(np.linalg.norm(x.get_value(borrow=True))) \
                    for x in self.params ])+"\n")

    for epoch in xrange(args.epochs):
        unchanged += 1
        if unchanged > 100: break

        s_avg_lab_loss, s_avg_rel_loss, s_avg_dom_loss, s_avg_adv_loss, \
            s_avg_recon_loss = 0.0, 0.0, 0.0, 0.0, 0.0
        t_avg_lab_loss, t_avg_rel_loss, t_avg_dom_loss, t_avg_adv_loss, \
            t_avg_recon_loss = 0.0, 0.0, 0.0, 0.0, 0.0
        avg_dom_cost, avg_other_cost, dom_g, other_g, avg_trans_reg = \
            0.0, 0.0, 0.0, 0.0, 0.0

        start_time = time.time()

        source_k = self.source_k
        if source_train is not None:
            N = len(s_train_batches) * source_k
        else:
            raise Exception("no source training data?")
        N_s_ul = len(s_ul_batches)
        N_t_ul = len(t_ul_batches)
        n_s_lab, n_t_lab, n_s_ul, n_t_ul = 0, 0, 0, 0

        for t in xrange(N):
            progress = epoch + (t+0.0)/N
            # ramp the adversarial weight rho up from 0 toward args.rho
            rho_t = 2.0 / (1.0 + np.exp(-0.5*progress)) - 1.0
            rho_t = np.float64(rho_t * args.rho).astype(theano.config.floatX)
            self.rho.set_value(rho_t)
            # anneal the learning rate polynomially as training progresses
            lr_t = args.learning_rate / (1.0 + 0.5*progress) ** 0.75
            lr_t = np.float64(lr_t).astype(theano.config.floatX)
            other_lr.set_value(lr_t)

            # every source_k-th step uses a labeled source batch; the rest
            # use unlabeled source batches
            s_task = t % source_k
            if s_task == 0 and source_train is not None:
                s_bx, s_by, s_brel, s_bid = s_train_batches[s_train_ptr]
                s_has_lab = 1
                s_train_ptr = (s_train_ptr+1) % len(s_train_batches)
                n_s_lab += 1
            else:
                s_bx, s_by, s_brel, s_bid = s_ul_batches[s_ul_batch_ptr]
                s_has_lab = 0
                s_ul_batch_ptr = (s_ul_batch_ptr+1) % N_s_ul
                n_s_ul += 1

            # the target side always draws unlabeled batches here
            t_bx, t_by, t_brel, t_bid = t_ul_batches[t_ul_batch_ptr]
            t_has_lab = 0
            t_ul_batch_ptr = (t_ul_batch_ptr+1) % N_t_ul
            n_t_ul += 1

            dom_cost, other_cost, dom_g, other_g, \
                s_lab_loss, t_lab_loss, s_rel_loss, t_rel_loss, \
                s_dom_loss, t_dom_loss, s_adv_loss, t_adv_loss, trans_reg, \
                s_recon_loss, t_recon_loss = train_func(
                    s_bx, t_bx, s_by, t_by, s_brel, t_brel,
                    s_bid, t_bid, s_has_lab, t_has_lab)

            avg_dom_cost += dom_cost
            avg_other_cost += other_cost
            avg_trans_reg += trans_reg
            if s_has_lab: s_avg_lab_loss += s_lab_loss
            if t_has_lab: t_avg_lab_loss += t_lab_loss
            s_avg_rel_loss += s_rel_loss
            t_avg_rel_loss += t_rel_loss
            s_avg_dom_loss += s_dom_loss
            t_avg_dom_loss += t_dom_loss
            s_avg_adv_loss += s_adv_loss
            t_avg_adv_loss += t_adv_loss
            s_avg_recon_loss += s_recon_loss
            t_avg_recon_loss += t_recon_loss

            say("\r{}/{}/{} {}/{}/{} {}/{}/{} {}/{}/{}/{} ".format(
                    n_s_lab, s_train_ptr, N,
                    n_t_lab, t_train_ptr, N,
                    n_s_ul, s_ul_batch_ptr, N_s_ul,
                    n_t_ul, t_ul_batch_ptr, test_ptr, N_t_ul))

        say("Epoch {:.2f} [{:.2f}m]\n".format(
                epoch,
                (time.time()-start_time)/60.0,
            ))
        say("Source:\t")
        if source_train is not None:
            say("lab_loss={:.4f} ".format(s_avg_lab_loss / n_s_lab))
        say("rel_loss={:.4f} dom_loss={:.4f} adv_loss={:.4f} recon_loss={:.4f}\n".format(
                s_avg_rel_loss / N,
                s_avg_dom_loss / N,
                s_avg_adv_loss / N,
                s_avg_recon_loss / N,
            ))
        say("Target:\t")
        if target_train is not None:
            say("lab_loss={:.4f} ".format(t_avg_lab_loss / n_t_lab))
        say("rel_loss={:.4f} dom_loss={:.4f} adv_loss={:.4f} recon_loss={:.4f}\n".format(
                t_avg_rel_loss / N,
                t_avg_dom_loss / N,
                t_avg_adv_loss / N,
                t_avg_recon_loss / N,
            ))
        say("Domain cost={:.4f} |g|={:.4f} Other cost={:.4f} |g|={:.4f} trans_reg={:.4f}\n".format(
                avg_dom_cost / N, float(dom_g),
                avg_other_cost / N, float(other_g),
                avg_trans_reg / N,
            ))
        say("\t"+str([ "{:.1f}".format(np.linalg.norm(x.get_value(borrow=True))) \
                        for x in self.params ])+"\n")

        if dev:
            self.dropout.set_value(0.0)
            self.cnn_layer.set_runmode(1)
            dev_lab_loss, dev_rel_loss, dev_dom_loss, dev_adv_loss, \
                dev_recon_loss, dev_acc, dev_f1 = self.evaluate_data(
                    dev_batches, s_get_loss_and_pred)
            self.dropout.set_value(dropout_prob)
            self.cnn_layer.set_runmode(0)
            if dev_acc > best_dev:
                best_dev = dev_acc
                unchanged = 0
            say(("\tdev_lab_loss={:.4f} dev_rel_loss={:.4f} dom_loss={:.4f}"
                 " adv_loss={:.4f} recon_loss={:.4f} dev_acc={:.4f} dev_f1={}" +
                 " best_dev={:.4f}\n").format(
                    dev_lab_loss, dev_rel_loss, dev_dom_loss, dev_adv_loss,
                    dev_recon_loss, dev_acc,
                    " ".join(['{:.4f}'.format(x) for x in dev_f1]),
                    best_dev,
                ))

        if test:
            self.dropout.set_value(0.0)
            self.cnn_layer.set_runmode(1)
            test_lab_loss, test_rel_loss, test_dom_loss, test_adv_loss, \
                test_recon_loss, test_acc, test_f1 = self.evaluate_data(
                    test_batches, t_get_loss_and_pred)
            self.dropout.set_value(dropout_prob)
            self.cnn_layer.set_runmode(0)
            say(("\ttest_lab_loss={:.4f} test_rel_loss={:.4f} dom_loss={:.4f}"
                 " adv_loss={:.4f} recon_loss={:.4f} test_acc={:.4f} test_f1={}\n").format(
                    test_lab_loss, test_rel_loss, test_dom_loss, test_adv_loss,
                    test_recon_loss, test_acc,
                    " ".join(['{:.4f}'.format(x) for x in test_f1]),
                ))
def train(self, ids_corpus, train, dev=None, test=None):
    args = self.args
    dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)
    batch_size = args.batch_size
    padding_id = self.padding_id

    #train_batches = myio.create_batches(ids_corpus, train, batch_size, padding_id)

    updates, lr, gnorm = create_optimization_updates(
            cost=self.cost,
            params=self.params,
            lr=args.learning_rate,
            method=args.learning)[:3]

    train_func = theano.function(
            inputs=[self.idts, self.idbs, self.idps],
            outputs=[self.cost, self.loss, gnorm],
            updates=updates)
    eval_func = theano.function(
            inputs=[self.idts, self.idbs],
            outputs=self.scores,
            on_unused_input='ignore')

    say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

    result_table = PrettyTable(
            ["Epoch", "dev MAP", "dev MRR", "dev P@1", "dev P@5"] +
            ["tst MAP", "tst MRR", "tst P@1", "tst P@5"])

    unchanged = 0
    best_dev = -1
    dev_MAP = dev_MRR = dev_P1 = dev_P5 = 0
    test_MAP = test_MRR = test_P1 = test_P5 = 0
    start_time = 0
    max_epoch = args.max_epoch
    for epoch in xrange(max_epoch):
        unchanged += 1
        if unchanged > 15: break

        start_time = time.time()

        # re-read and re-batch the annotations each epoch
        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(ids_corpus, train, batch_size,
                                            padding_id, pad_left=not args.average)
        N = len(train_batches)

        train_loss = 0.0
        train_cost = 0.0

        for i in xrange(N):
            # get current batch
            idts, idbs, idps = train_batches[i]

            cur_cost, cur_loss, grad_norm = train_func(idts, idbs, idps)
            train_loss += cur_loss
            train_cost += cur_cost

            if i % 10 == 0:
                say("\r{}/{}".format(i, N))

            if i == N-1:
                # evaluate with dropout disabled
                self.dropout.set_value(0.0)

                if dev is not None:
                    dev_MAP, dev_MRR, dev_P1, dev_P5 = self.evaluate(
                            dev, eval_func)
                if test is not None:
                    test_MAP, test_MRR, test_P1, test_P5 = self.evaluate(
                            test, eval_func)

                if dev_MRR > best_dev:
                    unchanged = 0
                    best_dev = dev_MRR
                    result_table.add_row(
                            [ epoch ] +
                            [ "%.2f" % x for x in
                                [ dev_MAP, dev_MRR, dev_P1, dev_P5 ] +
                                [ test_MAP, test_MRR, test_P1, test_P5 ] ])
                    if args.save_model:
                        self.save_model(args.save_model)

                dropout_p = np.float64(args.dropout).astype(
                        theano.config.floatX)
                self.dropout.set_value(dropout_p)

                say("\r\n\n")
                say(("Epoch {}\tcost={:.3f}\tloss={:.3f}" \
                    +"\tMRR={:.2f},{:.2f}\t|g|={:.3f}\t[{:.3f}m]\n").format(
                        epoch,
                        train_cost / (i+1),
                        train_loss / (i+1),
                        dev_MRR,
                        best_dev,
                        float(grad_norm),
                        (time.time()-start_time)/60.0
                    ))
                say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

        say("\n")
        say("{}".format(result_table))
        say("\n")
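# The evaluation above reports standard ranking metrics (MAP, MRR, P@k).
# A minimal self-contained sketch of MRR and P@k under their usual
# definitions, assuming each query's candidates come as a binary
# relevance list already sorted by model score -- illustrative helpers,
# not the repo's actual evaluation code:
def mrr(ranked_labels_per_query):
    # mean reciprocal rank of the first relevant candidate
    total = 0.0
    for labels in ranked_labels_per_query:
        rr = 0.0
        for rank, l in enumerate(labels):
            if l == 1:
                rr = 1.0 / (rank + 1)
                break
        total += rr
    return total / len(ranked_labels_per_query)

def precision_at(ranked_labels_per_query, k):
    # fraction of relevant candidates among the top k, averaged over queries
    total = 0.0
    for labels in ranked_labels_per_query:
        total += sum(labels[:k]) / float(k)
    return total / len(ranked_labels_per_query)

# two toy queries: relevant item ranked first, then third
print mrr([[1, 0, 0], [0, 0, 1]])               # (1 + 1/3) / 2 = 0.667
print precision_at([[1, 0, 0], [0, 0, 1]], 1)   # (1 + 0) / 2 = 0.5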
def train(self, ids_corpus, train, dev=None, test=None):
    args = self.args
    dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)
    batch_size = args.batch_size
    padding_id = self.padding_id

    #train_batches = myio.create_batches(ids_corpus, train, batch_size, padding_id)

    if dev is not None:
        dev, dev_raw = dev
    if test is not None:
        test, test_raw = test

    if args.joint:
        updates_e, lr_e, gnorm_e = create_optimization_updates(
                cost = self.encoder.cost_e,  #self.encoder.cost,
                params = self.encoder.params,
                lr = args.learning_rate*0.1,
                method = args.learning
            )[:3]
    else:
        updates_e = {}

    updates_g, lr_g, gnorm_g = create_optimization_updates(
            cost = self.encoder.cost_g,
            params = self.generator.params,
            lr = args.learning_rate,
            method = args.learning
        )[:3]

    train_func = theano.function(
            inputs = [ self.x, self.triples, self.pairs ],
            outputs = [ self.encoder.obj, self.encoder.loss, \
                        self.encoder.sparsity_cost, self.generator.p1, gnorm_g ],
            updates = updates_g.items() + updates_e.items() +
                      self.generator.sample_updates,
            #no_default_updates = True,
            on_unused_input = "ignore"
        )

    eval_func = theano.function(
            inputs = [ self.x ],
            outputs = self.encoder.scores
        )
    eval_func2 = theano.function(
            inputs = [ self.x ],
            outputs = [ self.encoder.scores_z, self.generator.p1, self.z ],
            updates = self.generator.sample_updates,
            #no_default_updates = True
        )

    say("\tp_norm: {}\n".format(self.get_pnorm_stat(self.encoder.params)))
    say("\tp_norm: {}\n".format(self.get_pnorm_stat(self.generator.params)))

    result_table = PrettyTable(
            ["Epoch", "dev MAP", "dev MRR", "dev P@1", "dev P@5"] +
            ["tst MAP", "tst MRR", "tst P@1", "tst P@5"])

    last_train_avg_cost = None
    tolerance = 0.5 + 1e-3
    unchanged = 0
    best_dev = -1
    dev_MAP = dev_MRR = dev_P1 = dev_P5 = 0
    test_MAP = test_MRR = test_P1 = test_P5 = 0
    start_time = 0
    max_epoch = args.max_epoch
    for epoch in xrange(max_epoch):
        unchanged += 1
        if unchanged > 20: break

        start_time = time.time()

        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(ids_corpus, train, batch_size,
                                            padding_id,
                                            pad_left=not args.average,
                                            merge=args.merge)
        N = len(train_batches)

        more = True
        # snapshot so the epoch can be retried at a smaller learning rate
        param_bak = [ p.get_value(borrow=False) for p in self.params ]

        while more:
            train_loss = 0.0
            train_cost = 0.0
            train_scost = 0.0
            train_p1 = 0.0

            for i in xrange(N):
                # get current batch
                idts, triples, pairs = train_batches[i]

                cur_cost, cur_loss, cur_scost, cur_p1, gnormg = train_func(
                        idts, triples, pairs)
                train_loss += cur_loss
                train_cost += cur_cost
                train_scost += cur_scost
                train_p1 += cur_p1

                if i % 10 == 0:
                    say("\r{}/{} {:.3f}".format(i, N, train_p1/(i+1)))

            cur_train_avg_cost = train_cost / N
            more = False
            if last_train_avg_cost is not None:
                if cur_train_avg_cost > last_train_avg_cost*(1+tolerance):
                    more = True
                    say("\nTrain cost {} --> {}\n".format(
                            last_train_avg_cost, cur_train_avg_cost
                        ))

            if more:
                lr_val = lr_g.get_value()*0.5
                if lr_val < 1e-5: return
                lr_val = np.float64(lr_val).astype(theano.config.floatX)
                lr_g.set_value(lr_val)
                if args.joint:
                    # lr_e exists only when the encoder is jointly optimized
                    lr_e.set_value(lr_val)
                say("Decrease learning rate to {}\n".format(float(lr_val)))
                for p, v in zip(self.params, param_bak):
                    p.set_value(v)
                continue

            last_train_avg_cost = cur_train_avg_cost

            say("\r\n\n")
            say(("Epoch {} cost={:.3f} loss={:.3f} scost={:.3f}" \
                +" P[1]={:.3f} |g|={:.3f}\t[{:.3f}m]\n").format(
                    epoch,
                    train_cost / N,
                    train_loss / N,
                    train_scost / N,
                    train_p1 / N,
                    float(gnormg),
                    (time.time()-start_time)/60.0
                ))
            say("\tp_norm: {}\n".format(self.get_pnorm_stat(self.encoder.params)))
            say("\tp_norm: {}\n".format(self.get_pnorm_stat(self.generator.params)))

            self.dropout.set_value(0.0)

            if dev is not None:
                full_MAP, full_MRR, full_P1, full_P5 = self.evaluate(dev, eval_func)
                dev_MAP, dev_MRR, dev_P1, dev_P5, dev_PZ1, dev_PT = \
                    self.evaluate_z(dev, dev_raw, ids_corpus, eval_func2)
            if test is not None:
                test_MAP, test_MRR, test_P1, test_P5, test_PZ1, test_PT = \
                    self.evaluate_z(test, test_raw, ids_corpus, eval_func2)

            if dev_MAP > best_dev:
                best_dev = dev_MAP
                unchanged = 0

            say("\n")
            say("  fMAP={:.2f} fMRR={:.2f} fP1={:.2f} fP5={:.2f}\n".format(
                    full_MAP, full_MRR, full_P1, full_P5
                ))
            say("\n")
            say(("  dMAP={:.2f} dMRR={:.2f} dP1={:.2f} dP5={:.2f}" +
                 " dP[1]={:.3f} d%T={:.3f} best_dev={:.2f}\n").format(
                    dev_MAP, dev_MRR, dev_P1, dev_P5, dev_PZ1, dev_PT, best_dev
                ))

            result_table.add_row(
                    [ epoch ] +
                    [ "%.2f" % x for x in
                        [ dev_MAP, dev_MRR, dev_P1, dev_P5 ] +
                        [ test_MAP, test_MRR, test_P1, test_P5 ] ])

            if unchanged == 0:
                say("\n")
                say(("  tMAP={:.2f} tMRR={:.2f} tP1={:.2f} tP5={:.2f}" +
                     " tP[1]={:.3f} t%T={:.3f}\n").format(
                        test_MAP, test_MRR, test_P1, test_P5, test_PZ1, test_PT
                    ))
                if args.dump_rationale:
                    self.evaluate_z(dev+test, dev_raw+test_raw, ids_corpus,
                                    eval_func2, args.dump_rationale)

                #if args.save_model:
                #    self.save_model(args.save_model)

            dropout_p = np.float64(args.dropout).astype(
                    theano.config.floatX)
            self.dropout.set_value(dropout_p)

            say("\n")
            say("{}".format(result_table))
            say("\n")

        # stop if the generator collapses to selecting (almost) none or
        # all of the words
        if train_p1/N <= 1e-4 or train_p1/N+1e-4 >= 1.0:
            break
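# Every loop above reports progress through a say() helper rather than
# print, so that carriage-return updates like "\r{}/{}" overwrite the
# same terminal line. A minimal implementation consistent with that
# usage -- a guess at the helper, not necessarily the repo's exact code:
import sys

def say(s, stream=sys.stdout):
    # write without an implicit newline and flush immediately so
    # progress lines appear in real time
    stream.write(s)
    stream.flush()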