def save_model(self, path):
    args = self.args
    lst_params = []
    for i in range(args.depth):
        lst_params.append(self.layers[i*2].params)
    with gzip.open(path, "w") as fout:
        pickle.dump(
            {
                "d": args.hidden_dim,
                "layer_type": args.layer,
                "args": args,
                "params": lst_params
            },
            fout,
            protocol=pickle.HIGHEST_PROTOCOL
        )
    say("\tmodel saved.\n")
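# For reference, a minimal sketch of the matching load step, assuming only the
# dict layout written by save_model above (keys "d", "layer_type", "args",
# "params"); the repo's own load_model/set_model may differ.
import gzip
import pickle  # cPickle on Python 2

def load_model_sketch(path):
    with gzip.open(path, "rb") as fin:
        data = pickle.load(fin)
    # data["params"] holds one list of layer parameters per depth level,
    # in the same order save_model wrote them
    return data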
def __init__(self, model_path, corpus_path, emb_path):
    raw_corpus = myio.read_corpus(corpus_path)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=10,
        cut_off=1,
        embs=load_embedding_iterator(emb_path)
    )
    weights = myio.create_idf_weights(corpus_path, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(
        embedding_layer.n_V, len(raw_corpus)
    ))

    model = Model(args=None, embedding_layer=embedding_layer, weights=weights)
    model_data = model.load_model(model_path)
    model.set_model(model_data)
    model.dropout.set_value(0.0)  # disable dropout at inference time
    say("model initialized\n")

    # compile a Theano function that maps (title, body) id matrices to
    # similarity scores of the candidates against the query
    score_func = theano.function(
        inputs=[model.idts, model.idbs],
        outputs=model.scores,
        on_unused_input='ignore'
    )
    self.model = model
    self.score_func = score_func
    say("scoring function compiled\n")
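# A hypothetical usage sketch of the scoring API above; the QRAPI wrapper
# name, the file paths, and the toy shapes are illustrative only. Batches are
# time-major word-id matrices of shape (seq_len, batch), where batch index 0
# is the query and the remaining columns are candidates.
#
#   import numpy as np
#
#   api = QRAPI("model.pkl.gz", "corpus.txt.gz", "vectors.txt.gz")
#   idts = np.zeros((20, 5), dtype="int32")   # titles: query + 4 candidates
#   idbs = np.zeros((50, 5), dtype="int32")   # bodies: same layout
#   scores = api.score_func(idts, idbs)       # one score per candidate
#   ranking = scores.argsort()[::-1]          # candidate order, best first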
def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=args.hidden_dim,
        embs=load_embedding_iterator(args.embeddings) if args.embeddings else None
    )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer,
                                 max_len=args.max_seq_len)
    say("vocab size={}, corpus size={}\n".format(
        embedding_layer.n_V, len(raw_corpus)
    ))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev_raw = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev_raw, padding_id,
                                       pad_left=not args.average,
                                       merge=args.merge)
    if args.test:
        test_raw = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test_raw, padding_id,
                                        pad_left=not args.average,
                                        merge=args.merge)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                                            padding_id,
                                            pad_left=not args.average,
                                            merge=args.merge)
        say("{} to create batches\n".format(time.time() - start_time))
        say("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(x[0].ravel()) for x in train_batches),
            sum(len(x[1].ravel()) for x in train_batches)
        ))
        # the batches above were only created for the statistics;
        # model.train() re-creates them every epoch
        train_batches = None

        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)
        model.ready()

        # set parameters using pre-trained network
        if args.load_pretrain:
            model.encoder.load_pretrained_parameters(args)

        model.train(
            ids_corpus, train,
            (dev, dev_raw) if args.dev else None,
            (test, test_raw) if args.test else None
        )
def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=args.hidden_dim,
        cut_off=args.cut_off,
        embs=load_embedding_iterator(args.embeddings) if args.embeddings else None
    )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(
        embedding_layer.n_V, len(raw_corpus)
    ))
    padding_id = embedding_layer.vocab_map["<padding>"]
    bos_id = embedding_layer.vocab_map["<s>"]
    eos_id = embedding_layer.vocab_map["</s>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev = myio.read_annotations(args.dev, K_neg=20, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=20, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id)

    if args.heldout:
        with open(args.heldout) as fin:
            heldout_ids = fin.read().split()
        heldout_corpus = dict((id, ids_corpus[id]) for id in heldout_ids
                              if id in ids_corpus)
        train_corpus = dict((id, ids_corpus[id]) for id in ids_corpus
                            if id not in heldout_corpus)
        heldout = myio.create_batches(heldout_corpus, [], args.batch_size,
                                      padding_id, bos_id, eos_id,
                                      auto_encode=True)
        # keep only (body, title) input/output pairs for perplexity evaluation
        heldout = [myio.create_one_batch(b1, t2, padding_id)
                   for t1, b1, t2 in heldout]
        say("heldout examples={}\n".format(len(heldout_corpus)))

    if args.train:
        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)

        start_time = time.time()
        train = myio.read_annotations(args.train)
        if not args.use_anno:
            train = []
        train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                                            model.padding_id, model.bos_id,
                                            model.eos_id, auto_encode=True)
        say("{} to create batches\n".format(time.time() - start_time))

        model.ready()
        model.train(
            ids_corpus if not args.heldout else train_corpus,
            train,
            dev if args.dev else None,
            test if args.test else None,
            heldout if args.heldout else None
        )
def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    print("raw corpus:", args.corpus, "len:", len(raw_corpus))
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=args.hidden_dim,
        cut_off=args.cut_off,
        embs=None
        # embs=load_embedding_iterator(args.embeddings) if args.embeddings else None
    )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer,
                                 max_len=args.max_seq_len)
    myio.say("vocab size={}, corpus size={}\n".format(
        embedding_layer.n_V, len(raw_corpus)
    ))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    # if args.dev:
    #     dev = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
    #     dev = myio.create_eval_batches(ids_corpus, dev, padding_id,
    #                                    pad_left=not args.average)
    # if args.test:
    #     test = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
    #     test = myio.create_eval_batches(ids_corpus, test, padding_id,
    #                                     pad_left=not args.average)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(args.train)
        print("training data:", args.train, "len:", len(train))
        train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                                            padding_id,
                                            pad_left=not args.average)
        myio.say("{:.2f} secs to create {} batches of size {}\n".format(
            (time.time() - start_time), len(train_batches), args.batch_size))
        myio.say("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(x[0].ravel()) + len(x[1].ravel()) for x in train_batches),
            sum(len(x[2].ravel()) for x in train_batches)
        ))
        # train_batches = None

        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)
        model.ready()

        # # set parameters using pre-trained network
        # if args.load_pretrain:
        #     model.load_pretrained_parameters(args)

        model.train(
            ids_corpus, train,
            dev=None,   # dev if args.dev else None,
            test=None   # test if args.test else None
        )
def __init__(self, model_path, corpus_path, emb_path, session, layer='lstm'):
    raw_corpus = myio.read_corpus(corpus_path)
    # note: no `args` object is available in this scope, so the constructor
    # arguments (emb_path, layer) are used instead
    embedding_layer = create_embedding_layer(
        n_d=10,
        embs=load_embedding_iterator(emb_path),
        only_words=False
    )
    # weights = myio.create_idf_weights(corpus_path, embedding_layer)  # todo
    say("vocab size={}, corpus size={}\n".format(
        embedding_layer.n_V, len(raw_corpus)
    ))

    if layer.lower() == "lstm":
        from models import LstmQR as Model
    elif layer.lower() in ["bilstm", "bigru"]:
        from models import BiRNNQR as Model
    elif layer.lower() == "cnn":
        from models import CnnQR as Model
    elif layer.lower() == "gru":
        from models import GruQR as Model

    model = Model(args={"layer": layer}, embedding_layer=embedding_layer,
                  weights=None)
    model.load_n_set_model(model_path, session)
    say("model initialized\n")

    self.model = model

    def score_func(titles, bodies, cur_sess):
        # the placeholders are batch-major, so the time-major batches
        # are transposed before feeding
        feed_dict = {
            self.model.titles_words_ids_placeholder: titles.T,
            self.model.bodies_words_ids_placeholder: bodies.T,
            self.model.dropout_prob: 0.,
        }
        _scores = cur_sess.run(self.model.scores, feed_dict)
        return _scores

    self.score_func = score_func
    say("scoring function compiled\n")
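# The .T transposes above exist because myio produces time-major batches of
# shape (seq_len, batch), while the TF placeholders are presumably batch-major;
# a self-contained numpy illustration:
import numpy as np

_titles = np.arange(6, dtype="int32").reshape((3, 2))
assert _titles.shape == (3, 2)    # (seq_len, batch) -- as myio builds it
assert _titles.T.shape == (2, 3)  # (batch, seq_len) -- as the model expects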
def train(self, ids_corpus, train, dev=None, test=None):
    args = self.args
    dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)
    batch_size = args.batch_size
    padding_id = self.padding_id
    #train_batches = myio.create_batches(ids_corpus, train, batch_size, padding_id)

    updates, lr, gnorm = create_optimization_updates(
        cost=self.cost,
        params=self.params,
        lr=args.learning_rate,
        method=args.learning
    )[:3]

    train_func = theano.function(
        inputs=[self.idts, self.idbs, self.idps],
        outputs=[self.cost, self.loss, gnorm],
        updates=updates
    )
    eval_func = theano.function(
        inputs=[self.idts, self.idbs],
        outputs=self.scores,
        on_unused_input='ignore'
    )

    say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

    result_table = PrettyTable(
        ["Epoch", "dev MAP", "dev MRR", "dev P@1", "dev P@5"] +
        ["tst MAP", "tst MRR", "tst P@1", "tst P@5"]
    )

    unchanged = 0
    best_dev = -1
    dev_MAP = dev_MRR = dev_P1 = dev_P5 = 0
    test_MAP = test_MRR = test_P1 = test_P5 = 0
    start_time = 0
    max_epoch = args.max_epoch

    for epoch in xrange(max_epoch):
        unchanged += 1
        if unchanged > 15:  # early stopping patience
            break

        start_time = time.time()

        # re-sample annotations and batches each epoch
        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(ids_corpus, train, batch_size,
                                            padding_id,
                                            pad_left=not args.average)
        N = len(train_batches)

        train_loss = 0.0
        train_cost = 0.0

        for i in xrange(N):
            # get current batch
            idts, idbs, idps = train_batches[i]

            cur_cost, cur_loss, grad_norm = train_func(idts, idbs, idps)
            train_loss += cur_loss
            train_cost += cur_cost

            if i % 10 == 0:
                say("\r{}/{}".format(i, N))

            if i == N-1:
                # evaluate with dropout disabled
                self.dropout.set_value(0.0)

                if dev is not None:
                    dev_MAP, dev_MRR, dev_P1, dev_P5 = self.evaluate(dev, eval_func)
                if test is not None:
                    test_MAP, test_MRR, test_P1, test_P5 = self.evaluate(test, eval_func)

                if dev_MRR > best_dev:
                    unchanged = 0
                    best_dev = dev_MRR
                    result_table.add_row(
                        [epoch] +
                        ["%.2f" % x for x in
                         [dev_MAP, dev_MRR, dev_P1, dev_P5] +
                         [test_MAP, test_MRR, test_P1, test_P5]]
                    )
                    if args.save_model:
                        self.save_model(args.save_model)

                dropout_p = np.float64(args.dropout).astype(theano.config.floatX)
                self.dropout.set_value(dropout_p)

                say("\r\n\n")
                say(("Epoch {}\tcost={:.3f}\tloss={:.3f}"
                     "\tMRR={:.2f},{:.2f}\t|g|={:.3f}\t[{:.3f}m]\n").format(
                        epoch,
                        train_cost / (i+1),
                        train_loss / (i+1),
                        dev_MRR,
                        best_dev,
                        float(grad_norm),
                        (time.time()-start_time)/60.0
                ))
                say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

        say("\n")
        say("{}".format(result_table))
        say("\n")
def ready(self):
    args = self.args
    weights = self.weights

    # len(source) * batch
    idxs = self.idxs = T.imatrix()
    # len(target) * batch
    idys = self.idys = T.imatrix()
    idts = idys[:-1]  # target inputs:  all but the last token
    idgs = idys[1:]   # target outputs: all but the first token

    dropout = self.dropout = theano.shared(
        np.float64(args.dropout).astype(theano.config.floatX)
    )

    embedding_layer = self.embedding_layer
    activation = get_activation_by_name(args.activation)

    n_d = self.n_d = args.hidden_dim
    n_e = self.n_e = embedding_layer.n_d
    n_V = self.n_V = embedding_layer.n_V

    if args.layer.lower() == "rcnn":
        LayerType = RCNN
    elif args.layer.lower() == "lstm":
        LayerType = LSTM
    elif args.layer.lower() == "gru":
        LayerType = GRU

    depth = self.depth = args.depth
    layers = self.layers = []
    # two layers per depth level: even indices encode the source,
    # odd indices decode the target; only the first pair (i//2 == 0)
    # reads word embeddings
    for i in range(depth*2):
        if LayerType != RCNN:
            feature_layer = LayerType(
                n_in=n_e if i//2 == 0 else n_d,
                n_out=n_d,
                activation=activation
            )
        else:
            feature_layer = LayerType(
                n_in=n_e if i//2 == 0 else n_d,
                n_out=n_d,
                activation=activation,
                order=args.order,
                mode=args.mode,
                has_outgate=args.outgate
            )
        layers.append(feature_layer)

    self.output_layer = output_layer = Layer(
        n_in=n_d,
        n_out=n_V,
        activation=T.nnet.softmax,
    )

    # feature computation starts here

    # (len*batch)*n_e
    xs_flat = embedding_layer.forward(idxs.ravel())
    xs_flat = apply_dropout(xs_flat, dropout)
    if weights is not None:
        xs_w = weights[idxs.ravel()].dimshuffle((0,'x'))
        xs_flat = xs_flat * xs_w
    # len*batch*n_e
    xs = xs_flat.reshape((idxs.shape[0], idxs.shape[1], n_e))

    # (len*batch)*n_e
    xt_flat = embedding_layer.forward(idts.ravel())
    xt_flat = apply_dropout(xt_flat, dropout)
    if weights is not None:
        xt_w = weights[idts.ravel()].dimshuffle((0,'x'))
        xt_flat = xt_flat * xt_w
    # len*batch*n_e
    xt = xt_flat.reshape((idts.shape[0], idts.shape[1], n_e))

    prev_hs = xs
    prev_ht = xt
    for i in range(depth):
        # len*batch*n_d
        hs = layers[i*2].forward_all(prev_hs, return_c=True)
        # the decoder layer is conditioned on the encoder's last state
        ht = layers[i*2+1].forward_all(prev_ht, hs[-1])
        hs = hs[:,:,-n_d:]
        ht = ht[:,:,-n_d:]
        prev_hs = hs
        prev_ht = ht
        prev_hs = apply_dropout(hs, dropout)
        prev_ht = apply_dropout(ht, dropout)

    self.p_y_given_x = output_layer.forward(
        prev_ht.reshape((xt_flat.shape[0], n_d))
    )

    h_final = hs[-1]
    self.scores2 = -(h_final[1:] - h_final[0]).norm(2, axis=1)
    h_final = self.normalize_2d(h_final)
    self.scores = T.dot(h_final[1:], h_final[0])

    # (len*batch)
    nll = T.nnet.categorical_crossentropy(
        self.p_y_given_x,
        idgs.ravel()
    )
    nll = nll.reshape(idgs.shape)
    self.nll = nll
    self.mask = mask = T.cast(T.neq(idgs, self.padding_id),
                              theano.config.floatX)
    nll = T.sum(nll*mask, axis=0)

    #layers.append(embedding_layer)
    layers.append(output_layer)

    params = []
    for l in self.layers:
        params += l.params
    self.params = params
    say("num of parameters: {}\n".format(
        sum(len(x.get_value(borrow=True).ravel()) for x in params)
    ))

    l2_reg = None
    for p in params:
        if l2_reg is None:
            l2_reg = p.norm(2)
        else:
            l2_reg = l2_reg + p.norm(2)
    l2_reg = l2_reg * args.l2_reg
    self.loss = T.mean(nll)
    self.cost = self.loss + l2_reg
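# A numpy sketch of the masked cross-entropy computed at the end of ready()
# above: only non-padding target positions contribute to the loss. The toy
# probabilities stand in for p_y_given_x gathered at the gold ids.
import numpy as np

_padding_id = 0
_idgs = np.array([[3, 2],       # target ids, shape (len=4, batch=2);
                  [1, 0],       # 0 marks padding
                  [2, 0],
                  [0, 0]])
_p_correct = np.full(_idgs.shape, 0.25)        # toy model probabilities
_nll = -np.log(_p_correct)                     # per-token negative log-lik.
_mask = (_idgs != _padding_id).astype("float32")
_per_seq = (_nll * _mask).sum(axis=0)          # sum over time, per sequence
_loss = _per_seq.mean()                        # mirrors self.loss in ready()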
def train(self, ids_corpus, train, dev=None, test=None, heldout=None):
    args = self.args
    dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)
    batch_size = args.batch_size
    padding_id = self.padding_id
    bos_id = self.bos_id
    eos_id = self.eos_id
    #train_batches = myio.create_batches(ids_corpus, train, batch_size, padding_id, args.loss)

    updates, lr, gnorm = create_optimization_updates(
        cost=self.cost,
        params=self.params,
        lr=args.learning_rate,
        method=args.learning
    )[:3]

    train_func = theano.function(
        inputs=[self.idxs, self.idys],
        outputs=[self.cost, self.loss, gnorm],
        updates=updates
    )
    eval_func = theano.function(
        inputs=[self.idxs],
        #outputs=self.scores2
        outputs=self.scores
    )
    nll_func = theano.function(
        inputs=[self.idxs, self.idys],
        outputs=[self.nll, self.mask]
    )

    say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

    result_table = PrettyTable(
        ["Epoch", "dev MAP", "dev MRR", "dev P@1", "dev P@5"] +
        ["tst MAP", "tst MRR", "tst P@1", "tst P@5"]
    )

    unchanged = 0
    best_dev = -1
    dev_MAP = dev_MRR = dev_P1 = dev_P5 = 0
    test_MAP = test_MRR = test_P1 = test_P5 = 0
    heldout_PPL = -1

    start_time = 0
    max_epoch = args.max_epoch
    for epoch in xrange(max_epoch):
        unchanged += 1
        if unchanged > 8:
            break

        start_time = time.time()

        train_batches = myio.create_batches(ids_corpus, train, batch_size,
                                            padding_id, bos_id, eos_id,
                                            auto_encode=True)
        N = len(train_batches)

        train_cost = 0.0
        train_loss = 0.0
        train_loss2 = 0.0
        for i in xrange(N):
            # get current batch
            t1, b1, t2 = train_batches[i]

            if args.use_title:
                idxs, idys = myio.create_one_batch(t1, t2, padding_id)
                cur_cost, cur_loss, grad_norm = train_func(idxs, idys)
                train_cost += cur_cost
                train_loss += cur_loss
                train_loss2 += cur_loss / idys.shape[0]

            if args.use_body:
                idxs, idys = myio.create_one_batch(b1, t2, padding_id)
                cur_cost, cur_loss, grad_norm = train_func(idxs, idys)
                train_cost += cur_cost
                train_loss += cur_loss
                train_loss2 += cur_loss / idys.shape[0]

            if i % 10 == 0:
                say("\r{}/{}".format(i, N))

            if i == N-1:
                self.dropout.set_value(0.0)

                if dev is not None:
                    dev_MAP, dev_MRR, dev_P1, dev_P5 = self.evaluate(dev, eval_func)
                if test is not None:
                    test_MAP, test_MRR, test_P1, test_P5 = self.evaluate(test, eval_func)
                if heldout is not None:
                    heldout_PPL = self.evaluate_perplexity(heldout, nll_func)

                if dev_MRR > best_dev:
                    unchanged = 0
                    best_dev = dev_MRR
                    result_table.add_row(
                        [epoch] +
                        ["%.2f" % x for x in
                         [dev_MAP, dev_MRR, dev_P1, dev_P5] +
                         [test_MAP, test_MRR, test_P1, test_P5]]
                    )
                    if args.model:
                        self.save_model(args.model + ".pkl.gz")

                dropout_p = np.float64(args.dropout).astype(theano.config.floatX)
                self.dropout.set_value(dropout_p)

                say("\r\n\n")
                say(("Epoch {}\tcost={:.3f}\tloss={:.3f} {:.3f}\t"
                     "\tMRR={:.2f},{:.2f}\tPPL={:.1f}\t|g|={:.3f}\t[{:.3f}m]\n").format(
                        epoch,
                        train_cost / (i+1),
                        train_loss / (i+1),
                        train_loss2 / (i+1),
                        dev_MRR,
                        best_dev,
                        heldout_PPL,
                        float(grad_norm),
                        (time.time()-start_time)/60.0
                ))
                say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

        say("\n")
        say("{}".format(result_table))
        say("\n")
def ready(self):
    args = self.args
    weights = self.weights

    # len(title) * batch
    idts = self.idts = T.imatrix()
    # len(body) * batch
    idbs = self.idbs = T.imatrix()
    # num pairs * 3, or num queries * candidate size
    idps = self.idps = T.imatrix()

    dropout = self.dropout = theano.shared(
        np.float64(args.dropout).astype(theano.config.floatX)
    )
    dropout_op = self.dropout_op = Dropout(self.dropout)

    embedding_layer = self.embedding_layer
    activation = get_activation_by_name(args.activation)

    n_d = self.n_d = args.hidden_dim
    n_e = self.n_e = embedding_layer.n_d

    if args.layer.lower() == "rcnn":
        LayerType = RCNN
    elif args.layer.lower() == "lstm":
        LayerType = LSTM
    elif args.layer.lower() == "gru":
        LayerType = GRU

    depth = self.depth = args.depth
    layers = self.layers = []
    for i in range(depth):
        if LayerType != RCNN:
            feature_layer = LayerType(
                n_in=n_e if i == 0 else n_d,
                n_out=n_d,
                activation=activation
            )
        else:
            feature_layer = LayerType(
                n_in=n_e if i == 0 else n_d,
                n_out=n_d,
                activation=activation,
                order=args.order,
                mode=args.mode,
                has_outgate=args.outgate
            )
        layers.append(feature_layer)

    # feature computation starts here

    # (len*batch)*n_e
    xt = embedding_layer.forward(idts.ravel())
    if weights is not None:
        # down-weight words by their IDF weight
        xt_w = weights[idts.ravel()].dimshuffle((0,'x'))
        xt = xt * xt_w
    # len*batch*n_e
    xt = xt.reshape((idts.shape[0], idts.shape[1], n_e))
    xt = apply_dropout(xt, dropout)

    # (len*batch)*n_e
    xb = embedding_layer.forward(idbs.ravel())
    if weights is not None:
        xb_w = weights[idbs.ravel()].dimshuffle((0,'x'))
        xb = xb * xb_w
    # len*batch*n_e
    xb = xb.reshape((idbs.shape[0], idbs.shape[1], n_e))
    xb = apply_dropout(xb, dropout)

    prev_ht = self.xt = xt
    prev_hb = self.xb = xb
    for i in range(depth):
        # len*batch*n_d
        ht = layers[i].forward_all(prev_ht)
        hb = layers[i].forward_all(prev_hb)
        prev_ht = ht
        prev_hb = hb

    # normalize vectors
    if args.normalize:
        ht = self.normalize_3d(ht)
        hb = self.normalize_3d(hb)

    say("h_title dtype: {}\n".format(ht.dtype))
    self.ht = ht
    self.hb = hb

    # average over length, ignore paddings
    # batch * d
    if args.average:
        ht = self.average_without_padding(ht, idts)
        hb = self.average_without_padding(hb, idbs)
    else:
        ht = ht[-1]
        hb = hb[-1]
    say("h_avg_title dtype: {}\n".format(ht.dtype))

    # batch * d: question representation = mean of title and body encodings
    h_final = (ht+hb)*0.5
    h_final = apply_dropout(h_final, dropout)
    h_final = self.normalize_2d(h_final)
    self.h_final = h_final
    say("h_final dtype: {}\n".format(ht.dtype))

    # For testing:
    #   first one in batch is query, the rest are candidate questions
    self.scores = T.dot(h_final[1:], h_final[0])

    # For training:
    xp = h_final[idps.ravel()]
    xp = xp.reshape((idps.shape[0], idps.shape[1], n_d))
    # num query * n_d
    query_vecs = xp[:,0,:]
    # num query
    pos_scores = T.sum(query_vecs*xp[:,1,:], axis=1)
    # num query * candidate size
    neg_scores = T.sum(query_vecs.dimshuffle((0,'x',1))*xp[:,2:,:], axis=2)
    # num query: keep only the highest-scoring (hardest) negative
    neg_scores = T.max(neg_scores, axis=1)
    diff = neg_scores - pos_scores + 1.0
    loss = T.mean((diff > 0)*diff)
    self.loss = loss

    params = []
    for l in self.layers:
        params += l.params
    self.params = params
    say("num of parameters: {}\n".format(
        sum(len(x.get_value(borrow=True).ravel()) for x in params)
    ))

    l2_reg = None
    for p in params:
        if l2_reg is None:
            l2_reg = p.norm(2)
        else:
            l2_reg = l2_reg + p.norm(2)
    l2_reg = l2_reg * args.l2_reg
    self.cost = self.loss + l2_reg
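# A numpy sketch of the max-margin objective built in ready() above, assuming
# h_final rows are already L2-normalized and each row of idps indexes
# [query, positive, negatives...] into h_final:
import numpy as np

_rng = np.random.RandomState(0)
_h = _rng.randn(8, 4)
_h /= np.linalg.norm(_h, axis=1, keepdims=True)    # as normalize_2d does
_idps = np.array([[0, 1, 2, 3],                    # two queries, each with
                  [4, 5, 6, 7]])                   # one positive, two negatives

_xp = _h[_idps.ravel()].reshape(_idps.shape[0], _idps.shape[1], 4)
_query = _xp[:, 0, :]
_pos = np.sum(_query * _xp[:, 1, :], axis=1)               # cosine to positive
_neg = np.sum(_query[:, None, :] * _xp[:, 2:, :], axis=2)  # cosine to negatives
_neg = _neg.max(axis=1)                                    # hardest negative
_diff = _neg - _pos + 1.0                                  # margin of 1.0
_loss = np.mean((_diff > 0) * _diff)                       # hinge loss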
def train_model(self, ids_corpus, train, dev=None, test=None):
    with tf.Session() as sess:

        result_table = PrettyTable(
            ["Epoch", "Step", "dev MAP", "dev MRR", "dev P@1", "dev P@5",
             "tst MAP", "tst MRR", "tst P@1", "tst P@5"]
        )
        dev_MAP = dev_MRR = dev_P1 = dev_P5 = 0
        test_MAP = test_MRR = test_P1 = test_P5 = 0
        best_dev = -1

        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(self.args.learning_rate)
        train_op = optimizer.minimize(self.cost, global_step=global_step)

        print '\n\ntrainable params: ', tf.trainable_variables(), '\n\n'

        sess.run(tf.global_variables_initializer())
        emb = sess.run(self.embeddings)
        print '\nemb {}\n'.format(emb[10][0:10])
        if self.init_assign_ops != {}:
            print 'assigning trained values ...\n'
            sess.run(self.init_assign_ops)
            emb = sess.run(self.embeddings)
            print '\nemb {}\n'.format(emb[10][0:10])
            self.init_assign_ops = {}

        if self.args.save_dir != "":
            print("Writing to {}\n".format(self.args.save_dir))

            # TRAIN LOSS
            train_loss_writer = tf.summary.FileWriter(
                os.path.join(self.args.save_dir, "summaries", "train", "loss"),
            )
            train_cost_writer = tf.summary.FileWriter(
                os.path.join(self.args.save_dir, "summaries", "train", "cost"),
                sess.graph
            )

            # VARIABLE NORM
            p_norm_summaries = {}
            p_norm_placeholders = {}
            for param_name, param_norm in self.get_pnorm_stat(sess).iteritems():
                p_norm_placeholders[param_name] = tf.placeholder(tf.float32)
                p_norm_summaries[param_name] = tf.summary.scalar(
                    param_name, p_norm_placeholders[param_name])
            p_norm_summary_op = tf.summary.merge(p_norm_summaries.values())
            p_norm_summary_dir = os.path.join(self.args.save_dir,
                                              "summaries", "p_norm")
            p_norm_summary_writer = tf.summary.FileWriter(p_norm_summary_dir, )

            # DEV LOSS & EVAL
            dev_loss0_writer = tf.summary.FileWriter(
                os.path.join(self.args.save_dir, "summaries", "dev", "loss0"),
            )
            dev_loss1_writer = tf.summary.FileWriter(
                os.path.join(self.args.save_dir, "summaries", "dev", "loss1"),
            )
            dev_loss2_writer = tf.summary.FileWriter(
                os.path.join(self.args.save_dir, "summaries", "dev", "loss2"),
            )
            dev_eval_writer1 = tf.summary.FileWriter(
                os.path.join(self.args.save_dir, "summaries", "dev", "MAP"),
            )
            dev_eval_writer2 = tf.summary.FileWriter(
                os.path.join(self.args.save_dir, "summaries", "dev", "MRR"),
            )
            dev_eval_writer3 = tf.summary.FileWriter(
                os.path.join(self.args.save_dir, "summaries", "dev", "Pat1"),
            )
            dev_eval_writer4 = tf.summary.FileWriter(
                os.path.join(self.args.save_dir, "summaries", "dev", "Pat5"),
            )

            loss = tf.placeholder(tf.float32)
            loss_summary = tf.summary.scalar("loss", loss)
            dev_eval = tf.placeholder(tf.float32)
            dev_summary = tf.summary.scalar("QR_evaluation", dev_eval)
            cost = tf.placeholder(tf.float32)
            cost_summary = tf.summary.scalar("cost", cost)
            # train_eval = tf.placeholder(tf.float32)
            # train_summary = tf.summary.scalar("QR_train", train_eval)

        if self.args.save_dir != "":
            checkpoint_dir = os.path.join(self.args.save_dir, "checkpoints")
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)

        patience = 8 if 'patience' not in self.args else self.args.patience
        unchanged = 0
        max_epoch = self.args.max_epoch
        for epoch in xrange(max_epoch):
            unchanged += 1
            if unchanged > patience:
                break

            train_batches = myio.create_batches(ids_corpus, train,
                                                self.args.batch_size,
                                                self.padding_id,
                                                pad_left=False)
            N = len(train_batches)

            train_loss = 0.0
            train_cost = 0.0

            for i in xrange(N):
                idts, idbs, idps, qpp = train_batches[i]
                cur_step, cur_loss, cur_cost = self.train_batch(
                    idts, idbs, idps, qpp, train_op, global_step, sess)

                summary = sess.run(loss_summary, {loss: cur_loss})
                train_loss_writer.add_summary(summary, cur_step)
                train_loss_writer.flush()
                summary = sess.run(cost_summary, {cost: cur_cost})
                train_cost_writer.add_summary(summary, cur_step)
                train_cost_writer.flush()

                train_loss += cur_loss
                train_cost += cur_cost

                if i % 10 == 0:
                    say("\r{}/{}".format(i, N))

                if i == N-1 or (i % 10 == 0 and 'testing' in self.args
                                and self.args.testing):
                    # EVAL
                    if dev:
                        dev_MAP, dev_MRR, dev_P1, dev_P5, dloss0, dloss1, dloss2 = \
                            self.evaluate(dev, sess)

                        summary = sess.run(loss_summary, {loss: dloss0})
                        dev_loss0_writer.add_summary(summary, cur_step)
                        dev_loss0_writer.flush()
                        summary = sess.run(loss_summary, {loss: dloss1})
                        dev_loss1_writer.add_summary(summary, cur_step)
                        dev_loss1_writer.flush()
                        summary = sess.run(loss_summary, {loss: dloss2})
                        dev_loss2_writer.add_summary(summary, cur_step)
                        dev_loss2_writer.flush()

                        summary = sess.run(dev_summary, {dev_eval: dev_MAP})
                        dev_eval_writer1.add_summary(summary, cur_step)
                        dev_eval_writer1.flush()
                        summary = sess.run(dev_summary, {dev_eval: dev_MRR})
                        dev_eval_writer2.add_summary(summary, cur_step)
                        dev_eval_writer2.flush()
                        summary = sess.run(dev_summary, {dev_eval: dev_P1})
                        dev_eval_writer3.add_summary(summary, cur_step)
                        dev_eval_writer3.flush()
                        summary = sess.run(dev_summary, {dev_eval: dev_P5})
                        dev_eval_writer4.add_summary(summary, cur_step)
                        dev_eval_writer4.flush()

                        feed_dict = {}
                        for param_name, param_norm in self.get_pnorm_stat(sess).iteritems():
                            feed_dict[p_norm_placeholders[param_name]] = param_norm
                        _p_norm_sum = sess.run(p_norm_summary_op, feed_dict)
                        p_norm_summary_writer.add_summary(_p_norm_sum, cur_step)

                    if test:
                        test_MAP, test_MRR, test_P1, test_P5, tloss0, tloss1, tloss2 = \
                            self.evaluate(test, sess)

                    if self.args.performance == "MRR" and dev_MRR > best_dev:
                        unchanged = 0
                        best_dev = dev_MRR
                        result_table.add_row(
                            [epoch, cur_step,
                             dev_MAP, dev_MRR, dev_P1, dev_P5,
                             test_MAP, test_MRR, test_P1, test_P5]
                        )
                        if self.args.save_dir != "":
                            self.save(sess, checkpoint_prefix, cur_step)
                    elif self.args.performance == "MAP" and dev_MAP > best_dev:
                        unchanged = 0
                        best_dev = dev_MAP
                        result_table.add_row(
                            [epoch, cur_step,
                             dev_MAP, dev_MRR, dev_P1, dev_P5,
                             test_MAP, test_MRR, test_P1, test_P5]
                        )
                        if self.args.save_dir != "":
                            self.save(sess, checkpoint_prefix, cur_step)

                    say("\r\n\nEpoch {}\tcost={:.3f}\tloss={:.3f}\tMRR={:.2f},MAP={:.2f}\n".format(
                        epoch,
                        train_cost / (i + 1),  # i.e. divided by N training batches
                        train_loss / (i + 1),  # i.e. divided by N training batches
                        dev_MRR,
                        dev_MAP))
                    say("\n{}\n".format(result_table))
                    myio.say("\tp_norm: {}\n".format(self.get_pnorm_stat(sess)))
def ready(self):
    generator = self.generator
    args = self.args
    weights = self.weights

    dropout = generator.dropout

    # len(text) * batch
    idts = generator.x
    z = generator.z_pred
    z = z.dimshuffle((0,1,"x"))

    # batch * 2
    pairs = self.pairs = T.imatrix()

    # num pairs * 3, or num queries * candidate size
    triples = self.triples = T.imatrix()

    embedding_layer = self.embedding_layer

    activation = get_activation_by_name(args.activation)
    n_d = self.n_d = args.hidden_dim
    n_e = self.n_e = embedding_layer.n_d

    if args.layer.lower() == "rcnn":
        LayerType = RCNN
        LayerType2 = ExtRCNN
    elif args.layer.lower() == "lstm":
        LayerType = LSTM
        LayerType2 = ExtLSTM
    #elif args.layer.lower() == "gru":
    #    LayerType = GRU

    depth = self.depth = args.depth
    layers = self.layers = []
    for i in range(depth):
        if LayerType != RCNN:
            feature_layer = LayerType(
                n_in=n_e if i == 0 else n_d,
                n_out=n_d,
                activation=activation
            )
        else:
            feature_layer = LayerType(
                n_in=n_e if i == 0 else n_d,
                n_out=n_d,
                activation=activation,
                order=args.order,
                mode=args.mode,
                has_outgate=args.outgate
            )
        layers.append(feature_layer)

    # the Ext* layers run the same recurrence but skip positions where
    # z is 0; they share parameters with the layers above
    extlayers = self.extlayers = []
    for i in range(depth):
        if LayerType != RCNN:
            feature_layer = LayerType2(
                n_in=n_e if i == 0 else n_d,
                n_out=n_d,
                activation=activation
            )
        else:
            feature_layer = LayerType2(
                n_in=n_e if i == 0 else n_d,
                n_out=n_d,
                activation=activation,
                order=args.order,
                mode=args.mode,
                has_outgate=args.outgate
            )
        feature_layer.copy_params(layers[i])
        extlayers.append(feature_layer)

    # feature computation starts here
    xt = generator.word_embs

    # encode full text into representation
    prev_ht = self.xt = xt
    for i in range(depth):
        # len*batch*n_d
        ht = layers[i].forward_all(prev_ht)
        prev_ht = ht

    # encode selected text into representation
    prev_htz = self.xt = xt
    for i in range(depth):
        # len*batch*n_d
        htz = extlayers[i].forward_all(prev_htz, z)
        prev_htz = htz

    # normalize vectors
    if args.normalize:
        ht = self.normalize_3d(ht)
        htz = self.normalize_3d(htz)

    say("h_title dtype: {}\n".format(ht.dtype))
    self.ht = ht
    self.htz = htz

    # average over length, ignore paddings
    # batch * d
    if args.average:
        ht = self.average_without_padding(ht, idts)
        htz = self.average_without_padding(htz, idts, z)
    else:
        ht = ht[-1]
        htz = htz[-1]
    say("h_avg_title dtype: {}\n".format(ht.dtype))

    # batch * d
    h_final = apply_dropout(ht, dropout)
    h_final = self.normalize_2d(h_final)
    hz_final = apply_dropout(htz, dropout)
    hz_final = self.normalize_2d(hz_final)
    self.h_final = h_final
    self.hz_final = hz_final

    say("h_final dtype: {}\n".format(ht.dtype))

    # For testing:
    #   first one in batch is query, the rest are candidate questions
    self.scores = T.dot(h_final[1:], h_final[0])
    self.scores_z = T.dot(hz_final[1:], hz_final[0])

    # For training encoder:
    xp = h_final[triples.ravel()]
    xp = xp.reshape((triples.shape[0], triples.shape[1], n_d))
    # num query * n_d
    query_vecs = xp[:,0,:]
    # num query
    pos_scores = T.sum(query_vecs*xp[:,1,:], axis=1)
    # num query * candidate size
    neg_scores = T.sum(query_vecs.dimshuffle((0,'x',1))*xp[:,2:,:], axis=2)
    # num query
    neg_scores = T.max(neg_scores, axis=1)
    diff = neg_scores - pos_scores + 1.0
    hinge_loss = T.mean((diff > 0)*diff)

    # For training generator:

    # batch
    self_cosine_distance = 1.0 - T.sum(hz_final * h_final, axis=1)
    pair_cosine_distance = 1.0 - T.sum(hz_final * h_final[pairs[:,1]], axis=1)
    alpha = args.alpha
    loss_vec = self_cosine_distance*alpha + pair_cosine_distance*(1-alpha)
    #loss_vec = self_cosine_distance*0.2 + pair_cosine_distance*0.8

    zsum = generator.zsum
    zdiff = generator.zdiff
    logpz = generator.logpz

    sfactor = args.sparsity
    cfactor = args.sparsity * args.coherent
    scost_vec = zsum*sfactor + zdiff*cfactor

    # batch
    cost_vec = loss_vec + scost_vec
    cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
    loss = self.loss = T.mean(loss_vec)
    sparsity_cost = self.sparsity_cost = T.mean(scost_vec)
    self.obj = loss + sparsity_cost

    params = []
    for l in self.layers:
        params += l.params
    self.params = params
    say("num of parameters: {}\n".format(
        sum(len(x.get_value(borrow=True).ravel()) for x in params)
    ))

    l2_reg = None
    for p in params:
        if l2_reg is None:
            l2_reg = T.sum(p**2)  #p.norm(2)
        else:
            l2_reg = l2_reg + T.sum(p**2)  #p.norm(2)
    l2_reg = l2_reg * args.l2_reg
    self.l2_cost = l2_reg

    beta = args.beta
    self.cost_g = cost_logpz + generator.l2_cost
    self.cost_e = hinge_loss + loss*beta + l2_reg
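# cost_logpz above is a REINFORCE-style surrogate: the per-example cost
# (task loss plus sparsity/coherence penalties) reweights the log-probability
# of the sampled rationale z, so differentiating the surrogate w.r.t. the
# generator estimates the gradient of the expected cost. A numpy sketch of
# the scalar being assembled, with toy shapes and toy weights:
import numpy as np

_rng = np.random.RandomState(0)
_len, _batch = 5, 3
_loss_vec = _rng.rand(_batch)             # task loss per example
_zsum = _rng.rand(_batch)                 # number of selected words
_zdiff = _rng.rand(_batch)                # number of selection transitions
_logpz = -_rng.rand(_len, _batch)         # log p(z_t | x) per position

_sfactor, _cfactor = 0.01, 0.02           # toy sparsity/coherence weights
_scost_vec = _zsum * _sfactor + _zdiff * _cfactor
_cost_vec = _loss_vec + _scost_vec
_cost_logpz = np.mean(_cost_vec * _logpz.sum(axis=0))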
def train(self, ids_corpus, train, dev=None, test=None):
    args = self.args
    dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)
    batch_size = args.batch_size
    padding_id = self.padding_id
    #train_batches = myio.create_batches(ids_corpus, train, batch_size, padding_id)

    if dev is not None:
        dev, dev_raw = dev
    if test is not None:
        test, test_raw = test

    if args.joint:
        updates_e, lr_e, gnorm_e = create_optimization_updates(
            cost=self.encoder.cost_e,  #self.encoder.cost,
            params=self.encoder.params,
            lr=args.learning_rate*0.1,
            method=args.learning
        )[:3]
    else:
        updates_e = {}

    updates_g, lr_g, gnorm_g = create_optimization_updates(
        cost=self.encoder.cost_g,
        params=self.generator.params,
        lr=args.learning_rate,
        method=args.learning
    )[:3]

    train_func = theano.function(
        inputs=[self.x, self.triples, self.pairs],
        outputs=[self.encoder.obj, self.encoder.loss,
                 self.encoder.sparsity_cost, self.generator.p1, gnorm_g],
        # updates = updates_g.items() + updates_e.items() + self.generator.sample_updates,
        updates=collections.OrderedDict(
            list(updates_g.items()) + list(updates_e.items()) +
            list(self.generator.sample_updates.items())
        ),
        #no_default_updates = True,
        on_unused_input="ignore"
    )

    eval_func = theano.function(
        inputs=[self.x],
        outputs=self.encoder.scores
    )

    eval_func2 = theano.function(
        inputs=[self.x],
        outputs=[self.encoder.scores_z, self.generator.p1, self.z],
        updates=self.generator.sample_updates,
        #no_default_updates = True
    )

    say("\tp_norm: {}\n".format(self.get_pnorm_stat(self.encoder.params)))
    say("\tp_norm: {}\n".format(self.get_pnorm_stat(self.generator.params)))

    result_table = PrettyTable(
        ["Epoch", "dev MAP", "dev MRR", "dev P@1", "dev P@5"] +
        ["tst MAP", "tst MRR", "tst P@1", "tst P@5"]
    )

    last_train_avg_cost = None
    tolerance = 0.5 + 1e-3
    unchanged = 0
    best_dev = -1
    dev_MAP = dev_MRR = dev_P1 = dev_P5 = 0
    test_MAP = test_MRR = test_P1 = test_P5 = 0
    start_time = 0
    max_epoch = args.max_epoch

    for epoch in range(max_epoch):
        unchanged += 1
        if unchanged > 20:
            break

        start_time = time.time()

        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(ids_corpus, train, batch_size,
                                            padding_id,
                                            pad_left=not args.average,
                                            merge=args.merge)
        N = len(train_batches)

        more = True
        # keep a copy of the parameters in case the epoch diverges and
        # has to be replayed with a smaller learning rate
        param_bak = [p.get_value(borrow=False) for p in self.params]

        while more:
            train_loss = 0.0
            train_cost = 0.0
            train_scost = 0.0
            train_p1 = 0.0

            for i in range(N):
                # get current batch
                idts, triples, pairs = train_batches[i]

                cur_cost, cur_loss, cur_scost, cur_p1, gnormg = train_func(
                    idts, triples, pairs)
                train_loss += cur_loss
                train_cost += cur_cost
                train_scost += cur_scost
                train_p1 += cur_p1

                if i % 10 == 0:
                    say("\r{}/{} {:.3f}".format(i, N, train_p1/(i+1)))

            cur_train_avg_cost = train_cost / N

            more = False
            if last_train_avg_cost is not None:
                if cur_train_avg_cost > last_train_avg_cost*(1+tolerance):
                    more = True
                    say("\nTrain cost {} --> {}\n".format(
                        last_train_avg_cost, cur_train_avg_cost
                    ))

            if more:
                # halve the learning rate, restore the saved parameters
                # and replay the epoch
                lr_val = lr_g.get_value()*0.5
                if lr_val < 1e-5:
                    return
                lr_val = np.float64(lr_val).astype(theano.config.floatX)
                lr_g.set_value(lr_val)
                lr_e.set_value(lr_val)
                say("Decrease learning rate to {}\n".format(float(lr_val)))
                for p, v in zip(self.params, param_bak):
                    p.set_value(v)
                continue

        last_train_avg_cost = cur_train_avg_cost

        say("\r\n\n")
        say(("Epoch {} cost={:.3f} loss={:.3f} scost={:.3f}"
             " P[1]={:.3f} |g|={:.3f}\t[{:.3f}m]\n").format(
                epoch,
                train_cost / N,
                train_loss / N,
                train_scost / N,
                train_p1 / N,
                float(gnormg),
                (time.time()-start_time)/60.0
        ))
        say("\tp_norm: {}\n".format(self.get_pnorm_stat(self.encoder.params)))
        say("\tp_norm: {}\n".format(self.get_pnorm_stat(self.generator.params)))

        self.dropout.set_value(0.0)

        if dev is not None:
            full_MAP, full_MRR, full_P1, full_P5 = self.evaluate(dev, eval_func)
            dev_MAP, dev_MRR, dev_P1, dev_P5, dev_PZ1, dev_PT = \
                self.evaluate_z(dev, dev_raw, ids_corpus, eval_func2)
        if test is not None:
            test_MAP, test_MRR, test_P1, test_P5, test_PZ1, test_PT = \
                self.evaluate_z(test, test_raw, ids_corpus, eval_func2)

        if dev_MAP > best_dev:
            best_dev = dev_MAP
            unchanged = 0

        say("\n")
        say("  fMAP={:.2f} fMRR={:.2f} fP1={:.2f} fP5={:.2f}\n".format(
            full_MAP, full_MRR, full_P1, full_P5
        ))
        say("\n")
        say(("  dMAP={:.2f} dMRR={:.2f} dP1={:.2f} dP5={:.2f}" +
             " dP[1]={:.3f} d%T={:.3f} best_dev={:.2f}\n").format(
            dev_MAP, dev_MRR, dev_P1, dev_P5, dev_PZ1, dev_PT, best_dev
        ))

        result_table.add_row(
            [epoch] +
            ["%.2f" % x for x in
             [dev_MAP, dev_MRR, dev_P1, dev_P5] +
             [test_MAP, test_MRR, test_P1, test_P5]]
        )

        if unchanged == 0:
            say("\n")
            say(("  tMAP={:.2f} tMRR={:.2f} tP1={:.2f} tP5={:.2f}" +
                 " tP[1]={:.3f} t%T={:.3f}\n").format(
                test_MAP, test_MRR, test_P1, test_P5, test_PZ1, test_PT
            ))
            if args.dump_rationale:
                self.evaluate_z(dev+test, dev_raw+test_raw, ids_corpus,
                                eval_func2, args.dump_rationale)

            #if args.save_model:
            #    self.save_model(args.save_model)

        dropout_p = np.float64(args.dropout).astype(theano.config.floatX)
        self.dropout.set_value(dropout_p)

        say("\n")
        say("{}".format(result_table))
        say("\n")

        if train_p1/N <= 1e-4 or train_p1/N + 1e-4 >= 1.0:
            # rationale selection collapsed to keeping nothing or everything
            break
def ready(self):
    embedding_layer = self.embedding_layer
    args = self.args
    padding_id = self.padding_id
    weights = self.weights

    dropout = self.dropout = theano.shared(
        np.float64(args.dropout).astype(theano.config.floatX)
    )

    # len*batch
    x = self.x = T.imatrix()

    n_d = args.hidden_dim2
    n_e = embedding_layer.n_d
    activation = get_activation_by_name(args.activation)

    # two recurrent layers: one reads the text forward, one backward
    layers = self.layers = []
    layer_type = args.layer.lower()
    for i in range(2):
        if layer_type == "rcnn":
            l = RCNN(
                n_in=n_e,  # if i == 0 else n_d,
                n_out=n_d,
                activation=activation,
                order=args.order
            )
        elif layer_type == "lstm":
            l = LSTM(
                n_in=n_e,  # if i == 0 else n_d,
                n_out=n_d,
                activation=activation
            )
        layers.append(l)

    # len * batch
    masks = T.cast(T.neq(x, padding_id), "float32")
    #masks = masks.dimshuffle((0,1,"x"))

    # (len*batch)*n_e
    embs = embedding_layer.forward(x.ravel())
    if weights is not None:
        embs_w = weights[x.ravel()].dimshuffle((0,'x'))
        embs = embs * embs_w
    # len*batch*n_e
    embs = embs.reshape((x.shape[0], x.shape[1], n_e))
    embs = apply_dropout(embs, dropout)
    self.word_embs = embs

    flipped_embs = embs[::-1]

    # len*batch*n_d
    h1 = layers[0].forward_all(embs)
    h2 = layers[1].forward_all(flipped_embs)
    h_final = T.concatenate([h1, h2[::-1]], axis=2)
    h_final = apply_dropout(h_final, dropout)
    size = n_d * 2

    output_layer = self.output_layer = ZLayer(
        n_in=size,
        n_hidden=n_d,
        activation=activation
    )

    # sample z given text (i.e. x)
    z_pred, sample_updates = output_layer.sample_all(h_final)

    # we are computing approximated gradient by sampling z;
    # so should mark sampled z not part of the gradient propagation path
    #
    z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)
    self.sample_updates = sample_updates
    # print "z_pred", z_pred.ndim

    # fraction of non-padding positions selected by z
    self.p1 = T.sum(masks*z_pred) / (T.sum(masks) + 1e-8)

    # len*batch*1
    probs = output_layer.forward_all(h_final, z_pred)
    # print "probs", probs.ndim

    logpz = -T.nnet.binary_crossentropy(probs, z_pred) * masks
    logpz = self.logpz = logpz.reshape(x.shape)
    probs = self.probs = probs.reshape(x.shape)

    # batch
    z = z_pred
    self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
    self.zdiff = T.sum(T.abs_(z[1:]-z[:-1]), axis=0,
                       dtype=theano.config.floatX)

    params = self.params = []
    for l in layers + [output_layer]:
        for p in l.params:
            params.append(p)
    nparams = sum(len(x.get_value(borrow=True).ravel())
                  for x in params)
    say("total # parameters: {}\n".format(nparams))

    l2_cost = None
    for p in params:
        if l2_cost is None:
            l2_cost = T.sum(p**2)
        else:
            l2_cost = l2_cost + T.sum(p**2)
    l2_cost = l2_cost * args.l2_reg
    self.l2_cost = l2_cost
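# zsum and zdiff above are what drive the sparsity/coherence penalties:
# zsum counts selected positions per example and zdiff counts 0/1 transitions,
# so contiguous selections pay a smaller coherence cost. A toy illustration:
import numpy as np

_z = np.array([[1, 1],      # binary rationale, shape (len=6, batch=2)
               [1, 0],
               [1, 1],
               [0, 1],
               [0, 1],
               [1, 0]], dtype="float32")

_zsum = _z.sum(axis=0)                         # [4., 4.] words kept
_zdiff = np.abs(_z[1:] - _z[:-1]).sum(axis=0)  # [2., 3.] transitions
# both columns keep four words, but column 0 is more contiguous and is
# penalized less by the coherence term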
def main(args):
    raw_corpus = myio.read_corpus(args.corpus, args.translations or None,
                                  args.translatable_ids or None,
                                  args.generated_questions_train or None)
    generated_questions_eval = myio.read_generated_questions(
        args.generated_questions)

    embedding_layer = None
    if args.trainable_embeddings == 1:
        embedding_layer = myio.create_embedding_layer(
            raw_corpus,
            n_d=args.hidden_dim,
            cut_off=args.cut_off,
            embs=load_embedding_iterator(args.embeddings)
                if args.embeddings else None,
            fix_init_embs=False
        )
    else:
        embedding_layer = myio.create_embedding_layer(
            raw_corpus,
            n_d=args.hidden_dim,
            cut_off=args.cut_off,
            embs=load_embedding_iterator(args.embeddings)
                if args.embeddings else None
        )

    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer,
                                 max_len=args.max_seq_len,
                                 generated_questions=generated_questions_eval)
    say("vocab size={}, corpus size={}\n".format(
        embedding_layer.n_V, len(raw_corpus)
    ))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        # dev = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.read_annotations(args.dev, K_neg=args.dev_pool_size,
                                    prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id,
                                       pad_left=not args.average)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id,
                                        pad_left=not args.average)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(
            args.train, training_data_percent=args.training_data_percent)
        train_batches = myio.create_batches(
            ids_corpus, train, args.batch_size, padding_id,
            pad_left=not args.average, include_generated_questions=True)
        say("{} to create batches\n".format(time.time() - start_time))
        say("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(x[0].ravel()) + len(x[1].ravel()) for x in train_batches),
            sum(len(x[2].ravel()) for x in train_batches)
        ))
        train_batches = None

        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)
        # print('args.average: ' + args.average)
        model.ready()

        # # set parameters using pre-trained network
        if args.do_train == 1:
            if args.load_pretrain:
                model.load_pretrained_parameters(args)
            model.train(
                ids_corpus, train,
                dev if args.dev else None,
                test if args.test else None
            )

        # AVERAGE THE PREDICTIONS OBTAINED BY RUNNING THE MODEL 10 TIMES
        if args.do_evaluate == 1:
            model.load_pretrained_parameters(args)
            # model.set_model(model.load_model(args.load_pretrain))
            for i in range(1):
                r = model.just_eval(
                    dev if args.dev else None,
                    test if args.test else None
                )

        # ANALYZE the results
        if len(args.analyze_file.strip()) > 0:
            model.load_pretrained_parameters(args)
            file_name = args.analyze_file.strip()  # 'AskUbuntu.Rcnn_analysis3.gt(es)-gt.txt'
            model.analyze(file_name, embedding_layer, dev)
def train(self, ids_corpus, train, dev=None, test=None, heldout=None):
    args = self.args
    dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)
    batch_size = args.batch_size
    padding_id = self.padding_id
    bos_id = self.bos_id
    eos_id = self.eos_id

    #train_batches = myio.create_batches(ids_corpus, train, batch_size,
    #                                    padding_id, args.loss)

    updates, lr, gnorm = create_optimization_updates(
        cost=self.cost,
        params=self.params,
        lr=args.learning_rate,
        method=args.learning)[:3]

    train_func = theano.function(
        inputs=[self.idxs, self.idys],
        outputs=[self.cost, self.loss, gnorm],
        updates=updates)

    eval_func = theano.function(
        inputs=[self.idxs],
        #outputs=self.scores2
        outputs=self.scores)

    nll_func = theano.function(
        inputs=[self.idxs, self.idys],
        outputs=[self.nll, self.mask])

    say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

    result_table = PrettyTable(
        ["Epoch", "dev MAP", "dev MRR", "dev P@1", "dev P@5"] +
        ["tst MAP", "tst MRR", "tst P@1", "tst P@5"])

    unchanged = 0
    best_dev = -1
    dev_MAP = dev_MRR = dev_P1 = dev_P5 = 0
    test_MAP = test_MRR = test_P1 = test_P5 = 0
    heldout_PPL = -1

    start_time = 0
    max_epoch = args.max_epoch
    for epoch in xrange(max_epoch):
        unchanged += 1
        if unchanged > 8:
            break

        start_time = time.time()

        train_batches = myio.create_batches(ids_corpus, train, batch_size,
                                            padding_id, bos_id, eos_id,
                                            auto_encode=True)
        N = len(train_batches)

        train_cost = 0.0
        train_loss = 0.0
        train_loss2 = 0.0
        for i in xrange(N):
            # get current batch
            t1, b1, t2 = train_batches[i]

            if args.use_title:
                idxs, idys = myio.create_one_batch(t1, t2, padding_id)
                cur_cost, cur_loss, grad_norm = train_func(idxs, idys)
                train_cost += cur_cost
                train_loss += cur_loss
                train_loss2 += cur_loss / idys.shape[0]

            if args.use_body:
                idxs, idys = myio.create_one_batch(b1, t2, padding_id)
                cur_cost, cur_loss, grad_norm = train_func(idxs, idys)
                train_cost += cur_cost
                train_loss += cur_loss
                train_loss2 += cur_loss / idys.shape[0]

            if i % 10 == 0:
                say("\r{}/{}".format(i, N))

            if i == N - 1:
                self.dropout.set_value(0.0)

                if dev is not None:
                    dev_MAP, dev_MRR, dev_P1, dev_P5 = \
                        self.evaluate(dev, eval_func)
                if test is not None:
                    test_MAP, test_MRR, test_P1, test_P5 = \
                        self.evaluate(test, eval_func)
                if heldout is not None:
                    heldout_PPL = self.evaluate_perplexity(heldout, nll_func)

                if dev_MRR > best_dev:
                    unchanged = 0
                    best_dev = dev_MRR
                    result_table.add_row(
                        [epoch] +
                        ["%.2f" % x for x in
                         [dev_MAP, dev_MRR, dev_P1, dev_P5] +
                         [test_MAP, test_MRR, test_P1, test_P5]])
                    if args.model:
                        self.save_model(args.model + ".pkl.gz")

                dropout_p = np.float64(args.dropout).astype(
                    theano.config.floatX)
                self.dropout.set_value(dropout_p)

                say("\r\n\n")
                say(("Epoch {}\tcost={:.3f}\tloss={:.3f} {:.3f}\t"
                     "\tMRR={:.2f},{:.2f}\tPPL={:.1f}\t|g|={:.3f}\t[{:.3f}m]\n"
                     ).format(
                        epoch,
                        train_cost / (i + 1),
                        train_loss / (i + 1),
                        train_loss2 / (i + 1),
                        dev_MRR,
                        best_dev,
                        heldout_PPL,
                        float(grad_norm),
                        (time.time() - start_time) / 60.0))
                say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

        say("\n")
        say("{}".format(result_table))
        say("\n")
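# --- Illustration (not part of the model) -------------------------------
# The loop above implements a simple patience rule: `unchanged` ticks up
# every epoch and is reset whenever a new best dev MRR is found; training
# stops once it exceeds 8. A standalone sketch of that bookkeeping, with
# a made-up score sequence (function name and data are hypothetical):
def patience_demo(dev_scores, patience=8):
    best = -1.0
    unchanged = 0
    stopped_at = None
    for epoch, score in enumerate(dev_scores):
        unchanged += 1
        if unchanged > patience:
            stopped_at = epoch      # patience exhausted; stop training
            break
        if score > best:            # new best dev score resets the counter
            best = score
            unchanged = 0
    # returns (epoch we stopped at, or None if we never hit patience; best)
    return stopped_at, best

# patience_demo([0.50, 0.52, 0.51] + [0.50] * 12) -> (10, 0.52)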
def ready(self):
    args = self.args
    weights = self.weights

    # len(title) * batch
    idts = self.idts = T.imatrix()
    # len(body) * batch
    idbs = self.idbs = T.imatrix()
    # num pairs * 3, or num queries * candidate size
    idps = self.idps = T.imatrix()

    dropout = self.dropout = theano.shared(
        np.float64(args.dropout).astype(theano.config.floatX))
    dropout_op = self.dropout_op = Dropout(self.dropout)

    embedding_layer = self.embedding_layer
    activation = get_activation_by_name(args.activation)

    n_d = self.n_d = args.hidden_dim
    n_e = self.n_e = embedding_layer.n_d

    if args.layer.lower() == "rcnn":
        LayerType = RCNN
    elif args.layer.lower() == "lstm":
        LayerType = LSTM
    elif args.layer.lower() == "gru":
        LayerType = GRU

    depth = self.depth = args.depth
    layers = self.layers = []
    for i in range(depth):
        if LayerType != RCNN:
            feature_layer = LayerType(n_in=n_e if i == 0 else n_d,
                                      n_out=n_d,
                                      activation=activation)
        else:
            feature_layer = LayerType(n_in=n_e if i == 0 else n_d,
                                      n_out=n_d,
                                      activation=activation,
                                      order=args.order,
                                      mode=args.mode,
                                      has_outgate=args.outgate)
        layers.append(feature_layer)

    # feature computation starts here

    # (len*batch)*n_e
    xt = embedding_layer.forward(idts.ravel())
    if weights is not None:
        xt_w = weights[idts.ravel()].dimshuffle((0, 'x'))
        xt = xt * xt_w

    # len*batch*n_e
    xt = xt.reshape((idts.shape[0], idts.shape[1], n_e))
    xt = apply_dropout(xt, dropout)

    # (len*batch)*n_e
    xb = embedding_layer.forward(idbs.ravel())
    if weights is not None:
        xb_w = weights[idbs.ravel()].dimshuffle((0, 'x'))
        xb = xb * xb_w

    # len*batch*n_e
    xb = xb.reshape((idbs.shape[0], idbs.shape[1], n_e))
    xb = apply_dropout(xb, dropout)

    prev_ht = self.xt = xt
    prev_hb = self.xb = xb
    for i in range(depth):
        # len*batch*n_d
        ht = layers[i].forward_all(prev_ht)
        hb = layers[i].forward_all(prev_hb)
        prev_ht = ht
        prev_hb = hb

    # normalize vectors
    if args.normalize:
        ht = self.normalize_3d(ht)
        hb = self.normalize_3d(hb)
        say("h_title dtype: {}\n".format(ht.dtype))

    self.ht = ht
    self.hb = hb

    # average over length, ignore paddings
    # batch * d
    if args.average:
        ht = self.average_without_padding(ht, idts)
        hb = self.average_without_padding(hb, idbs)
    else:
        ht = ht[-1]
        hb = hb[-1]
    say("h_avg_title dtype: {}\n".format(ht.dtype))

    # batch * d
    h_final = (ht + hb) * 0.5
    h_final = apply_dropout(h_final, dropout)
    h_final = self.normalize_2d(h_final)
    self.h_final = h_final
    say("h_final dtype: {}\n".format(h_final.dtype))

    # For testing:
    # first one in batch is query, the rest are candidate questions
    self.scores = T.dot(h_final[1:], h_final[0])

    # For training:
    xp = h_final[idps.ravel()]
    xp = xp.reshape((idps.shape[0], idps.shape[1], n_d))
    # num query * n_d
    query_vecs = xp[:, 0, :]
    # num query
    pos_scores = T.sum(query_vecs * xp[:, 1, :], axis=1)
    # num query * candidate size
    neg_scores = T.sum(query_vecs.dimshuffle((0, 'x', 1)) * xp[:, 2:, :],
                       axis=2)
    # num query
    neg_scores = T.max(neg_scores, axis=1)
    diff = neg_scores - pos_scores + 1.0
    loss = T.mean((diff > 0) * diff)
    self.loss = loss

    params = []
    for l in self.layers:
        params += l.params
    self.params = params
    say("num of parameters: {}\n".format(
        sum(len(x.get_value(borrow=True).ravel()) for x in params)))

    l2_reg = None
    for p in params:
        if l2_reg is None:
            l2_reg = p.norm(2)
        else:
            l2_reg = l2_reg + p.norm(2)
    l2_reg = l2_reg * args.l2_reg
    self.cost = self.loss + l2_reg
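# --- Illustration (not part of the model) -------------------------------
# A numpy sketch of the max-margin criterion built above: for each query,
# score the positive and all negative candidates against the (already
# normalized) query vector, keep the hardest negative, and apply a margin
# of 1.0. The random vectors below are stand-ins for `xp`.
import numpy as np

def max_margin_demo():
    rng = np.random.RandomState(0)
    n_queries, n_neg, n_d = 3, 5, 8
    # xp: num query * candidate size * n_d; column 0 = query,
    # column 1 = positive, columns 2: = negatives (as indexed by `idps`)
    xp = rng.randn(n_queries, 2 + n_neg, n_d).astype("float32")
    xp /= np.linalg.norm(xp, axis=2, keepdims=True)  # unit vectors

    query_vecs = xp[:, 0, :]
    pos_scores = np.sum(query_vecs * xp[:, 1, :], axis=1)
    neg_scores = np.sum(query_vecs[:, None, :] * xp[:, 2:, :], axis=2)
    neg_scores = neg_scores.max(axis=1)          # hardest negative

    diff = neg_scores - pos_scores + 1.0         # margin of 1.0
    loss = np.mean((diff > 0) * diff)            # hinge: only violations count
    print("hinge loss: {:.3f}".format(loss))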
def ready(self):
    generator = self.generator
    args = self.args
    weights = self.weights

    dropout = generator.dropout

    # len(text) * batch
    idts = generator.x
    z = generator.z_pred
    z = z.dimshuffle((0, 1, "x"))

    # batch * 2
    pairs = self.pairs = T.imatrix()

    # num pairs * 3, or num queries * candidate size
    triples = self.triples = T.imatrix()

    embedding_layer = self.embedding_layer
    activation = get_activation_by_name(args.activation)

    n_d = self.n_d = args.hidden_dim
    n_e = self.n_e = embedding_layer.n_d

    if args.layer.lower() == "rcnn":
        LayerType = RCNN
        LayerType2 = ExtRCNN
    elif args.layer.lower() == "lstm":
        LayerType = LSTM
        LayerType2 = ExtLSTM
    #elif args.layer.lower() == "gru":
    #    LayerType = GRU

    depth = self.depth = args.depth
    layers = self.layers = []
    for i in range(depth):
        if LayerType != RCNN:
            feature_layer = LayerType(n_in=n_e if i == 0 else n_d,
                                      n_out=n_d,
                                      activation=activation)
        else:
            feature_layer = LayerType(n_in=n_e if i == 0 else n_d,
                                      n_out=n_d,
                                      activation=activation,
                                      order=args.order,
                                      mode=args.mode,
                                      has_outgate=args.outgate)
        layers.append(feature_layer)

    extlayers = self.extlayers = []
    for i in range(depth):
        if LayerType != RCNN:
            feature_layer = LayerType2(n_in=n_e if i == 0 else n_d,
                                       n_out=n_d,
                                       activation=activation)
        else:
            feature_layer = LayerType2(n_in=n_e if i == 0 else n_d,
                                       n_out=n_d,
                                       activation=activation,
                                       order=args.order,
                                       mode=args.mode,
                                       has_outgate=args.outgate)
        feature_layer.copy_params(layers[i])
        extlayers.append(feature_layer)

    # feature computation starts here
    xt = generator.word_embs

    # encode full text into representation
    prev_ht = self.xt = xt
    for i in range(depth):
        # len*batch*n_d
        ht = layers[i].forward_all(prev_ht)
        prev_ht = ht

    # encode selected text into representation
    prev_htz = xt
    for i in range(depth):
        # len*batch*n_d
        htz = extlayers[i].forward_all(prev_htz, z)
        prev_htz = htz

    # normalize vectors
    if args.normalize:
        ht = self.normalize_3d(ht)
        htz = self.normalize_3d(htz)
        say("h_title dtype: {}\n".format(ht.dtype))

    self.ht = ht
    self.htz = htz

    # average over length, ignore paddings
    # batch * d
    if args.average:
        ht = self.average_without_padding(ht, idts)
        htz = self.average_without_padding(htz, idts, z)
    else:
        ht = ht[-1]
        htz = htz[-1]
    say("h_avg_title dtype: {}\n".format(ht.dtype))

    # batch * d
    h_final = apply_dropout(ht, dropout)
    h_final = self.normalize_2d(h_final)
    hz_final = apply_dropout(htz, dropout)
    hz_final = self.normalize_2d(hz_final)
    self.h_final = h_final
    self.hz_final = hz_final
    say("h_final dtype: {}\n".format(h_final.dtype))

    # For testing:
    # first one in batch is query, the rest are candidate questions
    self.scores = T.dot(h_final[1:], h_final[0])
    self.scores_z = T.dot(hz_final[1:], hz_final[0])

    # For training encoder:
    xp = h_final[triples.ravel()]
    xp = xp.reshape((triples.shape[0], triples.shape[1], n_d))
    # num query * n_d
    query_vecs = xp[:, 0, :]
    # num query
    pos_scores = T.sum(query_vecs * xp[:, 1, :], axis=1)
    # num query * candidate size
    neg_scores = T.sum(query_vecs.dimshuffle((0, 'x', 1)) * xp[:, 2:, :],
                       axis=2)
    # num query
    neg_scores = T.max(neg_scores, axis=1)
    diff = neg_scores - pos_scores + 1.0
    hinge_loss = T.mean((diff > 0) * diff)

    # For training generator
    # batch
    self_cosine_distance = 1.0 - T.sum(hz_final * h_final, axis=1)
    pair_cosine_distance = 1.0 - T.sum(hz_final * h_final[pairs[:, 1]],
                                       axis=1)
    alpha = args.alpha
    loss_vec = self_cosine_distance * alpha + \
               pair_cosine_distance * (1 - alpha)
    #loss_vec = self_cosine_distance*0.2 + pair_cosine_distance*0.8

    zsum = generator.zsum
    zdiff = generator.zdiff
    logpz = generator.logpz
    sfactor = args.sparsity
    cfactor = args.sparsity * args.coherent
    scost_vec = zsum * sfactor + zdiff * cfactor

    # batch
    cost_vec = loss_vec + scost_vec
    cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
    loss = self.loss = T.mean(loss_vec)
    sparsity_cost = self.sparsity_cost = T.mean(scost_vec)
    self.obj = loss + sparsity_cost

    params = []
    for l in self.layers:
        params += l.params
    self.params = params
    say("num of parameters: {}\n".format(
        sum(len(x.get_value(borrow=True).ravel()) for x in params)))

    l2_reg = None
    for p in params:
        if l2_reg is None:
            l2_reg = T.sum(p ** 2)  # p.norm(2)
        else:
            l2_reg = l2_reg + T.sum(p ** 2)  # p.norm(2)
    l2_reg = l2_reg * args.l2_reg
    self.l2_cost = l2_reg

    beta = args.beta
    self.cost_g = cost_logpz + generator.l2_cost
    self.cost_e = hinge_loss + loss * beta + l2_reg
    print "cost dtype", self.cost_g.dtype, self.cost_e.dtype
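# --- Illustration (not part of the model) -------------------------------
# Because z is sampled and cut out of the gradient path (disconnected_grad
# in the generator), the generator is trained with a score-function
# (REINFORCE-style) estimator: the per-example cost acts as a reward
# weight on log p(z | x). A numpy sketch of `cost_g`'s main term, with
# made-up values for the per-example quantities:
import numpy as np

def generator_cost_demo():
    loss_vec = np.array([0.4, 0.9])        # per-example encoder loss
    zsum = np.array([3.0, 6.0])            # words selected per example
    zdiff = np.array([2.0, 4.0])           # selection transitions
    logpz_sum = np.array([-2.1, -3.5])     # sum_t log p(z_t | x)
    sparsity, coherent = 0.01, 2.0         # hypothetical hyperparameters

    sfactor = sparsity
    cfactor = sparsity * coherent
    scost_vec = zsum * sfactor + zdiff * cfactor
    cost_vec = loss_vec + scost_vec
    # expectation of cost * log-prob, averaged over the batch
    cost_logpz = np.mean(cost_vec * logpz_sum)
    print("sparsity cost: {:.3f}".format(np.mean(scost_vec)))
    print("cost_logpz:    {:.3f}".format(cost_logpz))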