def main():
    print args
    assert args.embedding, "Pre-trained word embeddings required."
    embedding_layer = myio.create_embedding_layer(args.embedding)

    max_len = args.max_len

    if args.train:
        train_x, train_y = myio.read_annotations(args.train)
        train_x = [embedding_layer.map_to_ids(x)[:max_len] for x in train_x]

    if args.dev:
        dev_x, dev_y = myio.read_annotations(args.dev)
        dev_x = [embedding_layer.map_to_ids(x)[:max_len] for x in dev_x]

    if args.test:
        test_x, test_y = myio.read_annotations(args.test)
        test_x = [embedding_layer.map_to_ids(x)[:max_len] for x in test_x]

    if args.train:
        model = Model(args=args,
                      embedding_layer=embedding_layer,
                      nclasses=len(train_y[0]))
        model.ready()

        #debug_func2 = theano.function(
        #        inputs = [ model.x, model.z ],
        #        outputs = model.generator.logpz
        #    )
        #theano.printing.debugprint(debug_func2)
        #return

        model.train((train_x, train_y),
                    (dev_x, dev_y) if args.dev else None,
                    (test_x, test_y) if args.test else None)
def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=args.hidden_dim,
        embs=load_embedding_iterator(args.embeddings)
        if args.embeddings else None)
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer,
                                 max_len=args.max_seq_len)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                 len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev_raw = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev_raw, padding_id,
                                       pad_left=not args.average,
                                       merge=args.merge)
    if args.test:
        test_raw = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test_raw, padding_id,
                                        pad_left=not args.average,
                                        merge=args.merge)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(ids_corpus, train,
                                            args.batch_size, padding_id,
                                            pad_left=not args.average,
                                            merge=args.merge)
        say("{} to create batches\n".format(time.time() - start_time))
        say("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(x[0].ravel()) for x in train_batches),
            sum(len(x[1].ravel()) for x in train_batches)))
        train_batches = None

        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)
        model.ready()

        # set parameters using pre-trained network
        if args.load_pretrain:
            model.encoder.load_pretrained_parameters(args)

        model.train(ids_corpus, train,
                    (dev, dev_raw) if args.dev else None,
                    (test, test_raw) if args.test else None)
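# For reference, a hedged sketch of the argparse namespace this main(args)
# consumes; the flag names come from the attributes used above, while types,
# defaults, and the helper name are illustrative assumptions.
import argparse

def make_argparser():
    p = argparse.ArgumentParser()
    p.add_argument("--corpus", type=str, default="")       # raw corpus path
    p.add_argument("--train", type=str, default="")        # annotation files
    p.add_argument("--dev", type=str, default="")
    p.add_argument("--test", type=str, default="")
    p.add_argument("--embeddings", type=str, default="")   # pre-trained vectors
    p.add_argument("--hidden_dim", type=int, default=200)
    p.add_argument("--max_seq_len", type=int, default=100)
    p.add_argument("--batch_size", type=int, default=40)
    p.add_argument("--average", type=int, default=0)       # pad_left = not average
    p.add_argument("--merge", type=int, default=0)
    p.add_argument("--reweight", type=int, default=0)      # IDF weighting
    p.add_argument("--load_pretrain", type=str, default="")
    return p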
def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=args.hidden_dim,
        cut_off=args.cut_off,
        embs=load_embedding_iterator(args.embeddings)
        if args.embeddings else None)
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                 len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]
    bos_id = embedding_layer.vocab_map["<s>"]
    eos_id = embedding_layer.vocab_map["</s>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev = myio.read_annotations(args.dev, K_neg=20, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=20, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id)

    if args.heldout:
        with open(args.heldout) as fin:
            heldout_ids = fin.read().split()
        heldout_corpus = dict((id, ids_corpus[id]) for id in heldout_ids
                              if id in ids_corpus)
        train_corpus = dict((id, ids_corpus[id]) for id in ids_corpus
                            if id not in heldout_corpus)
        heldout = myio.create_batches(heldout_corpus, [], args.batch_size,
                                      padding_id, bos_id, eos_id,
                                      auto_encode=True)
        heldout = [myio.create_one_batch(b1, t2, padding_id)
                   for t1, b1, t2 in heldout]
        say("heldout examples={}\n".format(len(heldout_corpus)))

    if args.train:
        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)

        start_time = time.time()
        train = myio.read_annotations(args.train)
        if not args.use_anno:
            train = []
        train_batches = myio.create_batches(ids_corpus, train,
                                            args.batch_size,
                                            model.padding_id, model.bos_id,
                                            model.eos_id, auto_encode=True)
        say("{} to create batches\n".format(time.time() - start_time))

        model.ready()
        model.train(ids_corpus if not args.heldout else train_corpus,
                    train,
                    dev if args.dev else None,
                    test if args.test else None,
                    heldout if args.heldout else None)
def test_basic():
    # code adapted from Tao's `rationale.py`:
    train = 'data/reviews.aspect1.train.txt.gz'
    train_x, train_y = myio.read_annotations(train)
    # train_x = [embedding_layer.map_to_ids(x)[:max_len] for x in train_x]

    dev = 'data/reviews.aspect1.heldout.txt.gz'
    dev_x, dev_y = myio.read_annotations(dev)
    # dev_x = [embedding_layer.map_to_ids(x)[:max_len] for x in dev_x]

    load_rationale = 'data/annotations.json'
    rationale_data = myio.read_rationales(load_rationale)
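    # The loads above are unchecked; a hedged sketch of structural assertions
    # that could follow, assuming read_annotations returns parallel lists and
    # read_rationales returns dicts with an "x" token list (as the mains in
    # this file use them):
    assert len(train_x) == len(train_y)
    assert len(dev_x) == len(dev_y)
    assert all(isinstance(u["x"], list) for u in rationale_data)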
def main():
    print args
    embedding_layer = None
    if args.embedding:
        assert args.embedding, "Pre-trained word embeddings required."
        embedding_layer = myio.create_embedding_layer(args.embedding)

    max_len = args.max_len

    if args.train:
        train_x, train_y = myio.read_annotations(args.train)
        train_words = set([word for x in train_x for word in x])
        embedding_layer = EmbeddingLayer(n_d=args.hidden_dimension,
                                         vocab=["<unk>", "<padding>"] +
                                         list(train_words),
                                         oov="<unk>",
                                         fix_init_embs=False)
        train_x = [embedding_layer.map_to_ids(x)[:max_len] for x in train_x]

    if args.dev:
        dev_x, dev_y = myio.read_annotations(args.dev)
        dev_x = [embedding_layer.map_to_ids(x)[:max_len] for x in dev_x]

    if args.load_rationale:
        rationale_data = myio.read_rationales(args.load_rationale)
        for x in rationale_data:
            x["xids"] = embedding_layer.map_to_ids(x["x"])

    if args.train:
        model = Model(args=args,
                      embedding_layer=embedding_layer,
                      nclasses=len(train_y[0]))
        model.ready()

        #debug_func2 = theano.function(
        #        inputs = [ model.x, model.z ],
        #        outputs = model.generator.logpz
        #    )
        #theano.printing.debugprint(debug_func2)
        #return

        model.train(
            (train_x, train_y),
            (dev_x, dev_y) if args.dev else None,
            None,  #(test_x, test_y),
            rationale_data if args.load_rationale else None)
def run(in_train_file, out_train_file_embedded, word_vectors, max_len):
    x, y = myio.read_annotations(in_train_file)
    print('len(x)', len(x))

    # build the vocabulary, reserving ids 0/1 for <pad> and <unk>
    idx_by_word = {}
    words = []
    words.append('<pad>')
    words.append('<unk>')
    idx_by_word['<pad>'] = 0
    idx_by_word['<unk>'] = 1
    for n, ex in enumerate(x):
        for word in ex[:max_len]:
            if word not in idx_by_word:
                idx_by_word[word] = len(idx_by_word)
                words.append(word)
    V = len(words)

    # collect pre-trained vectors for in-vocabulary words
    it = myio.load_embedding_iterator(word_vectors)
    embedding_vals = [None for i in range(V)]
    for word, vals in it:
        if word in idx_by_word:
            idx = idx_by_word[word]
            nd = len(vals)
            embedding_vals[idx] = vals
    # `nd` comes from the last matching vector; this assumes at least one
    # vocabulary word appears in the embedding file
    embedding = torch.zeros(V, nd)
    # add unk and pad
    # well, pad is easy, so add unk
    # well... lets leave it for the trainer to do this
    for i, vals in enumerate(embedding_vals):
        if vals is not None:
            embedding[i] = vals

    # map each example to a LongTensor of token ids
    x_idxes = []
    unk_idx = idx_by_word['<unk>']
    for n, ex in enumerate(x):
        num_words = len(ex[:max_len])
        idxes = torch.LongTensor(num_words)
        idxes.fill_(0)
        for i, word in enumerate(ex[:max_len]):
            if word in idx_by_word:
                idx = idx_by_word[word]
            else:
                idx = unk_idx
            idxes[i] = idx
        # print('idxes.shape', idxes.shape)
        x_idxes.append(idxes)

    d = {
        'embedding': embedding,
        'idx_by_word': idx_by_word,
        'words': words,
        'x': x,
        'y': y,
        'x_idxes': x_idxes
    }
    with open(out_train_file_embedded, 'wb') as f:
        pickle.dump(d, f)
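# Hedged usage sketch for run(); the file names below are illustrative
# assumptions, not paths from the source. Reading the pickle back mirrors
# the dict written above.
def _demo_run():
    run('data/reviews.aspect1.train.txt.gz', 'train_embedded.pkl',
        'glove.6B.200d.txt', max_len=256)
    with open('train_embedded.pkl', 'rb') as f:
        d = pickle.load(f)
    print('V x n_d:', tuple(d['embedding'].shape))  # pre-trained matrix
    print('examples:', len(d['x_idxes']))           # LongTensor id sequences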
def main():
    print args
    assert args.embedding, "Pre-trained word embeddings required."
    embedding_layer = myio.create_embedding_layer(args.embedding)

    max_len = args.max_len

    if args.train:
        train_x, train_y = myio.read_annotations(args.train)
        train_x = [embedding_layer.map_to_ids(x)[:max_len] for x in train_x]

    if args.dev:
        dev_x, dev_y = myio.read_annotations(args.dev)
        dev_x = [embedding_layer.map_to_ids(x)[:max_len] for x in dev_x]

    if args.load_rationale:
        rationale_data = myio.read_rationales(args.load_rationale)
        for x in rationale_data:
            x["xids"] = embedding_layer.map_to_ids(x["x"])

    if args.train:
        model = Model(args=args,
                      embedding_layer=embedding_layer,
                      nclasses=len(train_y[0]))
        model.ready()

        #debug_func2 = theano.function(
        #        inputs = [ model.x, model.z ],
        #        outputs = model.generator.logpz
        #    )
        #theano.printing.debugprint(debug_func2)
        #return

        model.train(
            (train_x, train_y),
            (dev_x, dev_y) if args.dev else None,
            None,  #(test_x, test_y),
            rationale_data if args.load_rationale else None)
def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    print("raw corpus:", args.corpus, "len:", len(raw_corpus))
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=args.hidden_dim,
        cut_off=args.cut_off,
        embs=None
        # embs=load_embedding_iterator(args.embeddings) if args.embeddings else None
    )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer,
                                 max_len=args.max_seq_len)
    myio.say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                      len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    # if args.dev:
    #     dev = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
    #     dev = myio.create_eval_batches(ids_corpus, dev, padding_id,
    #                                    pad_left=not args.average)
    # if args.test:
    #     test = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
    #     test = myio.create_eval_batches(ids_corpus, test, padding_id,
    #                                     pad_left=not args.average)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(args.train)
        print("training data:", args.train, "len:", len(train))
        train_batches = myio.create_batches(ids_corpus, train,
                                            args.batch_size, padding_id,
                                            pad_left=not args.average)
        myio.say("{:.2f} secs to create {} batches of size {}\n".format(
            (time.time() - start_time), len(train_batches), args.batch_size))
        myio.say("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(x[0].ravel()) + len(x[1].ravel()) for x in train_batches),
            sum(len(x[2].ravel()) for x in train_batches)))
        # train_batches = None

        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)
        model.ready()

        # # set parameters using pre-trained network
        # if args.load_pretrain:
        #     model.load_pretrained_parameters(args)

        model.train(ids_corpus, train,
                    dev=None,   # dev if args.dev else None,
                    test=None)  # test if args.test else None
def train(self, ids_corpus, train, dev=None, test=None):
    dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)
    batch_size = args.batch_size
    padding_id = self.padding_id

    #train_batches = myio.create_batches(ids_corpus, train, batch_size, padding_id)

    updates, lr, gnorm = create_optimization_updates(
        cost=self.cost,
        params=self.params,
        lr=args.learning_rate,
        method=args.learning)[:3]

    train_func = theano.function(inputs=[self.idts, self.idbs, self.idps],
                                 outputs=[self.cost, self.loss, gnorm],
                                 updates=updates)

    eval_func = theano.function(inputs=[self.idts, self.idbs],
                                outputs=self.scores,
                                on_unused_input='ignore')

    say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

    result_table = PrettyTable(
        ["Epoch", "dev MAP", "dev MRR", "dev P@1", "dev P@5"] +
        ["tst MAP", "tst MRR", "tst P@1", "tst P@5"])

    unchanged = 0
    best_dev = -1
    dev_MAP = dev_MRR = dev_P1 = dev_P5 = 0
    test_MAP = test_MRR = test_P1 = test_P5 = 0
    start_time = 0
    max_epoch = args.max_epoch
    for epoch in xrange(max_epoch):
        unchanged += 1
        if unchanged > 15:
            break

        start_time = time.time()

        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(ids_corpus, train, batch_size,
                                            padding_id,
                                            pad_left=not args.average)
        N = len(train_batches)

        train_loss = 0.0
        train_cost = 0.0

        for i in xrange(N):
            # get current batch
            idts, idbs, idps = train_batches[i]

            cur_cost, cur_loss, grad_norm = train_func(idts, idbs, idps)
            train_loss += cur_loss
            train_cost += cur_cost
            if i % 10 == 0:
                say("\r{}/{}".format(i, N))

            if i == N - 1:
                self.dropout.set_value(0.0)

                if dev is not None:
                    dev_MAP, dev_MRR, dev_P1, dev_P5 = self.evaluate(
                        dev, eval_func)
                if test is not None:
                    test_MAP, test_MRR, test_P1, test_P5 = self.evaluate(
                        test, eval_func)

                if dev_MRR > best_dev:
                    unchanged = 0
                    best_dev = dev_MRR
                    result_table.add_row([epoch] + [
                        "%.2f" % x
                        for x in [dev_MAP, dev_MRR, dev_P1, dev_P5] +
                        [test_MAP, test_MRR, test_P1, test_P5]
                    ])
                    if args.save_model:
                        self.save_model(args.save_model)

                dropout_p = np.float64(args.dropout).astype(
                    theano.config.floatX)
                self.dropout.set_value(dropout_p)

                say("\r\n\n")
                say(("Epoch {}\tcost={:.3f}\tloss={:.3f}"
                     "\tMRR={:.2f},{:.2f}\t|g|={:.3f}\t[{:.3f}m]\n").format(
                         epoch,
                         train_cost / (i + 1),
                         train_loss / (i + 1),
                         dev_MRR,
                         best_dev,
                         float(grad_norm),
                         (time.time() - start_time) / 60.0))
                say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

                say("\n")
                say("{}".format(result_table))
                say("\n")
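# A self-contained sketch of the ranking metrics reported above (MAP, MRR,
# P@1, P@5). It assumes each evaluation example reduces to a list of 0/1
# relevance labels sorted by descending model score, and divides P@5 by 5
# (one common convention); `ranking_metrics` is illustrative, not part of
# myio or the model.
def ranking_metrics(sorted_labels_per_query):
    MAP = MRR = P1 = P5 = 0.0
    for labels in sorted_labels_per_query:
        # average precision: mean of precision@k over relevant positions
        hits, prec_sum = 0, 0.0
        for k, l in enumerate(labels, 1):
            if l == 1:
                hits += 1
                prec_sum += hits / float(k)
        MAP += prec_sum / max(hits, 1)
        # reciprocal rank of the first relevant item
        for k, l in enumerate(labels, 1):
            if l == 1:
                MRR += 1.0 / k
                break
        P1 += 1.0 if labels[:1] == [1] else 0.0
        P5 += sum(labels[:5]) / 5.0
    n = float(len(sorted_labels_per_query))
    return MAP / n, MRR / n, P1 / n, P5 / n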
def main():
    print 'Starting at: {}\n'.format(datetime.now())
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = create_embedding_layer(
        n_d=200,
        embs=load_embedding_iterator(args.embeddings),
        only_words=False if args.use_embeddings else True,
        trainable=args.trainable)
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer,
                                 max_len=args.max_seq_len)
    print("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                   len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.layer.lower() == "lstm":
        from models import LstmQR as Model
    elif args.layer.lower() in ["bilstm", "bigru"]:
        from models import BiRNNQR as Model
    elif args.layer.lower() == "cnn":
        from models import CnnQR as Model
    elif args.layer.lower() == "gru":
        from models import GruQR as Model
    else:
        raise Exception("no correct layer given")

    if args.dev:
        dev = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id,
                                       pad_left=False)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id,
                                        pad_left=False)

    model = Model(args, embedding_layer,
                  weights=weights if args.reweight else None)
    model.ready()

    print 'total (non) trainable params: ', model.num_parameters()

    if args.load_pre_trained_part:
        # need to remove the old assigns to embeddings
        model.init_assign_ops = model.load_pre_trained_part(
            args.load_pre_trained_part)
    print '\nmodel init_assign_ops: {}\n'.format(model.init_assign_ops)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(
            ids_corpus, train, args.batch_size, padding_id, pad_left=False)
        print("{} to create batches\n".format(time.time() - start_time))
        print("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(x[0].ravel()) + len(x[1].ravel()) for x in train_batches),
            sum(len(x[2].ravel()) for x in train_batches)))

        model.train_model(ids_corpus, train,
                          dev=dev if args.dev else None,
                          test=test if args.test else None)

    print '\nEnded at: {}'.format(datetime.now())
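# The if/elif chain above is a fixed lookup; an equivalent dict-based sketch
# (class names are the ones imported above; the helper name is ours):
import importlib

_LAYER_TO_CLASS = {"lstm": "LstmQR", "bilstm": "BiRNNQR", "bigru": "BiRNNQR",
                   "cnn": "CnnQR", "gru": "GruQR"}

def pick_model_class(layer):
    try:
        return getattr(importlib.import_module("models"),
                       _LAYER_TO_CLASS[layer.lower()])
    except KeyError:
        raise Exception("no correct layer given")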
def main(args):
    raw_corpus = myio.read_corpus(args.corpus, args.translations or None,
                                  args.translatable_ids or None,
                                  args.generated_questions_train or None)
    generated_questions_eval = myio.read_generated_questions(
        args.generated_questions)

    embedding_layer = None
    if args.trainable_embeddings == 1:
        embedding_layer = myio.create_embedding_layer(
            raw_corpus,
            n_d=args.hidden_dim,
            cut_off=args.cut_off,
            embs=load_embedding_iterator(args.embeddings)
            if args.embeddings else None,
            fix_init_embs=False)
    else:
        embedding_layer = myio.create_embedding_layer(
            raw_corpus,
            n_d=args.hidden_dim,
            cut_off=args.cut_off,
            embs=load_embedding_iterator(args.embeddings)
            if args.embeddings else None)
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer,
                                 max_len=args.max_seq_len,
                                 generated_questions=generated_questions_eval)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                 len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        # dev = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.read_annotations(args.dev, K_neg=args.dev_pool_size,
                                    prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id,
                                       pad_left=not args.average)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id,
                                        pad_left=not args.average)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(
            args.train, training_data_percent=args.training_data_percent)
        train_batches = myio.create_batches(
            ids_corpus, train, args.batch_size, padding_id,
            pad_left=not args.average, include_generated_questions=True)
        say("{} to create batches\n".format(time.time() - start_time))
        say("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(x[0].ravel()) + len(x[1].ravel()) for x in train_batches),
            sum(len(x[2].ravel()) for x in train_batches)))
        train_batches = None

    model = Model(args, embedding_layer,
                  weights=weights if args.reweight else None)
    # print('args.average: '+args.average)
    model.ready()

    # set parameters using the pre-trained network
    if args.do_train == 1:
        if args.load_pretrain:
            model.load_pretrained_parameters(args)
        model.train(ids_corpus, train,
                    dev if args.dev else None,
                    test if args.test else None)

    # average the predictions obtained by running the model 10 times
    if args.do_evaluate == 1:
        model.load_pretrained_parameters(args)
        # model.set_model(model.load_model(args.load_pretrain))
        for i in range(1):
            r = model.just_eval(dev if args.dev else None,
                                test if args.test else None)

    # analyze the results
    if len(args.analyze_file.strip()) > 0:
        model.load_pretrained_parameters(args)
        file_name = args.analyze_file.strip()  # 'AskUbuntu.Rcnn_analysis3.gt(es)-gt.txt'
        model.analyze(file_name, embedding_layer, dev)
def main():
    print args
    set_default_rng_seed(args.seed)
    assert args.embedding, "Pre-trained word embeddings required."
    embedding_layer = myio.create_embedding_layer(args.embedding)

    max_len = args.max_len

    if args.train:
        train_x, train_y = myio.read_annotations(args.train)
        if args.debug:
            len_ = int(len(train_x) * args.debug)
            train_x = train_x[:len_]
            train_y = train_y[:len_]
        print 'train size: ', len(train_x)  #, train_x[0], len(train_x[0])
        #exit()
        train_x = [embedding_layer.map_to_ids(x)[:max_len] for x in train_x]

    if args.dev:
        dev_x, dev_y = myio.read_annotations(args.dev)
        if args.debug:
            len_ = int(len(dev_x) * args.debug)
            dev_x = dev_x[:len_]
            dev_y = dev_y[:len_]  # was `dev_x = dev_y[:len_]`, which clobbered the inputs
        print 'dev size: ', len(dev_x)
        dev_x = [embedding_layer.map_to_ids(x)[:max_len] for x in dev_x]

    rationale_data = None
    if args.load_rationale:
        rationale_data = myio.read_rationales(args.load_rationale)
        for x in rationale_data:
            x["xids"] = embedding_layer.map_to_ids(x["x"])

    #print 'in main: ', args.seed
    if args.train:
        model = Model(args=args,
                      embedding_layer=embedding_layer,
                      nclasses=len(train_y[0]))
        if args.load_model:
            model.load_model(args.load_model, seed=args.seed,
                             select_all=args.select_all)
            say("model loaded successfully.\n")
        else:
            model.ready()
        #say(" ready time needed {} \n".format(time.time()-start_ready_time))

        #debug_func2 = theano.function(
        #        inputs = [ model.x, model.z ],
        #        outputs = model.generator.logpz
        #    )
        #theano.printing.debugprint(debug_func2)
        #return

        model.train(
            (train_x, train_y),
            (dev_x, dev_y) if args.dev else None,
            None,  #(test_x, test_y),
            rationale_data if args.load_rationale else None,
            trained_max_epochs=args.trained_max_epochs)

    if args.load_model and not args.dev and not args.train:
        model = Model(args=args, embedding_layer=embedding_layer, nclasses=-1)
        model.load_model(args.load_model, seed=args.seed,
                         select_all=args.select_all)
        say("model loaded successfully.\n")

        sample_generator = theano.function(
            inputs=[model.x],
            outputs=model.z,
            #updates=model.generator.sample_updates
        )
        sample_encoder = theano.function(
            inputs=[model.x, model.y, model.z],
            outputs=[model.encoder.obj, model.encoder.loss,
                     model.encoder.pred_diff],
            #updates=model.generator.sample_updates
        )
        # compile an evaluation function
        eval_func = theano.function(
            inputs=[model.x, model.y],
            outputs=[model.z, model.encoder.obj, model.encoder.loss,
                     model.encoder.pred_diff],
            #updates=model.generator.sample_updates
        )
        debug_func_enc = theano.function(
            inputs=[model.x, model.y],
            outputs=[model.z, model.encoder.obj, model.encoder.loss,
                     model.encoder.pred_diff],
            #updates=model.generator.sample_updates
        )
        debug_func_gen = theano.function(
            inputs=[model.x, model.y],
            outputs=[model.z, model.encoder.obj, model.encoder.loss,
                     model.encoder.pred_diff],
            #updates=model.generator.sample_updates
        )
        # compile a predictor function
        pred_func = theano.function(
            inputs=[model.x],
            outputs=[model.z, model.encoder.preds],
            #updates=model.generator.sample_updates
        )

        # batching data
        padding_id = embedding_layer.vocab_map["<padding>"]
        if rationale_data is not None:
            valid_batches_x, valid_batches_y = myio.create_batches(
                [u["xids"] for u in rationale_data],
                [u["y"] for u in rationale_data],
                args.batch, padding_id, sort=False)

        # disable dropout
        model.dropout.set_value(0.0)
        if rationale_data is not None:
            #model.dropout.set_value(0.0)
            start_rational_time = time.time()
            r_mse, r_p1, r_prec1, r_prec2, gen_time, enc_time, prec_cal_time = \
                model.evaluate_rationale(rationale_data, valid_batches_x,
                                         valid_batches_y, sample_generator,
                                         sample_encoder, eval_func)
            #valid_batches_y, eval_func)
            #model.dropout.set_value(dropout_prob)
            #say(("\ttest rationale mser={:.4f} p[1]r={:.2f} prec1={:.4f}" +
            #     " prec2={:.4f} generator time={:.4f} encoder time={:.4f}" +
            #     " total test time={:.4f}\n").format(
            #         r_mse, r_p1, r_prec1, r_prec2, gen_time, enc_time,
            #         time.time() - start_rational_time))

            data = "\t".join([
                '%.5f' % r_mse,
                '%4.2f' % r_p1,
                '%4.4f' % r_prec1,
                '%4.4f' % r_prec2,
                '%4.2f' % gen_time,
                '%4.2f' % enc_time,
                '%4.2f' % prec_cal_time,
                '%4.2f' % (time.time() - start_rational_time),
                str(args.sparsity),
                str(args.coherent),
                str(args.max_epochs),
                str(args.cur_epoch),
            ])
            with open(args.graph_data_path, 'a') as g_f:
                print 'writing to file: ', data
                g_f.write(data + "\n")
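# The row appended to args.graph_data_path above is tab-separated; a hedged
# reader sketch, with column names inferred from the write order above
# (the helper and constant names are ours):
GRAPH_COLUMNS = ["mse", "p1", "prec1", "prec2", "gen_time", "enc_time",
                 "prec_cal_time", "total_time", "sparsity", "coherent",
                 "max_epochs", "cur_epoch"]

def read_graph_data(path):
    rows = []
    with open(path) as f:
        for line in f:
            rows.append(dict(zip(GRAPH_COLUMNS,
                                 line.rstrip("\n").split("\t"))))
    return rows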
def train(self, ids_corpus, train, dev=None, test=None):
    dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)
    batch_size = args.batch_size
    padding_id = self.padding_id

    #train_batches = myio.create_batches(ids_corpus, train, batch_size, padding_id)

    if dev is not None:
        dev, dev_raw = dev
    if test is not None:
        test, test_raw = test

    if args.joint:
        updates_e, lr_e, gnorm_e = create_optimization_updates(
            cost=self.encoder.cost_e,  #self.encoder.cost,
            params=self.encoder.params,
            lr=args.learning_rate * 0.1,
            method=args.learning)[:3]
    else:
        updates_e = {}

    updates_g, lr_g, gnorm_g = create_optimization_updates(
        cost=self.encoder.cost_g,
        params=self.generator.params,
        lr=args.learning_rate,
        method=args.learning)[:3]

    train_func = theano.function(
        inputs=[self.x, self.triples, self.pairs],
        outputs=[self.encoder.obj, self.encoder.loss,
                 self.encoder.sparsity_cost, self.generator.p1, gnorm_g],
        # updates = updates_g.items() + updates_e.items() + self.generator.sample_updates,
        updates=collections.OrderedDict(
            list(updates_g.items()) + list(updates_e.items()) +
            list(self.generator.sample_updates.items())),
        #no_default_updates = True,
        on_unused_input="ignore")

    eval_func = theano.function(inputs=[self.x],
                                outputs=self.encoder.scores)

    eval_func2 = theano.function(
        inputs=[self.x],
        outputs=[self.encoder.scores_z, self.generator.p1, self.z],
        updates=self.generator.sample_updates,
        #no_default_updates = True
    )

    say("\tp_norm: {}\n".format(self.get_pnorm_stat(self.encoder.params)))
    say("\tp_norm: {}\n".format(self.get_pnorm_stat(self.generator.params)))

    result_table = PrettyTable(
        ["Epoch", "dev MAP", "dev MRR", "dev P@1", "dev P@5"] +
        ["tst MAP", "tst MRR", "tst P@1", "tst P@5"])

    last_train_avg_cost = None
    tolerance = 0.5 + 1e-3
    unchanged = 0
    best_dev = -1
    dev_MAP = dev_MRR = dev_P1 = dev_P5 = 0
    test_MAP = test_MRR = test_P1 = test_P5 = 0
    start_time = 0
    max_epoch = args.max_epoch
    for epoch in range(max_epoch):
        unchanged += 1
        if unchanged > 20:
            break

        start_time = time.time()

        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(ids_corpus, train, batch_size,
                                            padding_id,
                                            pad_left=not args.average,
                                            merge=args.merge)
        N = len(train_batches)

        more = True
        param_bak = [p.get_value(borrow=False) for p in self.params]

        while more:
            train_loss = 0.0
            train_cost = 0.0
            train_scost = 0.0
            train_p1 = 0.0

            for i in range(N):
                # get current batch
                idts, triples, pairs = train_batches[i]

                cur_cost, cur_loss, cur_scost, cur_p1, gnormg = train_func(
                    idts, triples, pairs)
                train_loss += cur_loss
                train_cost += cur_cost
                train_scost += cur_scost
                train_p1 += cur_p1
                if i % 10 == 0:
                    say("\r{}/{} {:.3f}".format(i, N, train_p1 / (i + 1)))

            cur_train_avg_cost = train_cost / N

            more = False
            if last_train_avg_cost is not None:
                if cur_train_avg_cost > last_train_avg_cost * (1 + tolerance):
                    more = True
                    say("\nTrain cost {} --> {}\n".format(
                        last_train_avg_cost, cur_train_avg_cost))

            if more:
                lr_val = lr_g.get_value() * 0.5
                if lr_val < 1e-5:
                    return
                lr_val = np.float64(lr_val).astype(theano.config.floatX)
                lr_g.set_value(lr_val)
                lr_e.set_value(lr_val)
                say("Decrease learning rate to {}\n".format(float(lr_val)))
                for p, v in zip(self.params, param_bak):
                    p.set_value(v)
                continue

            last_train_avg_cost = cur_train_avg_cost

            say("\r\n\n")
            say(("Epoch {} cost={:.3f} loss={:.3f} scost={:.3f}"
                 " P[1]={:.3f} |g|={:.3f}\t[{:.3f}m]\n").format(
                     epoch,
                     train_cost / N,
                     train_loss / N,
                     train_scost / N,
                     train_p1 / N,
                     float(gnormg),
                     (time.time() - start_time) / 60.0))
            say("\tp_norm: {}\n".format(
                self.get_pnorm_stat(self.encoder.params)))
            say("\tp_norm: {}\n".format(
                self.get_pnorm_stat(self.generator.params)))

            self.dropout.set_value(0.0)

            if dev is not None:
                full_MAP, full_MRR, full_P1, full_P5 = self.evaluate(
                    dev, eval_func)
                dev_MAP, dev_MRR, dev_P1, dev_P5, dev_PZ1, dev_PT = \
                    self.evaluate_z(dev, dev_raw, ids_corpus, eval_func2)
            if test is not None:
                test_MAP, test_MRR, test_P1, test_P5, test_PZ1, test_PT = \
                    self.evaluate_z(test, test_raw, ids_corpus, eval_func2)

            if dev_MAP > best_dev:
                best_dev = dev_MAP
                unchanged = 0

            say("\n")
            say("  fMAP={:.2f} fMRR={:.2f} fP1={:.2f} fP5={:.2f}\n".format(
                full_MAP, full_MRR, full_P1, full_P5))
            say("\n")
            say(("  dMAP={:.2f} dMRR={:.2f} dP1={:.2f} dP5={:.2f}" +
                 " dP[1]={:.3f} d%T={:.3f} best_dev={:.2f}\n").format(
                     dev_MAP, dev_MRR, dev_P1, dev_P5, dev_PZ1, dev_PT,
                     best_dev))

            result_table.add_row([epoch] + [
                "%.2f" % x
                for x in [dev_MAP, dev_MRR, dev_P1, dev_P5] +
                [test_MAP, test_MRR, test_P1, test_P5]
            ])

            if unchanged == 0:
                say("\n")
                say(("  tMAP={:.2f} tMRR={:.2f} tP1={:.2f} tP5={:.2f}" +
                     " tP[1]={:.3f} t%T={:.3f}\n").format(
                         test_MAP, test_MRR, test_P1, test_P5, test_PZ1,
                         test_PT))
                if args.dump_rationale:
                    self.evaluate_z(dev + test, dev_raw + test_raw,
                                    ids_corpus, eval_func2,
                                    args.dump_rationale)

                #if args.save_model:
                #    self.save_model(args.save_model)

            dropout_p = np.float64(args.dropout).astype(theano.config.floatX)
            self.dropout.set_value(dropout_p)

            say("\n")
            say("{}".format(result_table))
            say("\n")

            if train_p1 / N <= 1e-4 or train_p1 / N + 1e-4 >= 1.0:
                break
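# The divergence guard in the epoch loop above, distilled: snapshot the
# parameters, run the epoch, and if the average cost rose beyond the
# tolerance, halve the learning rate, restore the snapshot, and retry.
# A minimal sketch assuming `params` is a list of NumPy arrays and
# `run_epoch(params, lr)` returns the epoch's average cost (both are
# illustrative stand-ins, not the Theano objects used above):
def guarded_epoch(params, run_epoch, lr, last_cost, tolerance=0.5 + 1e-3):
    param_bak = [p.copy() for p in params]
    while True:
        cost = run_epoch(params, lr)
        if last_cost is None or cost <= last_cost * (1 + tolerance):
            return cost, lr  # accept the epoch
        lr *= 0.5  # cost blew up: back off
        if lr < 1e-5:
            return None, lr  # give up, mirroring the early `return` above
        for p, v in zip(params, param_bak):
            p[...] = v  # restore the snapshot and retry the epoch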
argparser.add_argument("--results_file", type=str, default="")  # to write in
argparser.add_argument("--layer", type=str, default="lstm")
args = argparser.parse_args()
print '\n', args, '\n'

with tf.Session() as sess:
    myqrapi = QRAPI(args.model, args.corpus, args.embeddings, sess,
                    args.layer)

    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myqrapi.model.embedding_layer
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, max_len=100)

    test = myio.read_annotations(args.test_file, K_neg=-1, prune_pos_cnt=-1)
    test = create_eval_batches(ids_corpus, test, myqrapi.model.padding_id,
                               pad_left=not myqrapi.model.args.average)

    testmap, testmrr, testpat1, testpat5, rank_labels, rank_ids, qids, rank_scores = \
        myqrapi.evaluate(test, sess)

    if args.full_results_file:
        with open(args.full_results_file, 'w') as f:
            for i, (_, _, labels, pid, qids) in enumerate(test):
                print_qids_similar = [
                    x for x, l in zip(qids, labels) if l == 1
                ]
def main():
    print(args)
    assert args.embedding, "Pre-trained word embeddings required."
    embedding_layer = myio.create_embedding_layer(args.embedding)

    max_len = args.max_len

    if args.train:
        train_x, train_y = myio.read_annotations(args.train)
        train_x = [embedding_layer.map_to_ids(x)[:max_len] for x in train_x]

    if args.dev:
        dev_x, dev_y = myio.read_annotations(args.dev)
        dev_x = [embedding_layer.map_to_ids(x)[:max_len] for x in dev_x]

    if args.load_rationale:
        rationale_data = myio.read_rationales(args.load_rationale)
        for x in rationale_data:
            x["xids"] = embedding_layer.map_to_ids(x["x"])

    if args.train:
        model = Model(args=args,
                      embedding_layer=embedding_layer,
                      nclasses=len(train_y[0]))
        model.ready()
        model.train(
            (train_x, train_y),
            (dev_x, dev_y) if args.dev else None,
            None,  #(test_x, test_y),
            rationale_data if args.load_rationale else None)

    if args.load_model and args.dev and not args.train:
        model = Model(args=None, embedding_layer=embedding_layer, nclasses=-1)
        model.load_model(args.load_model)
        say("model loaded successfully.\n")

        # compile an evaluation function
        eval_func = theano.function(
            inputs=[model.x, model.y],
            outputs=[model.z, model.generator.obj, model.generator.loss,
                     model.encoder.pred_diff],
            givens={model.z: model.generator.z_pred},
        )

        # compile a predictor function
        pred_func = theano.function(
            inputs=[model.x],
            outputs=[model.z, model.encoder.preds],
            givens={model.z: model.generator.z_pred},
        )

        # batching data
        padding_id = embedding_layer.vocab_map["<padding>"]
        dev_batches_x, dev_batches_y = myio.create_batches(
            dev_x, dev_y, args.batch, padding_id)

        # disable dropout
        model.dropout.set_value(0.0)
        dev_obj, dev_loss, dev_diff, dev_p1 = model.evaluate_data(
            dev_batches_x, dev_batches_y, eval_func, sampling=True)
        say("{} {} {} {}\n".format(dev_obj, dev_loss, dev_diff, dev_p1))
def main():
    print args
    assert args.embedding, "Pre-trained word embeddings required."
    embedding_layer = myio.create_embedding_layer(args.embedding)

    max_len = args.max_len

    if args.train:
        train_x, train_y = myio.read_annotations(args.train)
        train_x = [embedding_layer.map_to_ids(x)[:max_len] for x in train_x]

    if args.dev:
        dev_x, dev_y = myio.read_annotations(args.dev)
        dev_x = [embedding_layer.map_to_ids(x)[:max_len] for x in dev_x]

    if args.load_rationale:
        rationale_data = myio.read_rationales(args.load_rationale)
        for x in rationale_data:
            x["xids"] = embedding_layer.map_to_ids(x["x"])

    if args.train:
        model = Model(args=args,
                      embedding_layer=embedding_layer,
                      nclasses=len(train_y[0]))
        model.ready()

        #debug_func2 = theano.function(
        #        inputs = [ model.x, model.z ],
        #        outputs = model.generator.logpz
        #    )
        #theano.printing.debugprint(debug_func2)
        #return

        model.train(
            (train_x, train_y),
            (dev_x, dev_y) if args.dev else None,
            None,  #(test_x, test_y),
            rationale_data if args.load_rationale else None)

    if args.load_model and args.dev and not args.train:
        model = Model(args=None, embedding_layer=embedding_layer, nclasses=-1)
        model.load_model(args.load_model)
        say("model loaded successfully.\n")

        # compile an evaluation function
        eval_func = theano.function(
            inputs=[model.x, model.y],
            outputs=[model.z, model.encoder.obj, model.encoder.loss,
                     model.encoder.pred_diff],
            updates=model.generator.sample_updates)

        # compile a predictor function
        pred_func = theano.function(
            inputs=[model.x],
            outputs=[model.z, model.encoder.preds],
            updates=model.generator.sample_updates)

        # batching data
        padding_id = embedding_layer.vocab_map["<padding>"]
        dev_batches_x, dev_batches_y = myio.create_batches(
            dev_x, dev_y, args.batch, padding_id)

        # disable dropout
        model.dropout.set_value(0.0)
        dev_obj, dev_loss, dev_diff, dev_p1 = model.evaluate_data(
            dev_batches_x, dev_batches_y, eval_func, sampling=True)
        say("{} {} {} {}\n".format(dev_obj, dev_loss, dev_diff, dev_p1))
def train(self, ids_corpus, train, dev=None, test=None):
    dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)
    batch_size = args.batch_size
    padding_id = self.padding_id

    #train_batches = myio.create_batches(ids_corpus, train, batch_size, padding_id)

    if dev is not None:
        dev, dev_raw = dev
    if test is not None:
        test, test_raw = test

    if args.joint:
        updates_e, lr_e, gnorm_e = create_optimization_updates(
            cost=self.encoder.cost_e,  #self.encoder.cost,
            params=self.encoder.params,
            lr=args.learning_rate * 0.1,
            method=args.learning)[:3]
    else:
        updates_e = {}

    updates_g, lr_g, gnorm_g = create_optimization_updates(
        cost=self.encoder.cost_g,
        params=self.generator.params,
        lr=args.learning_rate,
        method=args.learning)[:3]

    train_func = theano.function(
        inputs=[self.x, self.triples, self.pairs],
        outputs=[self.encoder.obj, self.encoder.loss,
                 self.encoder.sparsity_cost, self.generator.p1, gnorm_g],
        updates=updates_g.items() + updates_e.items() +
        self.generator.sample_updates,
        #no_default_updates = True,
        on_unused_input="ignore")

    eval_func = theano.function(inputs=[self.x],
                                outputs=self.encoder.scores)

    eval_func2 = theano.function(
        inputs=[self.x],
        outputs=[self.encoder.scores_z, self.generator.p1, self.z],
        updates=self.generator.sample_updates,
        #no_default_updates = True
    )

    say("\tp_norm: {}\n".format(self.get_pnorm_stat(self.encoder.params)))
    say("\tp_norm: {}\n".format(self.get_pnorm_stat(self.generator.params)))

    result_table = PrettyTable(
        ["Epoch", "dev MAP", "dev MRR", "dev P@1", "dev P@5"] +
        ["tst MAP", "tst MRR", "tst P@1", "tst P@5"])

    last_train_avg_cost = None
    tolerance = 0.5 + 1e-3
    unchanged = 0
    best_dev = -1
    dev_MAP = dev_MRR = dev_P1 = dev_P5 = 0
    test_MAP = test_MRR = test_P1 = test_P5 = 0
    start_time = 0
    max_epoch = args.max_epoch
    for epoch in xrange(max_epoch):
        unchanged += 1
        if unchanged > 20:
            break

        start_time = time.time()

        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(ids_corpus, train, batch_size,
                                            padding_id,
                                            pad_left=not args.average,
                                            merge=args.merge)
        N = len(train_batches)

        more = True
        param_bak = [p.get_value(borrow=False) for p in self.params]

        while more:
            train_loss = 0.0
            train_cost = 0.0
            train_scost = 0.0
            train_p1 = 0.0

            for i in xrange(N):
                # get current batch
                idts, triples, pairs = train_batches[i]

                cur_cost, cur_loss, cur_scost, cur_p1, gnormg = train_func(
                    idts, triples, pairs)
                train_loss += cur_loss
                train_cost += cur_cost
                train_scost += cur_scost
                train_p1 += cur_p1
                if i % 10 == 0:
                    say("\r{}/{} {:.3f}".format(i, N, train_p1 / (i + 1)))

            cur_train_avg_cost = train_cost / N

            more = False
            if last_train_avg_cost is not None:
                if cur_train_avg_cost > last_train_avg_cost * (1 + tolerance):
                    more = True
                    say("\nTrain cost {} --> {}\n".format(
                        last_train_avg_cost, cur_train_avg_cost))

            if more:
                lr_val = lr_g.get_value() * 0.5
                if lr_val < 1e-5:
                    return
                lr_val = np.float64(lr_val).astype(theano.config.floatX)
                lr_g.set_value(lr_val)
                lr_e.set_value(lr_val)
                say("Decrease learning rate to {}\n".format(float(lr_val)))
                for p, v in zip(self.params, param_bak):
                    p.set_value(v)
                continue

            last_train_avg_cost = cur_train_avg_cost

            say("\r\n\n")
            say(("Epoch {} cost={:.3f} loss={:.3f} scost={:.3f}"
                 " P[1]={:.3f} |g|={:.3f}\t[{:.3f}m]\n").format(
                     epoch,
                     train_cost / N,
                     train_loss / N,
                     train_scost / N,
                     train_p1 / N,
                     float(gnormg),
                     (time.time() - start_time) / 60.0))
            say("\tp_norm: {}\n".format(
                self.get_pnorm_stat(self.encoder.params)))
            say("\tp_norm: {}\n".format(
                self.get_pnorm_stat(self.generator.params)))

            self.dropout.set_value(0.0)

            if dev is not None:
                full_MAP, full_MRR, full_P1, full_P5 = self.evaluate(
                    dev, eval_func)
                dev_MAP, dev_MRR, dev_P1, dev_P5, dev_PZ1, dev_PT = \
                    self.evaluate_z(dev, dev_raw, ids_corpus, eval_func2)
            if test is not None:
                test_MAP, test_MRR, test_P1, test_P5, test_PZ1, test_PT = \
                    self.evaluate_z(test, test_raw, ids_corpus, eval_func2)

            if dev_MAP > best_dev:
                best_dev = dev_MAP
                unchanged = 0

            say("\n")
            say("  fMAP={:.2f} fMRR={:.2f} fP1={:.2f} fP5={:.2f}\n".format(
                full_MAP, full_MRR, full_P1, full_P5))
            say("\n")
            say(("  dMAP={:.2f} dMRR={:.2f} dP1={:.2f} dP5={:.2f}" +
                 " dP[1]={:.3f} d%T={:.3f} best_dev={:.2f}\n").format(
                     dev_MAP, dev_MRR, dev_P1, dev_P5, dev_PZ1, dev_PT,
                     best_dev))

            result_table.add_row([epoch] + [
                "%.2f" % x
                for x in [dev_MAP, dev_MRR, dev_P1, dev_P5] +
                [test_MAP, test_MRR, test_P1, test_P5]
            ])

            if unchanged == 0:
                say("\n")
                say(("  tMAP={:.2f} tMRR={:.2f} tP1={:.2f} tP5={:.2f}" +
                     " tP[1]={:.3f} t%T={:.3f}\n").format(
                         test_MAP, test_MRR, test_P1, test_P5, test_PZ1,
                         test_PT))
                if args.dump_rationale:
                    self.evaluate_z(dev + test, dev_raw + test_raw,
                                    ids_corpus, eval_func2,
                                    args.dump_rationale)

                #if args.save_model:
                #    self.save_model(args.save_model)

            dropout_p = np.float64(args.dropout).astype(theano.config.floatX)
            self.dropout.set_value(dropout_p)

            say("\n")
            say("{}".format(result_table))
            say("\n")

            if train_p1 / N <= 1e-4 or train_p1 / N + 1e-4 >= 1.0:
                break