def create_batches(df, ids_corpus, data_type, batch_size, padding_id):
    """Build fixed-size evaluation batches for one data split.

    Rows of `df` whose 'type' column equals `data_type` are taken in their
    original order (no shuffling, so evaluation output order is stable).
    Each batch is a tuple (ids, titles, bodies, tag_labels) where the last
    three come padded from myio.create_one_batch.
    """
    split = df[df['type'] == data_type]
    # NO SHUFFLING FOR EVALUATION TO PRINT IN THE SAME ORDER
    split_ids = split['id'].values
    total = len(split_ids)

    batches = []
    titles, bodies, tag_labels, ids = [], [], [], []
    for pos in xrange(total):
        q_id = split_ids[pos]
        # tag is a boolean vector over the tag vocabulary
        title, body, tag = ids_corpus[str(q_id)]
        titles.append(title)
        bodies.append(body)
        tag_labels.append(tag)
        ids.append(q_id)
        # flush when the batch is full, or on the last example
        if len(titles) == batch_size or pos == total - 1:
            titles, bodies, tag_labels = myio.create_one_batch(
                titles, bodies, tag_labels, padding_id)
            batches.append((ids, titles, bodies, tag_labels))
            titles, bodies, tag_labels, ids = [], [], [], []
    return batches
def rank(self, query):
    """Score and rank candidate questions against a query.

    `query` is a dict (or a JSON string parsed into one) with keys
    "query" and "candidates"; optional "BM25" and "ratio" blend the
    model score with BM25 scores. Returns {"ranks": ..., "scores": ...}
    where ranks lists candidate indices from best to worst.
    """
    model = self.model
    emb = model.embedding_layer
    args = model.args
    padding_id = model.padding_id
    score_func = self.score_func

    if isinstance(query, (str, unicode)):
        query = json.loads(query)

    # first entry is the query itself, followed by all candidates
    tokens = query["query"].strip().split()
    questions = [emb.map_to_ids(tokens, filter_oov=True)]
    for cand in query["candidates"]:
        questions.append(emb.map_to_ids(cand.strip().split(), filter_oov=True))

    batch, _ = myio.create_one_batch(questions, questions, padding_id,
                                     not args.average)
    scores = list(score_func(batch, batch))
    assert len(scores) == len(batch) - 1

    if ("BM25" in query) and ("ratio" in query):
        bm25 = query["BM25"]
        ratio = query["ratio"]
        assert len(bm25) == len(scores)
        assert ratio >= 0 and ratio <= 1.0
        # convex blend of model score and BM25
        scores = [s * (1 - ratio) + b * ratio for s, b in zip(scores, bm25)]

    ranks = sorted(range(len(scores)), key=lambda i: -scores[i])
    return {"ranks": ranks, "scores": scores}
def rank(self, query):
    """Rank the "candidates" in `query` by model score (optionally
    interpolated with BM25).

    Accepts either a dict or a JSON-encoded string. Returns a dict with
    "ranks" (candidate indices, best first) and "scores".
    """
    model = self.model
    emb = model.embedding_layer
    args = model.args
    padding_id = model.padding_id
    score_func = self.score_func

    if isinstance(query, (str, unicode)):
        query = json.loads(query)

    # encode the query first, then every candidate
    encoded = [emb.map_to_ids(query["query"].strip().split(),
                              filter_oov=True)]
    encoded += [emb.map_to_ids(c.strip().split(), filter_oov=True)
                for c in query["candidates"]]

    batch, _ = myio.create_one_batch(encoded, encoded, padding_id,
                                     not args.average)
    scores = [s for s in score_func(batch, batch)]
    assert len(scores) == len(batch) - 1

    if ("BM25" in query) and ("ratio" in query):
        bm25_scores = query["BM25"]
        mix = query["ratio"]
        assert len(bm25_scores) == len(scores)
        assert mix >= 0 and mix <= 1.0
        # interpolate: mix=0 -> pure model, mix=1 -> pure BM25
        scores = [m * (1 - mix) + b * mix
                  for m, b in zip(scores, bm25_scores)]

    order = sorted(range(len(scores)), key=lambda i: -scores[i])
    return {"ranks": order, "scores": scores}
def create_eval_batches(ids_corpus, data, padding_id, N_neg=20):
    """Build evaluation batches with hinge-loss tuples over tag labels.

    For each (pid, qids, qlabels) annotation, batches the title/body/tag
    triples of the pivot and candidate questions, and builds hinge tuples
    pairing every positive tag index with up to `N_neg` randomly sampled
    negative tag indices.
    """
    def _flat_index(q_counter, tag_idx, tag_len):
        # Map (1-based question counter, tag index) to a single flat index
        # over the concatenated tag vectors.
        return (q_counter - 1) * tag_len + tag_idx

    batches = []
    for pid, qids, qlabels in data:
        titles, bodies, tag_labels = [], [], []
        tuples = []
        for q_counter, qid in enumerate([pid] + qids, start=1):
            # tag is a boolean vector over the tag vocabulary
            title, body, tag = ids_corpus[str(qid)]
            titles.append(title)
            bodies.append(body)
            tag_labels.append(tag)
            positives = [_flat_index(q_counter, j, tag.shape[0])
                         for j, label in enumerate(tag) if label == 1]
            negatives = [_flat_index(q_counter, j, tag.shape[0])
                         for j, label in enumerate(tag) if label == 0]
            np.random.shuffle(negatives)
            negatives = negatives[:N_neg]  # consider only N_neg negatives
            tuples += [[p] + negatives for p in positives]
        tuples = myio.create_hinge_batch(tuples)
        titles, bodies, tag_labels = myio.create_one_batch(
            titles, bodies, tag_labels, padding_id)
        batches.append((titles, bodies, np.array(qlabels, dtype="int32"),
                        tag_labels, tuples, pid, qids))
    return batches
def main(args):
    """Entry point: load the corpus, build embeddings and eval batches,
    then (optionally) construct and train the model.

    Reads paths/hyperparameters from `args`; side effects include file
    reads and model training/saving via Model.train.
    """
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
                raw_corpus,
                n_d = args.hidden_dim,
                cut_off = args.cut_off,
                embs = load_embedding_iterator(args.embeddings)
                        if args.embeddings else None
            )
    # map raw tokens to integer id sequences
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(
            embedding_layer.n_V,
            len(raw_corpus)
        ))
    padding_id = embedding_layer.vocab_map["<padding>"]
    bos_id = embedding_layer.vocab_map["<s>"]
    eos_id = embedding_layer.vocab_map["</s>"]

    if args.reweight:
        # IDF-based token reweighting; only bound when --reweight is set
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev = myio.read_annotations(args.dev, K_neg=20, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=20, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id)

    if args.heldout:
        with open(args.heldout) as fin:
            heldout_ids = fin.read().split()
        # split the corpus: heldout ids vs. everything else for training
        heldout_corpus = dict((id, ids_corpus[id]) for id in heldout_ids
                                    if id in ids_corpus)
        train_corpus = dict((id, ids_corpus[id]) for id in ids_corpus
                                    if id not in heldout_corpus)
        heldout = myio.create_batches(heldout_corpus, [ ], args.batch_size,
                    padding_id, bos_id, eos_id, auto_encode=True)
        # NOTE(review): pairs bodies (b1) with t2, dropping titles (t1) —
        # presumably intentional (body-only heldout perplexity); confirm.
        heldout = [ myio.create_one_batch(b1, t2, padding_id)
                                    for t1, b1, t2 in heldout ]
        say("heldout examples={}\n".format(len(heldout_corpus)))

    if args.train:
        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)

        start_time = time.time()
        train = myio.read_annotations(args.train)
        if not args.use_anno: train = [ ]
        train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                    model.padding_id, model.bos_id, model.eos_id, auto_encode=True)
        say("{} to create batches\n".format(time.time()-start_time))

        model.ready()
        model.train(
                ids_corpus if not args.heldout else train_corpus,
                train,
                dev if args.dev else None,
                test if args.test else None,
                heldout if args.heldout else None
            )
def create_eval_batches(ids_corpus, data, padding_id, pad_left):
    """Build one padded batch per annotation for evaluation.

    Each (pid, qids, qlabels) entry yields a tuple
    (titles, bodies, labels, pid, qids) where titles/bodies come padded
    from myio.create_one_batch and labels is an int32 numpy array.
    """
    batches = []
    for pid, qids, qlabels in data:
        titles, bodies = [], []
        # pivot question first, then all candidates
        for qid in [pid] + qids:
            title, body = ids_corpus[qid]
            titles.append(title)
            bodies.append(body)
        titles, bodies = myio.create_one_batch(titles, bodies,
                                               padding_id, pad_left)
        batches.append((titles, bodies,
                        np.array(qlabels, dtype="int32"), pid, qids))
    return batches
def evaluate(self, data, eval_func):
    """Score each batch, rank labels by descending score, and return
    (MAP, MRR, P@1, P@5) as percentages via Evaluation."""
    ranked_label_lists = []
    for titles, bodies, labels in data:
        idts, idbs = myio.create_one_batch(titles, bodies, self.padding_id)
        # only the title batch is scored; idbs is built but unused here
        scores = eval_func(idts)
        #assert len(scores) == len(labels)
        order = (-scores).argsort()
        ranked_label_lists.append(labels[order])
    ev = Evaluation(ranked_label_lists)
    return (ev.MAP() * 100, ev.MRR() * 100,
            ev.Precision(1) * 100, ev.Precision(5) * 100)
def evaluate(self, data, eval_func):
    """Compute ranking metrics over `data`.

    For every (titles, bodies, labels) batch the titles are scored with
    `eval_func` and the labels re-ordered by descending score; the
    resulting label rankings feed Evaluation. Returns MAP, MRR, P@1 and
    P@5, each scaled to a percentage.
    """
    rankings = []
    for t, b, labels in data:
        idts, idbs = myio.create_one_batch(t, b, self.padding_id)
        scores = eval_func(idts)  # bodies (idbs) are batched but not scored
        #assert len(scores) == len(labels)
        rankings.append(labels[(-scores).argsort()])
    metrics = Evaluation(rankings)
    MAP = metrics.MAP() * 100
    MRR = metrics.MRR() * 100
    P1 = metrics.Precision(1) * 100
    P5 = metrics.Precision(5) * 100
    return MAP, MRR, P1, P5
def main(args):
    """Entry point: read the corpus, set up embeddings and evaluation
    batches, then optionally build and train the model.

    All inputs come from `args`; side effects include file I/O and model
    training/saving through Model.train.
    """
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=args.hidden_dim,
        cut_off=args.cut_off,
        embs=load_embedding_iterator(args.embeddings)
        if args.embeddings else None)
    # map raw tokens to integer id sequences
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                 len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]
    bos_id = embedding_layer.vocab_map["<s>"]
    eos_id = embedding_layer.vocab_map["</s>"]

    if args.reweight:
        # IDF-based token reweighting; only bound when --reweight is set
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev = myio.read_annotations(args.dev, K_neg=20, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=20, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id)

    if args.heldout:
        with open(args.heldout) as fin:
            heldout_ids = fin.read().split()
        # split corpus into heldout ids vs. the rest for training
        heldout_corpus = dict(
            (id, ids_corpus[id]) for id in heldout_ids if id in ids_corpus)
        train_corpus = dict((id, ids_corpus[id]) for id in ids_corpus
                            if id not in heldout_corpus)
        heldout = myio.create_batches(heldout_corpus, [], args.batch_size,
                                      padding_id, bos_id, eos_id,
                                      auto_encode=True)
        # NOTE(review): pairs bodies (b1) with t2, dropping titles (t1) —
        # presumably intentional (body-only heldout perplexity); confirm.
        heldout = [
            myio.create_one_batch(b1, t2, padding_id) for t1, b1, t2 in heldout
        ]
        say("heldout examples={}\n".format(len(heldout_corpus)))

    if args.train:
        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)

        start_time = time.time()
        train = myio.read_annotations(args.train)
        if not args.use_anno: train = []
        train_batches = myio.create_batches(ids_corpus, train,
                                            args.batch_size,
                                            model.padding_id,
                                            model.bos_id,
                                            model.eos_id,
                                            auto_encode=True)
        say("{} to create batches\n".format(time.time() - start_time))

        model.ready()
        model.train(ids_corpus if not args.heldout else train_corpus,
                    train,
                    dev if args.dev else None,
                    test if args.test else None,
                    heldout if args.heldout else None)
def train(self, ids_corpus, train, dev=None, test=None, heldout=None):
    """Main training loop.

    Compiles theano train/eval/NLL functions, then iterates epochs:
    rebuilds batches each epoch, trains on title and/or body pairs, and
    at the end of every epoch evaluates on dev/test (with dropout turned
    off), tracks the best dev MRR (saving the model when it improves,
    early-stopping after 8 epochs without improvement), and prints a
    progress report.

    Parameters: `ids_corpus` maps question ids to encoded text;
    `train` holds training annotations (may be empty for pure
    auto-encoding); `dev`/`test` are pre-built evaluation batches for
    self.evaluate; `heldout` feeds self.evaluate_perplexity.
    """
    args = self.args
    # NOTE(review): dropout_prob appears unused — dropout_p below
    # recomputes the same value when restoring dropout after evaluation.
    dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)
    batch_size = args.batch_size
    padding_id = self.padding_id
    bos_id = self.bos_id
    eos_id = self.eos_id

    #train_batches = myio.create_batches(ids_corpus, train, batch_size, padding_id, args.loss)

    updates, lr, gnorm = create_optimization_updates(
        cost=self.cost,
        params=self.params,
        lr=args.learning_rate,
        method=args.learning)[:3]

    # (input ids, target ids) -> cost, loss, gradient norm; applies updates
    train_func = theano.function(inputs=[self.idxs, self.idys],
                                 outputs=[self.cost, self.loss, gnorm],
                                 updates=updates)

    eval_func = theano.function(
        inputs=[self.idxs],
        #outputs = self.scores2
        outputs=self.scores)

    # negative log-likelihood + mask, used for heldout perplexity
    nll_func = theano.function(inputs=[self.idxs, self.idys],
                               outputs=[self.nll, self.mask])

    say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

    result_table = PrettyTable(
        ["Epoch", "dev MAP", "dev MRR", "dev P@1", "dev P@5"] +
        ["tst MAP", "tst MRR", "tst P@1", "tst P@5"])

    unchanged = 0
    best_dev = -1
    dev_MAP = dev_MRR = dev_P1 = dev_P5 = 0
    test_MAP = test_MRR = test_P1 = test_P5 = 0
    heldout_PPL = -1

    start_time = 0
    max_epoch = args.max_epoch
    for epoch in xrange(max_epoch):
        unchanged += 1
        # early stopping: no dev-MRR improvement for 8 epochs
        if unchanged > 8: break

        start_time = time.time()

        # re-batch every epoch
        train_batches = myio.create_batches(ids_corpus, train, batch_size,
                                            padding_id, bos_id, eos_id,
                                            auto_encode=True)
        N = len(train_batches)

        train_cost = 0.0
        train_loss = 0.0
        train_loss2 = 0.0
        for i in xrange(N):
            # get current batch
            t1, b1, t2 = train_batches[i]

            if args.use_title:
                idxs, idys = myio.create_one_batch(t1, t2, padding_id)
                cur_cost, cur_loss, grad_norm = train_func(idxs, idys)
                train_cost += cur_cost
                train_loss += cur_loss
                # loss normalized by target length
                train_loss2 += cur_loss / idys.shape[0]

            if args.use_body:
                idxs, idys = myio.create_one_batch(b1, t2, padding_id)
                cur_cost, cur_loss, grad_norm = train_func(idxs, idys)
                train_cost += cur_cost
                train_loss += cur_loss
                train_loss2 += cur_loss / idys.shape[0]

            if i % 10 == 0:
                say("\r{}/{}".format(i, N))

            # end-of-epoch: evaluate and report
            if i == N - 1:
                # disable dropout for evaluation
                self.dropout.set_value(0.0)

                if dev is not None:
                    dev_MAP, dev_MRR, dev_P1, dev_P5 = self.evaluate(
                        dev, eval_func)
                if test is not None:
                    test_MAP, test_MRR, test_P1, test_P5 = self.evaluate(
                        test, eval_func)
                if heldout is not None:
                    heldout_PPL = self.evaluate_perplexity(
                        heldout, nll_func)

                if dev_MRR > best_dev:
                    # new best: reset early-stop counter, record and save
                    unchanged = 0
                    best_dev = dev_MRR
                    result_table.add_row([epoch] + [
                        "%.2f" % x
                        for x in [dev_MAP, dev_MRR, dev_P1, dev_P5] +
                        [test_MAP, test_MRR, test_P1, test_P5]
                    ])
                    if args.model:
                        self.save_model(args.model + ".pkl.gz")

                # restore dropout for the next epoch
                dropout_p = np.float64(args.dropout).astype(
                    theano.config.floatX)
                self.dropout.set_value(dropout_p)

                say("\r\n\n")
                say( ( "Epoch {}\tcost={:.3f}\tloss={:.3f} {:.3f}\t" \
                    +"\tMRR={:.2f},{:.2f}\tPPL={:.1f}\t|g|={:.3f}\t[{:.3f}m]\n" ).format(
                        epoch,
                        train_cost / (i+1),
                        train_loss / (i+1),
                        train_loss2 / (i+1),
                        dev_MRR,
                        best_dev,
                        heldout_PPL,
                        float(grad_norm),
                        (time.time()-start_time)/60.0
                ))
                say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

                say("\n")
                say("{}".format(result_table))
                say("\n")
def train(self, ids_corpus, train, dev=None, test=None, heldout=None):
    """Main training loop.

    Compiles theano train/eval/NLL functions and iterates up to
    args.max_epoch epochs: rebuilds batches each epoch, trains on title
    and/or body pairs, and at the end of each epoch evaluates dev/test
    with dropout disabled, tracks the best dev MRR (saving the model on
    improvement, early-stopping after 8 stagnant epochs), and prints a
    progress report.

    Parameters: `ids_corpus` maps question ids to encoded text; `train`
    holds training annotations (may be empty for pure auto-encoding);
    `dev`/`test` are pre-built evaluation batches for self.evaluate;
    `heldout` feeds self.evaluate_perplexity.
    """
    args = self.args
    # NOTE(review): dropout_prob appears unused — dropout_p below
    # recomputes the same value when restoring dropout after evaluation.
    dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)
    batch_size = args.batch_size
    padding_id = self.padding_id
    bos_id = self.bos_id
    eos_id = self.eos_id

    #train_batches = myio.create_batches(ids_corpus, train, batch_size, padding_id, args.loss)

    updates, lr, gnorm = create_optimization_updates(
            cost = self.cost,
            params = self.params,
            lr = args.learning_rate,
            method = args.learning
        )[:3]

    # (input ids, target ids) -> cost, loss, gradient norm; applies updates
    train_func = theano.function(
            inputs = [ self.idxs, self.idys ],
            outputs = [ self.cost, self.loss, gnorm ],
            updates = updates
        )

    eval_func = theano.function(
            inputs = [ self.idxs ],
            #outputs = self.scores2
            outputs = self.scores
        )

    # negative log-likelihood + mask, used for heldout perplexity
    nll_func = theano.function(
            inputs = [ self.idxs, self.idys ],
            outputs = [ self.nll, self.mask ]
        )

    say("\tp_norm: {}\n".format(
            self.get_pnorm_stat()
        ))

    result_table = PrettyTable(["Epoch", "dev MAP", "dev MRR", "dev P@1", "dev P@5"] +
                                ["tst MAP", "tst MRR", "tst P@1", "tst P@5"])

    unchanged = 0
    best_dev = -1
    dev_MAP = dev_MRR = dev_P1 = dev_P5 = 0
    test_MAP = test_MRR = test_P1 = test_P5 = 0
    heldout_PPL = -1

    start_time = 0
    max_epoch = args.max_epoch
    for epoch in xrange(max_epoch):
        unchanged += 1
        # early stopping: no dev-MRR improvement for 8 epochs
        if unchanged > 8: break

        start_time = time.time()

        # re-batch every epoch
        train_batches = myio.create_batches(ids_corpus, train, batch_size,
                                            padding_id, bos_id, eos_id,
                                            auto_encode=True)
        N =len(train_batches)

        train_cost = 0.0
        train_loss = 0.0
        train_loss2 = 0.0
        for i in xrange(N):
            # get current batch
            t1, b1, t2 = train_batches[i]

            if args.use_title:
                idxs, idys = myio.create_one_batch(t1, t2, padding_id)
                cur_cost, cur_loss, grad_norm = train_func(idxs, idys)
                train_cost += cur_cost
                train_loss += cur_loss
                # loss normalized by target length
                train_loss2 += cur_loss / idys.shape[0]

            if args.use_body:
                idxs, idys = myio.create_one_batch(b1, t2, padding_id)
                cur_cost, cur_loss, grad_norm = train_func(idxs, idys)
                train_cost += cur_cost
                train_loss += cur_loss
                train_loss2 += cur_loss / idys.shape[0]

            if i % 10 == 0:
                say("\r{}/{}".format(i,N))

            # end-of-epoch: evaluate and report
            if i == N-1:
                # disable dropout for evaluation
                self.dropout.set_value(0.0)

                if dev is not None:
                    dev_MAP, dev_MRR, dev_P1, dev_P5 = self.evaluate(dev, eval_func)
                if test is not None:
                    test_MAP, test_MRR, test_P1, test_P5 = self.evaluate(test, eval_func)
                if heldout is not None:
                    heldout_PPL = self.evaluate_perplexity(heldout, nll_func)

                if dev_MRR > best_dev:
                    # new best: reset early-stop counter, record and save
                    unchanged = 0
                    best_dev = dev_MRR
                    result_table.add_row(
                        [ epoch ] +
                        [ "%.2f" % x for x in [ dev_MAP, dev_MRR, dev_P1, dev_P5 ] +
                                    [ test_MAP, test_MRR, test_P1, test_P5 ] ]
                    )
                    if args.model:
                        self.save_model(args.model+".pkl.gz")

                # restore dropout for the next epoch
                dropout_p = np.float64(args.dropout).astype(
                            theano.config.floatX)
                self.dropout.set_value(dropout_p)

                say("\r\n\n")
                say( ( "Epoch {}\tcost={:.3f}\tloss={:.3f} {:.3f}\t" \
                    +"\tMRR={:.2f},{:.2f}\tPPL={:.1f}\t|g|={:.3f}\t[{:.3f}m]\n" ).format(
                        epoch,
                        train_cost / (i+1),
                        train_loss / (i+1),
                        train_loss2 / (i+1),
                        dev_MRR,
                        best_dev,
                        heldout_PPL,
                        float(grad_norm),
                        (time.time()-start_time)/60.0
                ))
                say("\tp_norm: {}\n".format(
                        self.get_pnorm_stat()
                    ))

                say("\n")
                say("{}".format(result_table))
                say("\n")