def __init__(self, model_path, corpus_path, emb_path):
    raw_corpus = myio.read_corpus(corpus_path)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=10,
        cut_off=1,
        embs=load_embedding_iterator(emb_path)
    )
    weights = myio.create_idf_weights(corpus_path, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(
        embedding_layer.n_V, len(raw_corpus)
    ))

    model = Model(args=None, embedding_layer=embedding_layer, weights=weights)
    model_data = model.load_model(model_path)
    model.set_model(model_data)
    model.dropout.set_value(0.0)  # disable dropout at inference time
    say("model initialized\n")

    # compile a Theano function mapping (title ids, body ids) to ranking scores
    score_func = theano.function(
        inputs=[model.idts, model.idbs],
        outputs=model.scores,
        on_unused_input='ignore'
    )
    self.model = model
    self.score_func = score_func
    say("scoring function compiled\n")
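# Usage sketch for the compiled scorer above. The wrapper class name and the
# paths are hypothetical; idts/idbs are assumed to be int32 matrices of word
# ids with one column per question, padded with the <padding> id, matching
# the shapes model.idts / model.idbs expect.
#
#   api = QRAPI("model.pkl.gz", "corpus.txt.gz", "vectors.txt.gz")
#   scores = api.score_func(idts, idbs)   # one relevance score per candidate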
def main():
    print 'Starting at: {}\n'.format(datetime.now())
    s_time = time.time()

    df = read_df(args.df_path)
    df = df.fillna(u'')

    label_tags = pickle.load(open(args.tags_file, 'rb'))
    print '\nloaded {} tags'.format(len(label_tags))

    raw_corpus = myio.read_corpus(args.corpus_w_tags, with_tags=True)

    embedding_layer = create_embedding_layer(
        n_d=200,
        embs=load_embedding_iterator(args.embeddings),
        # only_words=True takes only the vocabulary from the embeddings file
        # and uses random initial embeddings instead of the pre-trained vectors
        only_words=False if args.use_embeddings else True,
        trainable=args.trainable
    )

    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, label_tags,
                                 max_len=args.max_seq_len)

    print("vocab size={}, corpus size={}\n".format(embedding_layer.n_V, len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus_w_tags, embedding_layer, with_tags=True)

    if args.layer.lower() == "lstm":
        from models import LstmMultiTagsClassifier as Model
    elif args.layer.lower() in ["bilstm", "bigru"]:
        from models import BiRNNMultiTagsClassifier as Model
    elif args.layer.lower() == "cnn":
        from models import CnnMultiTagsClassifier as Model
    elif args.layer.lower() == "gru":
        from models import GruMultiTagsClassifier as Model
    else:
        raise Exception("no correct layer given")

    if args.cross_val:
        train, dev, test = myio.create_cross_val_batches(df, ids_corpus, args.batch_size, padding_id)
    else:
        dev = list(myio.create_batches(
            df, ids_corpus, 'dev', args.batch_size, padding_id,
            N_neg=args.n_neg, samples_file=args.samples_file))
        test = list(myio.create_batches(
            df, ids_corpus, 'test', args.batch_size, padding_id,
            N_neg=args.n_neg, samples_file=args.samples_file))
    # baselines_eval(train, dev, test)

    model = Model(args, embedding_layer, len(label_tags),
                  weights=weights if args.reweight else None)
    model.ready()

    print 'total (non) trainable params: ', model.num_parameters()

    if args.load_pre_trained_part:
        # replace the old embedding assign ops with ones that load the
        # pre-trained part of the network
        model.init_assign_ops = model.load_pre_trained_part(args.load_pre_trained_part)
        print '\nmodel init_assign_ops: {}\n'.format(model.init_assign_ops)

    model.train_model(df, ids_corpus, dev=dev, test=test)
    print '\nEnded at: {}'.format(datetime.now())
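# The if/elif dispatch above could equivalently be written as a table lookup;
# a minimal sketch (the class names are the ones used above; the helper name
# pick_model is hypothetical):
def pick_model(layer_name):
    import models
    names = {"lstm": "LstmMultiTagsClassifier",
             "bilstm": "BiRNNMultiTagsClassifier",
             "bigru": "BiRNNMultiTagsClassifier",
             "cnn": "CnnMultiTagsClassifier",
             "gru": "GruMultiTagsClassifier"}
    try:
        return getattr(models, names[layer_name.lower()])
    except KeyError:
        raise Exception("no correct layer given")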
def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=args.hidden_dim,
        embs=load_embedding_iterator(args.embeddings) if args.embeddings else None
    )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, max_len=args.max_seq_len)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V, len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        # K_neg=-1 / prune_pos_cnt=-1: keep all negative and positive candidates
        dev_raw = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev_raw, padding_id,
                                       pad_left=not args.average, merge=args.merge)
    if args.test:
        test_raw = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test_raw, padding_id,
                                        pad_left=not args.average, merge=args.merge)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                                            padding_id, pad_left=not args.average,
                                            merge=args.merge)
        say("{} to create batches\n".format(time.time() - start_time))
        say("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(x[0].ravel()) for x in train_batches),
            sum(len(x[1].ravel()) for x in train_batches)))
        train_batches = None  # the batches above were only used for the statistics

        model = Model(args, embedding_layer, weights=weights if args.reweight else None)
        model.ready()

        # set parameters using a pre-trained network
        if args.load_pretrain:
            model.encoder.load_pretrained_parameters(args)

        model.train(ids_corpus, train,
                    (dev, dev_raw) if args.dev else None,
                    (test, test_raw) if args.test else None)
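# For intuition, pad_left controls whether id sequences are padded at the
# front or the back. A minimal, self-contained sketch of the padding scheme
# assumed by create_batches (illustrative only, not the actual myio code):
import numpy as np

def pad_to_matrix(seqs, padding_id, pad_left):
    """Left- or right-pad lists of word ids into one (max_len, batch) matrix."""
    max_len = max(len(s) for s in seqs)
    if pad_left:
        padded = [[padding_id] * (max_len - len(s)) + list(s) for s in seqs]
    else:
        padded = [list(s) + [padding_id] * (max_len - len(s)) for s in seqs]
    return np.array(padded, dtype="int32").T  # one column per sequence

# pad_to_matrix([[4, 5], [6]], padding_id=0, pad_left=True)
# -> column 0 is [4, 5], column 1 is [0, 6]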
def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=args.hidden_dim,
        cut_off=args.cut_off,
        embs=load_embedding_iterator(args.embeddings) if args.embeddings else None
    )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(
        embedding_layer.n_V, len(raw_corpus)
    ))
    padding_id = embedding_layer.vocab_map["<padding>"]
    bos_id = embedding_layer.vocab_map["<s>"]
    eos_id = embedding_layer.vocab_map["</s>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev = myio.read_annotations(args.dev, K_neg=20, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=20, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id)

    if args.heldout:
        # split the corpus into a held-out part and a training part
        with open(args.heldout) as fin:
            heldout_ids = fin.read().split()
        heldout_corpus = dict((id, ids_corpus[id]) for id in heldout_ids
                              if id in ids_corpus)
        train_corpus = dict((id, ids_corpus[id]) for id in ids_corpus
                            if id not in heldout_corpus)
        heldout = myio.create_batches(heldout_corpus, [], args.batch_size,
                                      padding_id, bos_id, eos_id, auto_encode=True)
        heldout = [myio.create_one_batch(b1, t2, padding_id)
                   for t1, b1, t2 in heldout]
        say("heldout examples={}\n".format(len(heldout_corpus)))

    if args.train:
        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)

        start_time = time.time()
        train = myio.read_annotations(args.train)
        if not args.use_anno:
            train = []
        train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                                            model.padding_id, model.bos_id,
                                            model.eos_id, auto_encode=True)
        say("{} to create batches\n".format(time.time() - start_time))

        model.ready()
        model.train(
            ids_corpus if not args.heldout else train_corpus,
            train,
            dev if args.dev else None,
            test if args.test else None,
            heldout if args.heldout else None
        )
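# A minimal sketch of the sentence-boundary convention assumed by the
# auto-encoding batches above: sequences are wrapped with the <s>/</s> ids
# before padding (illustrative helper, not part of myio):
def wrap_sequence(word_ids, bos_id, eos_id):
    """Prepend the <s> id and append the </s> id to a list of word ids."""
    return [bos_id] + list(word_ids) + [eos_id]

# wrap_sequence([12, 7, 7], bos_id=1, eos_id=2) -> [1, 12, 7, 7, 2]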
def api(label_tags, test_y, y_scores, all_ids):
    # keep only the samples that have at least one positive tag
    eval_samples = []
    for sample in range(test_y.shape[0]):
        if (test_y[sample, :] == np.ones(test_y.shape[1])).any():
            eval_samples.append(sample)
    test_y, y_scores = test_y[eval_samples, :], y_scores[eval_samples, :]

    ev = Evaluation(y_scores, None, test_y)

    # for each query, keep the ten highest-scoring tags in ranked order
    all_rankedat10_tags = []
    query_ids = []
    for sample_id, sample_output in zip(eval_samples, y_scores):
        q_id = all_ids[sample_id]
        query_ids.append(q_id)
        cols = np.argsort(sample_output)[-10:]
        rankedat10_tags = []
        for col in cols[::-1]:
            label_name = label_tags[col]
            rankedat10_tags.append(label_name)
        all_rankedat10_tags.append(rankedat10_tags)

    all_Pat5, all_Pat10, all_Rat5, all_Rat10 = \
        ev.Precision(5, True), ev.Precision(10, True), ev.Recall(5, True), ev.Recall(10, True)
    upper_bounds_pat5 = ev.upper_bound(5, True)
    upper_bounds_pat10 = ev.upper_bound(10, True)
    all_MAP = ev.MeanAveragePrecision(True)
    assert len(all_Pat5) == len(all_rankedat10_tags)

    R = (query_ids, all_rankedat10_tags,
         list(all_Pat5), list(all_Pat10), list(all_Rat5), list(all_Rat10),
         upper_bounds_pat5, upper_bounds_pat10, all_MAP)

    raw_corpus = myio.read_corpus(args.corpus_w_tags, with_tags=True)

    # one tab-separated line per query: id, gold tags, top-10 predicted tags, metrics
    with open(args.results_file, 'w') as f:
        for i in range(len(R[0])):
            query_id, rankedat10_tags, Pat5, Pat10, Rat5, Rat10, UB5, UB10, MAP = \
                R[0][i], R[1][i], R[2][i], R[3][i], R[4][i], R[5][i], R[6][i], R[7][i], R[8][i]
            real_tags = raw_corpus[str(query_id)][2]
            real_tags = list(set(real_tags) & set(label_tags))
            real_tags = " ".join([str(x) for x in real_tags])
            rankedat10_tags = " ".join([str(x) for x in rankedat10_tags])
            f.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                query_id, real_tags, rankedat10_tags,
                Pat5, Pat10, Rat5, Rat10, UB5, UB10, MAP))
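# For reference, a self-contained sketch of per-query precision@k / recall@k
# over a binary label row, assumed to match what the Evaluation class computes
# (illustrative only):
import numpy as np

def precision_at_k(y_true_row, scores_row, k):
    """Fraction of the top-k scored labels that are true positives."""
    topk = np.argsort(scores_row)[-k:]
    return float(y_true_row[topk].sum()) / k

def recall_at_k(y_true_row, scores_row, k):
    """Fraction of the true positives that appear in the top-k scored labels."""
    topk = np.argsort(scores_row)[-k:]
    return float(y_true_row[topk].sum()) / max(1, y_true_row.sum())

# precision_at_k(np.array([1, 0, 1, 0]), np.array([0.9, 0.1, 0.8, 0.2]), 2) -> 1.0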
def __init__(self, model_path, corpus_path, emb_path, session, layer='lstm'):
    raw_corpus = myio.read_corpus(corpus_path)
    embedding_layer = create_embedding_layer(
        n_d=10,
        embs=load_embedding_iterator(emb_path),  # use the emb_path argument, not the global args
        only_words=False
    )
    # weights = myio.create_idf_weights(corpus_path, embedding_layer)  # todo
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V, len(raw_corpus)))

    if layer.lower() == "lstm":
        from models import LstmQR as Model
    elif layer.lower() in ["bilstm", "bigru"]:
        from models import BiRNNQR as Model
    elif layer.lower() == "cnn":
        from models import CnnQR as Model
    elif layer.lower() == "gru":
        from models import GruQR as Model
    else:
        raise Exception("no correct layer given")

    model = Model(args={"layer": layer}, embedding_layer=embedding_layer, weights=None)
    model.load_n_set_model(model_path, session)
    say("model initialized\n")

    self.model = model

    def score_func(titles, bodies, cur_sess):
        # the placeholders expect (sequence length, batch), hence the transposes
        feed_dict = {
            self.model.titles_words_ids_placeholder: titles.T,
            self.model.bodies_words_ids_placeholder: bodies.T,
            self.model.dropout_prob: 0.,  # no dropout at inference time
        }
        _scores = cur_sess.run(self.model.scores, feed_dict)
        return _scores

    self.score_func = score_func
    say("scoring function compiled\n")
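# Usage sketch (hypothetical paths; QRAPI is the wrapper class this
# constructor belongs to). titles/bodies are assumed to be (batch, seq_len)
# int matrices of word ids padded with the <padding> id; score_func
# transposes them into the layout the placeholders expect.
#
#   with tf.Session() as sess:
#       api = QRAPI("model.ckpt", "corpus.txt.gz", "vectors.txt.gz", sess, "lstm")
#       scores = api.score_func(titles, bodies, sess)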
default="") # embeddings file argparser.add_argument("--test_file", type=str, default="") argparser.add_argument("--full_results_file", type=str, default="") # to write in argparser.add_argument("--results_file", type=str, default="") # to write in argparser.add_argument("--layer", type=str, default="lstm") args = argparser.parse_args() print '\n', args, '\n' with tf.Session() as sess: myqrapi = QRAPI(args.model, args.corpus, args.embeddings, sess, args.layer) raw_corpus = myio.read_corpus(args.corpus) embedding_layer = myqrapi.model.embedding_layer ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, max_len=100) test = myio.read_annotations(args.test_file, K_neg=-1, prune_pos_cnt=-1) test = create_eval_batches(ids_corpus, test, myqrapi.model.padding_id, pad_left=not myqrapi.model.args.average) testmap, testmrr, testpat1, testpat5, rank_labels, rank_ids, qids, rank_scores = myqrapi.evaluate( test, sess) if args.full_results_file: with open(args.full_results_file, 'w') as f: