def main(): print 'Starting at: {}\n'.format(datetime.now()) s_time = time.time() df = read_df(args.df_path) df = df.fillna(u'') label_tags = pickle.load(open(args.tags_file, 'rb')) print '\nloaded {} tags'.format(len(label_tags)) raw_corpus = myio.read_corpus(args.corpus_w_tags, with_tags=True) embedding_layer = create_embedding_layer( n_d=200, embs=load_embedding_iterator(args.embeddings), only_words=False if args.use_embeddings else True, # only_words will take the words from embedding file and make random initial embeddings trainable=args.trainable ) ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, label_tags, max_len=args.max_seq_len) print("vocab size={}, corpus size={}\n".format(embedding_layer.n_V, len(raw_corpus))) padding_id = embedding_layer.vocab_map["<padding>"] if args.reweight: weights = myio.create_idf_weights(args.corpus_w_tags, embedding_layer, with_tags=True) if args.layer.lower() == "lstm": from models import LstmMultiTagsClassifier as Model elif args.layer.lower() in ["bilstm", "bigru"]: from models import BiRNNMultiTagsClassifier as Model elif args.layer.lower() == "cnn": from models import CnnMultiTagsClassifier as Model elif args.layer.lower() == "gru": from models import GruMultiTagsClassifier as Model else: raise Exception("no correct layer given") if args.cross_val: train, dev, test = myio.create_cross_val_batches(df, ids_corpus, args.batch_size, padding_id) else: dev = list(myio.create_batches( df, ids_corpus, 'dev', args.batch_size, padding_id, N_neg=args.n_neg, samples_file=args.samples_file)) test = list(myio.create_batches( df, ids_corpus, 'test', args.batch_size, padding_id, N_neg=args.n_neg, samples_file=args.samples_file)) # baselines_eval(train, dev, test) model = Model(args, embedding_layer, len(label_tags), weights=weights if args.reweight else None) model.ready() print 'total (non) trainable params: ', model.num_parameters() if args.load_pre_trained_part: # need to remove the old assigns to embeddings 
model.init_assign_ops = model.load_pre_trained_part(args.load_pre_trained_part) print '\nmodel init_assign_ops: {}\n'.format(model.init_assign_ops) model.train_model(df, ids_corpus, dev=dev, test=test) print '\nEnded at: {}'.format(datetime.now())
def __init__(self, model_path, corpus_path, emb_path, session, layer='lstm'):
    """Load a trained question-retrieval model and build a scoring closure.

    Reads the corpus at *corpus_path*, restores the model stored at
    *model_path* into *session*, and exposes ``self.score_func`` that runs
    the model's scoring op for given title/body id matrices.
    """
    raw_corpus = myio.read_corpus(corpus_path)
    # NOTE(review): emb_path is accepted but never used; embeddings come
    # from the module-level args.embeddings -- confirm whether emb_path was
    # meant to be used here instead.
    embedding_layer = create_embedding_layer(
        n_d=10,
        embs=load_embedding_iterator(args.embeddings),
        only_words=False
    )
    say("vocab size={}, corpus size={}\n".format(
        embedding_layer.n_V, len(raw_corpus)))

    if layer.lower() == "lstm":
        from models import LstmQR as Model
    elif layer.lower() in ["bilstm", "bigru"]:
        from models import BiRNNQR as Model
    elif layer.lower() == "cnn":
        from models import CnnQR as Model
    elif layer.lower() == "gru":
        from models import GruQR as Model
    else:
        # Previously an unknown layer fell through to a NameError on Model;
        # raise explicitly, matching the other entry points in this file.
        raise Exception("no correct layer given")

    # NOTE(review): the model is configured from the global args.layer, not
    # the `layer` parameter used for the import above -- verify they agree.
    model = Model(args={"layer": args.layer},
                  embedding_layer=embedding_layer,
                  weights=None)
    model.load_n_set_model(model_path, session)
    say("model initialized\n")
    self.model = model

    def score_func(titles, bodies, cur_sess):
        # Inputs arrive as (batch, seq_len); the placeholders expect the
        # transposed layout.
        feed_dict = {
            self.model.titles_words_ids_placeholder: titles.T,  # IT IS TRANSPOSE ;)
            self.model.bodies_words_ids_placeholder: bodies.T,  # IT IS TRANSPOSE ;)
            self.model.dropout_prob: 0.,
        }
        _scores = cur_sess.run(self.model.scores, feed_dict)
        return _scores
    self.score_func = score_func
    say("scoring function compiled\n")
def main(): print 'Starting at: {}\n'.format(datetime.now()) raw_corpus = qaio.read_corpus(args.corpus) embedding_layer = create_embedding_layer( n_d=200, embs=load_embedding_iterator(args.embeddings), only_words=False if args.use_embeddings else True, trainable=args.trainable) print("vocab size={}, corpus size={}\n".format(embedding_layer.n_V, len(raw_corpus))) if args.reweight: weights = qaio.create_idf_weights(args.corpus, embedding_layer) label_tags = pickle.load(open(args.tags_file, 'rb')) print '\nloaded {} tags'.format(len(label_tags)) raw_corpus_tags = tpio.read_corpus(args.corpus_w_tags, with_tags=True) ids_corpus_tags = tpio.map_corpus(raw_corpus_tags, embedding_layer, label_tags, max_len=args.max_seq_len) padding_id = embedding_layer.vocab_map["<padding>"] if args.dev: dev = qaio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1) dev = myio.create_eval_batches(ids_corpus_tags, dev, padding_id, N_neg=args.n_neg, samples_file=args.samples_file) if args.test: test = qaio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1) test = myio.create_eval_batches(ids_corpus_tags, test, padding_id, N_neg=args.n_neg, samples_file=args.samples_file) if args.train: train = qaio.read_annotations(args.train) if args.layer.lower() == "lstm": from models import LstmQRTP as Model elif args.layer.lower() in ["bilstm", "bigru"]: from models import BiRNNQRTP as Model elif args.layer.lower() == "cnn": from models import CnnQRTP as Model elif args.layer.lower() == "gru": from models import GruQRTP as Model else: raise Exception("no correct layer given") model = Model(args, embedding_layer, len(label_tags), weights=weights if args.reweight else None) model.ready() print 'total (non) trainable params: ', model.num_parameters() if args.load_pre_trained_part: # need to remove the old assigns to embeddings model.init_assign_ops = model.load_pre_trained_part( args.load_pre_trained_part) print '\nmodel init_assign_ops: {}\n'.format(model.init_assign_ops) 
model.train_model(ids_corpus_tags, train, dev=dev if args.dev else None, test=test if args.test else None) print '\nEnded at: {}'.format(datetime.now())
def main(): print 'Starting at: {}\n'.format(datetime.now()) raw_corpus = myio.read_corpus(args.corpus) embedding_layer = create_embedding_layer( n_d=200, embs=load_embedding_iterator(args.embeddings), only_words=False if args.use_embeddings else True, trainable=args.trainable ) ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, max_len=args.max_seq_len) print("vocab size={}, corpus size={}\n".format( embedding_layer.n_V, len(raw_corpus) )) padding_id = embedding_layer.vocab_map["<padding>"] if args.reweight: weights = myio.create_idf_weights(args.corpus, embedding_layer) if args.layer.lower() == "lstm": from models import LstmQR as Model elif args.layer.lower() in ["bilstm", "bigru"]: from models import BiRNNQR as Model elif args.layer.lower() == "cnn": from models import CnnQR as Model elif args.layer.lower() == "gru": from models import GruQR as Model else: raise Exception("no correct layer given") if args.dev: dev = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1) dev = myio.create_eval_batches(ids_corpus, dev, padding_id, pad_left=False) if args.test: test = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1) test = myio.create_eval_batches(ids_corpus, test, padding_id, pad_left=False) model = Model(args, embedding_layer, weights=weights if args.reweight else None) model.ready() print 'total (non) trainable params: ', model.num_parameters() if args.load_pre_trained_part: # need to remove the old assigns to embeddings model.init_assign_ops = model.load_pre_trained_part(args.load_pre_trained_part) print '\nmodel init_assign_ops: {}\n'.format(model.init_assign_ops) if args.train: start_time = time.time() train = myio.read_annotations(args.train) train_batches = myio.create_batches( ids_corpus, train, args.batch_size, padding_id, pad_left=False ) print("{} to create batches\n".format(time.time()-start_time)) print("{} batches, {} tokens in total, {} triples in total\n".format( len(train_batches), sum(len(x[0].ravel())+len(x[1].ravel()) 
for x in train_batches), sum(len(x[2].ravel()) for x in train_batches) )) model.train_model( ids_corpus, train, dev=dev if args.dev else None, test=test if args.test else None ) print '\nEnded at: {}'.format(datetime.now())
default="") # to write in argparser.add_argument("--results_file", type=str, default="") # to write in args = argparser.parse_args() print '\n', args, '\n' df = read_df(args.df_corpus) df = df.fillna(u'') label_tags = pickle.load(open(args.tags_file, 'rb')) raw_corpus = myio.read_corpus(args.corpus_w_tags, with_tags=True) embedding_layer = create_embedding_layer(n_d=10, embs=load_embedding_iterator( args.embeddings), only_words=False) with tf.Session() as sess: myqrapi = TPAPI(args.model, embedding_layer, sess, len(label_tags), args.layer) embedding_layer = myqrapi.model.embedding_layer ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, label_tags, max_len=args.max_seq_len) print("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,