def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=args.hidden_dim,
        embs=load_embedding_iterator(args.embeddings) if args.embeddings else None
    )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, max_len=args.max_seq_len)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V, len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev_raw = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev_raw, padding_id,
                                       pad_left=not args.average, merge=args.merge)
    if args.test:
        test_raw = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test_raw, padding_id,
                                        pad_left=not args.average, merge=args.merge)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                                            padding_id, pad_left=not args.average,
                                            merge=args.merge)
        say("{} to create batches\n".format(time.time() - start_time))
        say("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(x[0].ravel()) for x in train_batches),
            sum(len(x[1].ravel()) for x in train_batches)
        ))
        train_batches = None

        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)
        model.ready()

        # set parameters using pre-trained network
        if args.load_pretrain:
            model.encoder.load_pretrained_parameters(args)

        model.train(
            ids_corpus, train,
            (dev, dev_raw) if args.dev else None,
            (test, test_raw) if args.test else None
        )

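# ---------------------------------------------------------------------------
# Usage sketch (not part of the original source): a minimal argparse driver
# showing how an entry point like the one above might be invoked. The flag
# names mirror the args.* attributes the function reads; the types and
# defaults are illustrative assumptions, not the original script's values.
# In the real script this block would sit at the bottom of the module.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="question-retrieval training (sketch)")
    parser.add_argument("--corpus", type=str, required=True)      # raw question corpus
    parser.add_argument("--train", type=str, default="")          # training annotations
    parser.add_argument("--dev", type=str, default="")            # dev annotations
    parser.add_argument("--test", type=str, default="")           # test annotations
    parser.add_argument("--embeddings", type=str, default="")     # pre-trained word vectors
    parser.add_argument("--hidden_dim", type=int, default=200)    # n_d of the embedding layer
    parser.add_argument("--max_seq_len", type=int, default=100)
    parser.add_argument("--batch_size", type=int, default=40)
    parser.add_argument("--average", type=int, default=0)         # also controls pad_left
    parser.add_argument("--merge", type=int, default=0)
    parser.add_argument("--reweight", type=int, default=0)        # IDF-weight the tokens
    parser.add_argument("--load_pretrain", type=str, default="")  # pre-trained encoder params
    main(parser.parse_args())
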
def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=args.hidden_dim,
        cut_off=args.cut_off,
        embs=load_embedding_iterator(args.embeddings) if args.embeddings else None
    )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V, len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]
    bos_id = embedding_layer.vocab_map["<s>"]
    eos_id = embedding_layer.vocab_map["</s>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev = myio.read_annotations(args.dev, K_neg=20, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=20, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id)

    if args.heldout:
        # split off a held-out portion of the corpus (ids listed in the
        # args.heldout file) and keep the remainder for training
        with open(args.heldout) as fin:
            heldout_ids = fin.read().split()
        heldout_corpus = dict((id, ids_corpus[id]) for id in heldout_ids
                              if id in ids_corpus)
        train_corpus = dict((id, ids_corpus[id]) for id in ids_corpus
                            if id not in heldout_corpus)
        heldout = myio.create_batches(heldout_corpus, [], args.batch_size,
                                      padding_id, bos_id, eos_id, auto_encode=True)
        heldout = [myio.create_one_batch(b1, t2, padding_id)
                   for t1, b1, t2 in heldout]
        say("heldout examples={}\n".format(len(heldout_corpus)))

    if args.train:
        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)

        start_time = time.time()
        train = myio.read_annotations(args.train)
        # without annotations, train purely as an auto-encoder on the corpus
        if not args.use_anno:
            train = []
        train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                                            model.padding_id, model.bos_id,
                                            model.eos_id, auto_encode=True)
        say("{} to create batches\n".format(time.time() - start_time))

        model.ready()
        model.train(
            ids_corpus if not args.heldout else train_corpus,
            train,
            dev if args.dev else None,
            test if args.test else None,
            heldout if args.heldout else None
        )

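# `say`, `myio`, `Model`, and `load_embedding_iterator` come from the
# surrounding repository and are not defined in this excerpt. A minimal
# stand-in for `say`, assuming it simply writes progress messages unbuffered
# to stderr:
import sys

def say(s, stream=sys.stderr):
    # write without adding a newline and flush so messages appear immediately
    stream.write(s)
    stream.flush()
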
def main():
    print 'Starting at: {}\n'.format(datetime.now())
    raw_corpus = qaio.read_corpus(args.corpus)
    embedding_layer = create_embedding_layer(
        n_d=200,
        embs=load_embedding_iterator(args.embeddings),
        only_words=False if args.use_embeddings else True,
        trainable=args.trainable
    )
    print("vocab size={}, corpus size={}\n".format(embedding_layer.n_V, len(raw_corpus)))

    if args.reweight:
        weights = qaio.create_idf_weights(args.corpus, embedding_layer)

    # load the label tags and map the tagged corpus to word/tag ids
    label_tags = pickle.load(open(args.tags_file, 'rb'))
    print '\nloaded {} tags'.format(len(label_tags))

    raw_corpus_tags = tpio.read_corpus(args.corpus_w_tags, with_tags=True)
    ids_corpus_tags = tpio.map_corpus(raw_corpus_tags, embedding_layer,
                                      label_tags, max_len=args.max_seq_len)
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.dev:
        dev = qaio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus_tags, dev, padding_id,
                                       N_neg=args.n_neg, samples_file=args.samples_file)
    if args.test:
        test = qaio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus_tags, test, padding_id,
                                        N_neg=args.n_neg, samples_file=args.samples_file)
    if args.train:
        train = qaio.read_annotations(args.train)

    # pick the encoder architecture for the joint question-retrieval /
    # tag-prediction model
    if args.layer.lower() == "lstm":
        from models import LstmQRTP as Model
    elif args.layer.lower() in ["bilstm", "bigru"]:
        from models import BiRNNQRTP as Model
    elif args.layer.lower() == "cnn":
        from models import CnnQRTP as Model
    elif args.layer.lower() == "gru":
        from models import GruQRTP as Model
    else:
        raise Exception("unknown layer type: {}".format(args.layer))

    model = Model(args, embedding_layer, len(label_tags),
                  weights=weights if args.reweight else None)
    model.ready()
    print 'total (non) trainable params: ', model.num_parameters()

    if args.load_pre_trained_part:
        # need to remove the old assigns to embeddings
        model.init_assign_ops = model.load_pre_trained_part(args.load_pre_trained_part)
        print '\nmodel init_assign_ops: {}\n'.format(model.init_assign_ops)

    model.train_model(ids_corpus_tags, train,
                      dev=dev if args.dev else None,
                      test=test if args.test else None)
    print '\nEnded at: {}'.format(datetime.now())

def main():
    print 'Starting at: {}\n'.format(datetime.now())
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = create_embedding_layer(
        n_d=200,
        embs=load_embedding_iterator(args.embeddings),
        only_words=False if args.use_embeddings else True,
        trainable=args.trainable
    )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, max_len=args.max_seq_len)
    print("vocab size={}, corpus size={}\n".format(embedding_layer.n_V, len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    # pick the encoder architecture for the question-retrieval model
    if args.layer.lower() == "lstm":
        from models import LstmQR as Model
    elif args.layer.lower() in ["bilstm", "bigru"]:
        from models import BiRNNQR as Model
    elif args.layer.lower() == "cnn":
        from models import CnnQR as Model
    elif args.layer.lower() == "gru":
        from models import GruQR as Model
    else:
        raise Exception("unknown layer type: {}".format(args.layer))

    if args.dev:
        dev = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id, pad_left=False)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id, pad_left=False)

    model = Model(args, embedding_layer, weights=weights if args.reweight else None)
    model.ready()
    print 'total (non) trainable params: ', model.num_parameters()

    if args.load_pre_trained_part:
        # need to remove the old assigns to embeddings
        model.init_assign_ops = model.load_pre_trained_part(args.load_pre_trained_part)
        print '\nmodel init_assign_ops: {}\n'.format(model.init_assign_ops)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                                            padding_id, pad_left=False)
        print("{} to create batches\n".format(time.time() - start_time))
        print("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(x[0].ravel()) + len(x[1].ravel()) for x in train_batches),
            sum(len(x[2].ravel()) for x in train_batches)
        ))

        model.train_model(ids_corpus, train,
                          dev=dev if args.dev else None,
                          test=test if args.test else None)
    print '\nEnded at: {}'.format(datetime.now())

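# Design note on the layer dispatch above: the if/elif ladder can equally be
# written as a lookup table, which keeps the layer-name-to-class mapping in
# one place. A sketch, assuming the same `models` module as above:
_LAYER_TO_MODEL = {
    "lstm": "LstmQR",
    "bilstm": "BiRNNQR",
    "bigru": "BiRNNQR",
    "cnn": "CnnQR",
    "gru": "GruQR",
}

def pick_model_class(layer):
    # resolve the class lazily so the models module is only imported on use
    import models
    try:
        return getattr(models, _LAYER_TO_MODEL[layer.lower()])
    except KeyError:
        raise Exception("unknown layer type: {}".format(layer))
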
def main(args):
    raw_corpus = myio.read_corpus(args.corpus,
                                  args.translations or None,
                                  args.translatable_ids or None,
                                  args.generated_questions_train or None)
    generated_questions_eval = myio.read_generated_questions(args.generated_questions)

    if args.trainable_embeddings == 1:
        embedding_layer = myio.create_embedding_layer(
            raw_corpus,
            n_d=args.hidden_dim,
            cut_off=args.cut_off,
            embs=load_embedding_iterator(args.embeddings) if args.embeddings else None,
            fix_init_embs=False
        )
    else:
        embedding_layer = myio.create_embedding_layer(
            raw_corpus,
            n_d=args.hidden_dim,
            cut_off=args.cut_off,
            embs=load_embedding_iterator(args.embeddings) if args.embeddings else None
        )

    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer,
                                 max_len=args.max_seq_len,
                                 generated_questions=generated_questions_eval)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V, len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev = myio.read_annotations(args.dev, K_neg=args.dev_pool_size,
                                    prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id,
                                       pad_left=not args.average)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id,
                                        pad_left=not args.average)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(args.train,
                                      training_data_percent=args.training_data_percent)
        train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                                            padding_id, pad_left=not args.average,
                                            include_generated_questions=True)
        say("{} to create batches\n".format(time.time() - start_time))
        say("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(x[0].ravel()) + len(x[1].ravel()) for x in train_batches),
            sum(len(x[2].ravel()) for x in train_batches)
        ))
        train_batches = None

    model = Model(args, embedding_layer,
                  weights=weights if args.reweight else None)
    model.ready()

    # set parameters using a pre-trained network, then train
    if args.do_train == 1:
        if args.load_pretrain:
            model.load_pretrained_parameters(args)
        model.train(ids_corpus, train,
                    dev if args.dev else None,
                    test if args.test else None)

    # evaluate with pre-trained parameters (the loop is a leftover from a
    # variant that averaged predictions over 10 runs; it now runs once)
    if args.do_evaluate == 1:
        model.load_pretrained_parameters(args)
        for i in range(1):
            model.just_eval(dev if args.dev else None,
                            test if args.test else None)

    # analyze the results
    if len(args.analyze_file.strip()) > 0:
        model.load_pretrained_parameters(args)
        file_name = args.analyze_file.strip()  # e.g. 'AskUbuntu.Rcnn_analysis3.gt(es)-gt.txt'
        model.analyze(file_name, embedding_layer, dev)