def __init__(self, model_path, corpus_path, emb_path):
    raw_corpus = myio.read_corpus(corpus_path)
    embedding_layer = myio.create_embedding_layer(
                raw_corpus,
                n_d = 10,
                cut_off = 1,
                embs = load_embedding_iterator(emb_path)
            )
    weights = myio.create_idf_weights(corpus_path, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(
            embedding_layer.n_V,
            len(raw_corpus)
        ))

    model = Model(args=None, embedding_layer=embedding_layer, weights=weights)
    model_data = model.load_model(model_path)
    model.set_model(model_data)
    model.dropout.set_value(0.0)
    say("model initialized\n")

    score_func = theano.function(
            inputs = [ model.idts, model.idbs ],
            outputs = model.scores,
            on_unused_input='ignore'
        )
    self.model = model
    self.score_func = score_func
    say("scoring function compiled\n")

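# Usage sketch for the scorer above -- not part of the original source. It
# assumes the __init__ belongs to a class named QRAPI, that the three file
# paths exist, and that Model exposes the layer as model.embedding_layer;
# idts/idbs are the padded (seq_len x batch) int32 id matrices the compiled
# Theano function expects.
import numpy as np

api = QRAPI("model.pkl.gz", "corpus.txt.gz", "vectors.txt.gz")  # hypothetical paths
pad_id = api.model.embedding_layer.vocab_map["<padding>"]
idts = np.full((20, 4), pad_id, dtype="int32")  # 4 title columns, 20 tokens each
idbs = np.full((60, 4), pad_id, dtype="int32")  # the matching body columns
print api.score_func(idts, idbs)                # one score per candidate
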
def main():
    print 'Starting at: {}\n'.format(datetime.now())
    s_time = time.time()

    df = read_df(args.df_path)
    df = df.fillna(u'')

    label_tags = pickle.load(open(args.tags_file, 'rb'))
    print '\nloaded {} tags'.format(len(label_tags))

    raw_corpus = myio.read_corpus(args.corpus_w_tags, with_tags=True)
    embedding_layer = create_embedding_layer(
        n_d=200,
        embs=load_embedding_iterator(args.embeddings),
        only_words=False if args.use_embeddings else True,
        # only_words=True keeps just the words from the embedding file and gives them random initial embeddings
        trainable=args.trainable
    )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, label_tags, max_len=args.max_seq_len)
    print("vocab size={}, corpus size={}\n".format(embedding_layer.n_V, len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus_w_tags, embedding_layer, with_tags=True)

    if args.layer.lower() == "lstm":
        from models import LstmMultiTagsClassifier as Model
    elif args.layer.lower() in ["bilstm", "bigru"]:
        from models import BiRNNMultiTagsClassifier as Model
    elif args.layer.lower() == "cnn":
        from models import CnnMultiTagsClassifier as Model
    elif args.layer.lower() == "gru":
        from models import GruMultiTagsClassifier as Model
    else:
        raise Exception("no correct layer given")

    if args.cross_val:
        train, dev, test = myio.create_cross_val_batches(df, ids_corpus, args.batch_size, padding_id)
    else:
        dev = list(myio.create_batches(
            df, ids_corpus, 'dev', args.batch_size, padding_id,
            N_neg=args.n_neg, samples_file=args.samples_file))
        test = list(myio.create_batches(
            df, ids_corpus, 'test', args.batch_size, padding_id,
            N_neg=args.n_neg, samples_file=args.samples_file))
    # baselines_eval(train, dev, test)

    model = Model(args, embedding_layer, len(label_tags), weights=weights if args.reweight else None)
    model.ready()

    print 'total (non) trainable params: ', model.num_parameters()

    if args.load_pre_trained_part:
        # need to remove the old assigns to embeddings
        model.init_assign_ops = model.load_pre_trained_part(args.load_pre_trained_part)
        print '\nmodel init_assign_ops: {}\n'.format(model.init_assign_ops)

    model.train_model(df, ids_corpus, dev=dev, test=test)
    print '\nEnded at: {}'.format(datetime.now())

def __init__(self, model_path, corpus_path, emb_path):
    raw_corpus = myio.read_corpus(corpus_path)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus, n_d=10, cut_off=1,
        embs=load_embedding_iterator(emb_path))
    weights = myio.create_idf_weights(corpus_path, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V, len(raw_corpus)))

    model = Model(args=None, embedding_layer=embedding_layer, weights=weights)
    model_data = model.load_model(model_path)
    model.set_model(model_data)
    model.dropout.set_value(0.0)
    say("model initialized\n")

    score_func = theano.function(inputs=[model.idts, model.idbs],
                                 outputs=model.scores,
                                 on_unused_input='ignore')
    self.model = model
    self.score_func = score_func
    say("scoring function compiled\n")

def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus, n_d=args.hidden_dim,
        embs=load_embedding_iterator(args.embeddings) if args.embeddings else None)
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, max_len=args.max_seq_len)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V, len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev_raw = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev_raw, padding_id,
                                       pad_left=not args.average, merge=args.merge)
    if args.test:
        test_raw = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test_raw, padding_id,
                                        pad_left=not args.average, merge=args.merge)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(ids_corpus, train, args.batch_size, padding_id,
                                            pad_left=not args.average, merge=args.merge)
        say("{} to create batches\n".format(time.time() - start_time))
        say("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(x[0].ravel()) for x in train_batches),
            sum(len(x[1].ravel()) for x in train_batches)))
        train_batches = None  # drop the batches; they were built just for the stats above

        model = Model(args, embedding_layer, weights=weights if args.reweight else None)
        model.ready()

        # set parameters using pre-trained network
        if args.load_pretrain:
            model.encoder.load_pretrained_parameters(args)

        model.train(ids_corpus, train,
                    (dev, dev_raw) if args.dev else None,
                    (test, test_raw) if args.test else None)

def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
                raw_corpus,
                n_d = args.hidden_dim,
                cut_off = args.cut_off,
                embs = load_embedding_iterator(args.embeddings) if args.embeddings else None
            )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(
            embedding_layer.n_V,
            len(raw_corpus)
        ))
    padding_id = embedding_layer.vocab_map["<padding>"]
    bos_id = embedding_layer.vocab_map["<s>"]
    eos_id = embedding_layer.vocab_map["</s>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev = myio.read_annotations(args.dev, K_neg=20, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=20, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id)

    if args.heldout:
        with open(args.heldout) as fin:
            heldout_ids = fin.read().split()
        heldout_corpus = dict((id, ids_corpus[id]) for id in heldout_ids if id in ids_corpus)
        train_corpus = dict((id, ids_corpus[id]) for id in ids_corpus
                            if id not in heldout_corpus)
        heldout = myio.create_batches(heldout_corpus, [ ], args.batch_size,
                                      padding_id, bos_id, eos_id, auto_encode=True)
        heldout = [ myio.create_one_batch(b1, t2, padding_id) for t1, b1, t2 in heldout ]
        say("heldout examples={}\n".format(len(heldout_corpus)))

    if args.train:
        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)

        start_time = time.time()
        train = myio.read_annotations(args.train)
        if not args.use_anno:
            train = [ ]
        train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                                            model.padding_id, model.bos_id, model.eos_id,
                                            auto_encode=True)
        say("{} to create batches\n".format(time.time()-start_time))

        model.ready()
        model.train(
                ids_corpus if not args.heldout else train_corpus,
                train,
                dev if args.dev else None,
                test if args.test else None,
                heldout if args.heldout else None
            )

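# Not in the original source: the --heldout file above is read with
# fin.read().split(), so it is just whitespace-separated question ids.
# A minimal sketch that writes one (the ids are made up):
with open("heldout_ids.txt", "w") as fout:
    fout.write("101 202 303\n")
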
def create_embedding_layer(path):
    embedding_layer = EmbeddingLayer(
        n_d=200,
        vocab=["<unk>", "<padding>"],
        embs=load_embedding_iterator(path),
        oov="<unk>",
        # fix_init_embs = True
        fix_init_embs=False)
    return embedding_layer

def create_embedding_layer(path):
    embedding_layer = EmbeddingLayer(
            n_d = 200,
            vocab = [ "<unk>", "<padding>" ],
            embs = load_embedding_iterator(path),
            oov = "<unk>",
            # fix_init_embs = True
            fix_init_embs = False
        )
    return embedding_layer

def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
                raw_corpus,
                n_d = args.hidden_dim,
                embs = load_embedding_iterator(args.embeddings) if args.embeddings else None
            )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, max_len=args.max_seq_len)
    say("vocab size={}, corpus size={}\n".format(
            embedding_layer.n_V,
            len(raw_corpus)
        ))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev_raw = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev_raw, padding_id,
                                       pad_left=not args.average, merge=args.merge)
    if args.test:
        test_raw = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test_raw, padding_id,
                                        pad_left=not args.average, merge=args.merge)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(ids_corpus, train, args.batch_size, padding_id,
                                            pad_left = not args.average, merge=args.merge)
        say("{} to create batches\n".format(time.time()-start_time))
        say("{} batches, {} tokens in total, {} triples in total\n".format(
                len(train_batches),
                sum(len(x[0].ravel()) for x in train_batches),
                sum(len(x[1].ravel()) for x in train_batches)
            ))
        train_batches = None

        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)
        model.ready()

        # set parameters using pre-trained network
        if args.load_pretrain:
            model.encoder.load_pretrained_parameters(args)

        model.train(
                ids_corpus, train,
                (dev, dev_raw) if args.dev else None,
                (test, test_raw) if args.test else None
            )

def create_embedding_layer(emb_filename, n_d=100, vocab_dict=None,
                           unk="<unk>", padding="<padding>", fix_init_embs=True):
    embs = load_embedding_iterator(emb_filename, vocab_dict, skip_head=True) if emb_filename else None
    embedding_layer = EmbeddingLayer(
        n_d=n_d,
        vocab=[padding, unk] + (vocab_dict.keys() if not embs else []),
        embs=embs,
        fix_init_embs=fix_init_embs)
    return embedding_layer

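# Usage sketch for the helper above -- not part of the original source; the
# embedding file name is a placeholder and vocab_dict's contents are made up.
vocab_dict = {"hello": 0, "world": 1}
layer = create_embedding_layer("vectors.txt.gz",
                               n_d=100,
                               vocab_dict=vocab_dict,
                               fix_init_embs=True)
ids = layer.map_to_ids(["hello", "never-seen-token"])  # OOV words map to <unk>
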
def main(args):
    assert args.train, "Training set required"
    assert args.dev, "Dev set required"
    assert args.test, "Test set required"
    assert args.emb, "Pre-trained word embeddings required."
    assert args.aspect_seeds, "Aspect seeds required."
    print args

    seeds = load_lis(args.aspect_seeds)
    say("loaded {} aspect seeds\n".format(len(seeds)))

    embedding_layer = EmbeddingLayer(
            n_d = 100,
            vocab = [ "<unk>" ],
            pre_embs = load_embedding_iterator(args.emb),
        )
    seeds_id = np.array(map(lambda seed: embedding_layer.map_to_ids(seed.strip().split()).tolist(),
                            seeds), dtype = np.int32)

    if args.train:
        train_x, train_y = load_doc_corpus(embedding_layer, args.train)
    if args.dev:
        dev_x, dev_y = load_doc_corpus(embedding_layer, args.dev)
    if args.test:
        test_x, test_y = load_doc_corpus(embedding_layer, args.test)

    if args.train:
        model = Model(
                args = args,
                embedding_layer = embedding_layer,
                num_aspects = len(seeds_id),
                query = seeds_id
            )
        if args.load:
            print 'loading model...'
            model.load_model(args.load)
        else:
            model.ready()
        print 'training...'
        model.train(
                (train_x, train_y),
                (dev_x, dev_y) if args.dev else None,
                (test_x, test_y) if args.test else None
            )

def main(args):
    print(args)
    model = None
    assert args.embedding, "Pre-trained word embeddings required."
    embedding_layer = EmbeddingLayer(
            n_d = args.hidden_dim,
            vocab = [ "<unk>" ],
            embs = load_embedding_iterator(args.embedding)
        )

    if args.train:
        train_x, train_y = read_corpus(args.train)
        train_x = [ embedding_layer.map_to_ids(x) for x in train_x ]
    if args.dev:
        dev_x, dev_y = read_corpus(args.dev)
        dev_x = [ embedding_layer.map_to_ids(x) for x in dev_x ]
    if args.test:
        test_x, test_y = read_corpus(args.test)
        test_x = [ embedding_layer.map_to_ids(x) for x in test_x ]

    if args.train:
        model = Model(
                args = args,
                embedding_layer = embedding_layer,
                nclasses = max(train_y)+1
            )
        model.ready()
        model.train(
                (train_x, train_y),
                (dev_x, dev_y) if args.dev else None,
                (test_x, test_y) if args.test else None,
            )

    if args.load and args.test and not args.train:
        # model.args and model.nclasses will be loaded from file
        model = Model(
                args = None,
                embedding_layer = embedding_layer,
                nclasses = -1
            )
        model.load_model(args.load)
        accuracy = model.evaluate_set(test_x, test_y)
        print accuracy

def main(args):
    print args
    model = None
    assert args.embedding, "Pre-trained word embeddings required."
    embedding_layer = EmbeddingLayer(
            n_d = args.hidden_dim,
            vocab = [ "<unk>" ],
            embs = load_embedding_iterator(args.embedding)
        )

    if args.train:
        train_x, train_y = read_corpus(args.train)
        train_x = [ embedding_layer.map_to_ids(x) for x in train_x ]
    if args.dev:
        dev_x, dev_y = read_corpus(args.dev)
        dev_x = [ embedding_layer.map_to_ids(x) for x in dev_x ]
    if args.test:
        test_x, test_y = read_corpus(args.test)
        test_x = [ embedding_layer.map_to_ids(x) for x in test_x ]

    if args.train:
        model = Model(
                args = args,
                embedding_layer = embedding_layer,
                nclasses = max(train_y)+1
            )
        model.ready()
        model.train(
                (train_x, train_y),
                (dev_x, dev_y) if args.dev else None,
                (test_x, test_y) if args.test else None,
            )

    if args.load and args.test and not args.train:
        # model.args and model.nclasses will be loaded from file
        model = Model(
                args = None,
                embedding_layer = embedding_layer,
                nclasses = -1
            )
        model.load_model(args.load)
        accuracy = model.evaluate_set(test_x, test_y)
        print accuracy

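# Typical invocations of the two mains above -- not part of the original
# source; script and file names are placeholders:
#   python main.py --embedding vecs.txt.gz --train train.txt --dev dev.txt --test test.txt
#   python main.py --embedding vecs.txt.gz --test test.txt --load saved_model   # eval only
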
def __init__(self, model_path, corpus_path, emb_path, session, layer='lstm'):
    raw_corpus = myio.read_corpus(corpus_path)
    embedding_layer = create_embedding_layer(n_d=10,
                                             embs=load_embedding_iterator(emb_path),
                                             only_words=False)
    # weights = myio.create_idf_weights(corpus_path, embedding_layer)  # todo
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V, len(raw_corpus)))

    if layer.lower() == "lstm":
        from models import LstmQR as Model
    elif layer.lower() in ["bilstm", "bigru"]:
        from models import BiRNNQR as Model
    elif layer.lower() == "cnn":
        from models import CnnQR as Model
    elif layer.lower() == "gru":
        from models import GruQR as Model
    else:
        raise Exception("no correct layer given")

    model = Model(args={"layer": layer}, embedding_layer=embedding_layer, weights=None)
    model.load_n_set_model(model_path, session)
    say("model initialized\n")

    self.model = model

    def score_func(titles, bodies, cur_sess):
        feed_dict = {
            self.model.titles_words_ids_placeholder: titles.T,  # note: transposed
            self.model.bodies_words_ids_placeholder: bodies.T,  # note: transposed
            self.model.dropout_prob: 0.,
        }
        _scores = cur_sess.run(self.model.scores, feed_dict)
        return _scores
    self.score_func = score_func
    say("scoring function compiled\n")

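# Usage sketch for the TensorFlow scorer above -- not part of the original
# source. It assumes the __init__ belongs to a class named QRAPI, that Model
# keeps the layer as model.embedding_layer, and that a saved model, corpus
# and embedding file exist at the hypothetical paths.
import numpy as np
import tensorflow as tf

with tf.Session() as sess:
    api = QRAPI("model_dir", "corpus.txt.gz", "vectors.txt.gz", sess, layer="gru")
    pad_id = api.model.embedding_layer.vocab_map["<padding>"]
    idts = np.full((20, 4), pad_id, dtype="int32")  # seq_len x batch; transposed inside score_func
    idbs = np.full((60, 4), pad_id, dtype="int32")
    print api.score_func(idts, idbs, sess)
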
def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus, n_d=args.hidden_dim, cut_off=args.cut_off,
        embs=load_embedding_iterator(args.embeddings) if args.embeddings else None)
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V, len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]
    bos_id = embedding_layer.vocab_map["<s>"]
    eos_id = embedding_layer.vocab_map["</s>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev = myio.read_annotations(args.dev, K_neg=20, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=20, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id)

    if args.heldout:
        with open(args.heldout) as fin:
            heldout_ids = fin.read().split()
        heldout_corpus = dict(
            (id, ids_corpus[id]) for id in heldout_ids if id in ids_corpus)
        train_corpus = dict((id, ids_corpus[id]) for id in ids_corpus
                            if id not in heldout_corpus)
        heldout = myio.create_batches(heldout_corpus, [], args.batch_size,
                                      padding_id, bos_id, eos_id, auto_encode=True)
        heldout = [
            myio.create_one_batch(b1, t2, padding_id) for t1, b1, t2 in heldout
        ]
        say("heldout examples={}\n".format(len(heldout_corpus)))

    if args.train:
        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)

        start_time = time.time()
        train = myio.read_annotations(args.train)
        if not args.use_anno:
            train = []
        train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                                            model.padding_id, model.bos_id,
                                            model.eos_id, auto_encode=True)
        say("{} to create batches\n".format(time.time() - start_time))

        model.ready()
        model.train(ids_corpus if not args.heldout else train_corpus,
                    train,
                    dev if args.dev else None,
                    test if args.test else None,
                    heldout if args.heldout else None)

def main():
    print 'Starting at: {}\n'.format(datetime.now())

    raw_corpus = qaio.read_corpus(args.corpus)
    embedding_layer = create_embedding_layer(
        n_d=200,
        embs=load_embedding_iterator(args.embeddings),
        only_words=False if args.use_embeddings else True,
        trainable=args.trainable)
    print("vocab size={}, corpus size={}\n".format(embedding_layer.n_V, len(raw_corpus)))

    if args.reweight:
        weights = qaio.create_idf_weights(args.corpus, embedding_layer)

    label_tags = pickle.load(open(args.tags_file, 'rb'))
    print '\nloaded {} tags'.format(len(label_tags))

    raw_corpus_tags = tpio.read_corpus(args.corpus_w_tags, with_tags=True)
    ids_corpus_tags = tpio.map_corpus(raw_corpus_tags, embedding_layer, label_tags,
                                      max_len=args.max_seq_len)
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.dev:
        dev = qaio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus_tags, dev, padding_id,
                                       N_neg=args.n_neg, samples_file=args.samples_file)
    if args.test:
        test = qaio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus_tags, test, padding_id,
                                        N_neg=args.n_neg, samples_file=args.samples_file)
    if args.train:
        train = qaio.read_annotations(args.train)

    if args.layer.lower() == "lstm":
        from models import LstmQRTP as Model
    elif args.layer.lower() in ["bilstm", "bigru"]:
        from models import BiRNNQRTP as Model
    elif args.layer.lower() == "cnn":
        from models import CnnQRTP as Model
    elif args.layer.lower() == "gru":
        from models import GruQRTP as Model
    else:
        raise Exception("no correct layer given")

    model = Model(args, embedding_layer, len(label_tags),
                  weights=weights if args.reweight else None)
    model.ready()

    print 'total (non) trainable params: ', model.num_parameters()

    if args.load_pre_trained_part:
        # need to remove the old assigns to embeddings
        model.init_assign_ops = model.load_pre_trained_part(args.load_pre_trained_part)
        print '\nmodel init_assign_ops: {}\n'.format(model.init_assign_ops)

    model.train_model(ids_corpus_tags, train,
                      dev=dev if args.dev else None,
                      test=test if args.test else None)
    print '\nEnded at: {}'.format(datetime.now())

train_cors = train_processor.loadSrc()
test_cors = test_processor.loadSrc()

print 'Constructing word and character list...'
word_lis = words_load(train_cors, test_cors)
char_lis = chars_load(word_lis)
char_lis.append('<unk>')
rel_lis = rels_load(train_cors, test_cors)
rel_lis.append('<unk>')
print 'Found ' + str(len(word_lis)) + ' unique words!'
print 'Found ' + str(len(char_lis)) + ' unique chars!'
print 'Found ' + str(len(rel_lis)) + ' unique dep relations!'

word_embedding_layer = EmbeddingLayer(n_d=args.word_dim, vocab=['<unk>'],
                                      embs=load_embedding_iterator(args.pre_emb),
                                      fix_init_embs=False)
char_embedding_layer = EmbeddingLayer(n_d=args.char_dim, vocab=char_lis,
                                      fix_init_embs=False)

rel_embedding_layers = []
rel_matrix_layers = []
for i in range(args.clayer):
    rel_embedding_layers.append(
        EmbeddingLayer(n_d=args.word_dim, vocab=rel_lis, fix_init_embs=False))
if args.model == 4:
    for i in range(args.clayer):
        rel_matrix_layers.append(

def main():
    print 'Starting at: {}\n'.format(datetime.now())

    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = create_embedding_layer(
        n_d=200,
        embs=load_embedding_iterator(args.embeddings),
        only_words=False if args.use_embeddings else True,
        trainable=args.trainable
    )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, max_len=args.max_seq_len)
    print("vocab size={}, corpus size={}\n".format(
        embedding_layer.n_V, len(raw_corpus)
    ))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.layer.lower() == "lstm":
        from models import LstmQR as Model
    elif args.layer.lower() in ["bilstm", "bigru"]:
        from models import BiRNNQR as Model
    elif args.layer.lower() == "cnn":
        from models import CnnQR as Model
    elif args.layer.lower() == "gru":
        from models import GruQR as Model
    else:
        raise Exception("no correct layer given")

    if args.dev:
        dev = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id, pad_left=False)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id, pad_left=False)

    model = Model(args, embedding_layer, weights=weights if args.reweight else None)
    model.ready()

    print 'total (non) trainable params: ', model.num_parameters()

    if args.load_pre_trained_part:
        # need to remove the old assigns to embeddings
        model.init_assign_ops = model.load_pre_trained_part(args.load_pre_trained_part)
        print '\nmodel init_assign_ops: {}\n'.format(model.init_assign_ops)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(
            ids_corpus, train, args.batch_size, padding_id, pad_left=False
        )
        print("{} to create batches\n".format(time.time()-start_time))
        print("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(x[0].ravel())+len(x[1].ravel()) for x in train_batches),
            sum(len(x[2].ravel()) for x in train_batches)
        ))

        model.train_model(
            ids_corpus, train,
            dev=dev if args.dev else None,
            test=test if args.test else None
        )
    print '\nEnded at: {}'.format(datetime.now())

def main(args):
    raw_corpus = myio.read_corpus(args.corpus, args.translations or None,
                                  args.translatable_ids or None,
                                  args.generated_questions_train or None)
    generated_questions_eval = myio.read_generated_questions(args.generated_questions)

    embedding_layer = None
    if args.trainable_embeddings == 1:
        embedding_layer = myio.create_embedding_layer(
            raw_corpus, n_d=args.hidden_dim, cut_off=args.cut_off,
            embs=load_embedding_iterator(args.embeddings) if args.embeddings else None,
            fix_init_embs=False)
    else:
        embedding_layer = myio.create_embedding_layer(
            raw_corpus, n_d=args.hidden_dim, cut_off=args.cut_off,
            embs=load_embedding_iterator(args.embeddings) if args.embeddings else None)

    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, max_len=args.max_seq_len,
                                 generated_questions=generated_questions_eval)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V, len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        # dev = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.read_annotations(args.dev, K_neg=args.dev_pool_size, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id,
                                       pad_left=not args.average)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id,
                                        pad_left=not args.average)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(args.train,
                                      training_data_percent=args.training_data_percent)
        train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                                            padding_id, pad_left=not args.average,
                                            include_generated_questions=True)
        say("{} to create batches\n".format(time.time() - start_time))
        say("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(x[0].ravel()) + len(x[1].ravel()) for x in train_batches),
            sum(len(x[2].ravel()) for x in train_batches)))
        train_batches = None

        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)
        # print('args.average: ' + args.average)
        model.ready()

        # set parameters using pre-trained network
        if args.do_train == 1:
            if args.load_pretrain:
                model.load_pretrained_parameters(args)
            model.train(ids_corpus, train,
                        dev if args.dev else None,
                        test if args.test else None)

        # AVERAGE THE PREDICTIONS OBTAINED BY RUNNING THE MODEL 10 TIMES
        if args.do_evaluate == 1:
            model.load_pretrained_parameters(args)
            # model.set_model(model.load_model(args.load_pretrain))
            for i in range(1):
                r = model.just_eval(dev if args.dev else None,
                                    test if args.test else None)

        # ANALYZE the results
        if len(args.analyze_file.strip()) > 0:
            model.load_pretrained_parameters(args)
            file_name = args.analyze_file.strip()  # 'AskUbuntu.Rcnn_analysis3.gt(es)-gt.txt'
            model.analyze(file_name, embedding_layer, dev)

default="") # to write in argparser.add_argument("--results_file", type=str, default="") # to write in args = argparser.parse_args() print '\n', args, '\n' df = read_df(args.df_corpus) df = df.fillna(u'') label_tags = pickle.load(open(args.tags_file, 'rb')) raw_corpus = myio.read_corpus(args.corpus_w_tags, with_tags=True) embedding_layer = create_embedding_layer(n_d=10, embs=load_embedding_iterator( args.embeddings), only_words=False) with tf.Session() as sess: myqrapi = TPAPI(args.model, embedding_layer, sess, len(label_tags), args.layer) embedding_layer = myqrapi.model.embedding_layer ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, label_tags, max_len=args.max_seq_len) print("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
def main(args):
    print(args)
    model = None
    assert args.embedding, "Pre-trained word embeddings required."
    embedding_layer = EmbeddingLayer(
        n_d=args.hidden_dim,
        vocab=["<unk>"],
        embs=load_embedding_iterator(args.embedding),
        fix_init_embs=args.fix_emb
    )

    user_embedding_layer = None
    fix_user_embs = True
    vocab_lis = ["<unk>"]
    if args.user_embs:
        user_embedding_layer = EmbeddingLayer(
            n_d=args.hidden_dim,
            vocab=vocab_lis,
            embs=load_embedding_iterator(args.user_embs),
            fix_init_embs=fix_user_embs,
            norm=args.norm_user_embs
        )
        n_d = user_embedding_layer.n_d
    else:
        user_embedding_layer = EmbeddingLayer(
            n_d=args.hidden_dim,  # match the word-embedding size on this branch
            vocab=vocab_lis,
            embs=None,
            fix_init_embs=fix_user_embs,
            norm=args.norm_user_embs
        )

    if args.train:
        train_x, train_y, train_usr = load_doc_corpus(
            embedding_layer, user_embedding_layer, args.train, hieri=args.hierarchical)
    if args.dev:
        dev_x, dev_y, dev_usr = load_doc_corpus(
            embedding_layer, user_embedding_layer, args.dev, hieri=args.hierarchical)
    if args.test:
        test_x, test_y, test_usr = load_doc_corpus(
            embedding_layer, user_embedding_layer, args.test, hieri=args.hierarchical)

    if args.train:
        model = Model(
            args=args,
            embedding_layer=embedding_layer,
            nclasses=max(train_y) + 1,
            user_embedding_layer=user_embedding_layer,
        )
        if args.load:
            print('loading model...')
            model.load_model(args.load)
        else:
            model.ready()
        print('training...')
        model.train(
            (train_x, train_y, train_usr),
            (dev_x, dev_y, dev_usr) if args.dev else None,
            (test_x, test_y, test_usr) if args.test else None,
        )

    if args.load and args.test and not args.train:
        # model.args and model.nclasses will be loaded from file
        model = Model(
            args=None,
            embedding_layer=embedding_layer,
            nclasses=-1
        )
        model.load_model(args.load)
        accuracy = model.evaluate_set(test_x, test_y)
        print(accuracy)