Example #1
File: api.py Project: Sundayxr/rcnn
    def __init__(self, model_path, corpus_path, emb_path):
        raw_corpus = myio.read_corpus(corpus_path)
        embedding_layer = myio.create_embedding_layer(
                    raw_corpus,
                    n_d = 10,
                    cut_off = 1,
                    embs = load_embedding_iterator(emb_path)
                )
        weights = myio.create_idf_weights(corpus_path, embedding_layer)
        say("vocab size={}, corpus size={}\n".format(
                embedding_layer.n_V,
                len(raw_corpus)
            ))

        model = Model(args=None, embedding_layer=embedding_layer,
                    weights=weights)

        model_data = model.load_model(model_path)
        model.set_model(model_data)
        model.dropout.set_value(0.0)
        say("model initialized\n")

        score_func = theano.function(
                inputs = [ model.idts, model.idbs ],
                outputs = model.scores,
                on_unused_input='ignore'
            )
        self.model = model
        self.score_func = score_func
        say("scoring function compiled\n")
Example #2
def main():
    print 'Starting at: {}\n'.format(datetime.now())
    s_time = time.time()
    df = read_df(args.df_path)
    df = df.fillna(u'')

    label_tags = pickle.load(open(args.tags_file, 'rb'))
    print '\nloaded {} tags'.format(len(label_tags))

    raw_corpus = myio.read_corpus(args.corpus_w_tags, with_tags=True)

    embedding_layer = create_embedding_layer(
        n_d=200,
        embs=load_embedding_iterator(args.embeddings),
        only_words=False if args.use_embeddings else True,
        # only_words will take the words from embedding file and make random initial embeddings
        trainable=args.trainable
    )

    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, label_tags, max_len=args.max_seq_len)

    print("vocab size={}, corpus size={}\n".format(embedding_layer.n_V, len(raw_corpus)))

    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus_w_tags, embedding_layer, with_tags=True)

    if args.layer.lower() == "lstm":
        from models import LstmMultiTagsClassifier as Model
    elif args.layer.lower() in ["bilstm", "bigru"]:
        from models import BiRNNMultiTagsClassifier as Model
    elif args.layer.lower() == "cnn":
        from models import CnnMultiTagsClassifier as Model
    elif args.layer.lower() == "gru":
        from models import GruMultiTagsClassifier as Model
    else:
        raise Exception("no correct layer given")

    if args.cross_val:
        train, dev, test = myio.create_cross_val_batches(df, ids_corpus, args.batch_size, padding_id)
    else:
        dev = list(myio.create_batches(
            df, ids_corpus, 'dev', args.batch_size, padding_id, N_neg=args.n_neg, samples_file=args.samples_file))
        test = list(myio.create_batches(
            df, ids_corpus, 'test', args.batch_size, padding_id, N_neg=args.n_neg, samples_file=args.samples_file))
    # baselines_eval(train, dev, test)

    model = Model(args, embedding_layer, len(label_tags), weights=weights if args.reweight else None)
    model.ready()

    print 'total (non) trainable params: ', model.num_parameters()

    if args.load_pre_trained_part:
        # need to remove the old assigns to embeddings
        model.init_assign_ops = model.load_pre_trained_part(args.load_pre_trained_part)
    print '\nmodel init_assign_ops: {}\n'.format(model.init_assign_ops)

    model.train_model(df, ids_corpus, dev=dev, test=test)
    print '\nEnded at: {}'.format(datetime.now())
Example #3
    def __init__(self, model_path, corpus_path, emb_path):
        raw_corpus = myio.read_corpus(corpus_path)
        embedding_layer = myio.create_embedding_layer(
            raw_corpus,
            n_d=10,
            cut_off=1,
            embs=load_embedding_iterator(emb_path))
        weights = myio.create_idf_weights(corpus_path, embedding_layer)
        say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                     len(raw_corpus)))

        model = Model(args=None,
                      embedding_layer=embedding_layer,
                      weights=weights)

        model_data = model.load_model(model_path)
        model.set_model(model_data)
        model.dropout.set_value(0.0)
        say("model initialized\n")

        score_func = theano.function(inputs=[model.idts, model.idbs],
                                     outputs=model.scores,
                                     on_unused_input='ignore')
        self.model = model
        self.score_func = score_func
        say("scoring function compiled\n")
Example #4
def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=args.hidden_dim,
        embs=load_embedding_iterator(args.embeddings)
        if args.embeddings else None)
    ids_corpus = myio.map_corpus(raw_corpus,
                                 embedding_layer,
                                 max_len=args.max_seq_len)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                 len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev_raw = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus,
                                       dev_raw,
                                       padding_id,
                                       pad_left=not args.average,
                                       merge=args.merge)
    if args.test:
        test_raw = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus,
                                        test_raw,
                                        padding_id,
                                        pad_left=not args.average,
                                        merge=args.merge)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(ids_corpus,
                                            train,
                                            args.batch_size,
                                            padding_id,
                                            pad_left=not args.average,
                                            merge=args.merge)
        say("{} to create batches\n".format(time.time() - start_time))
        say("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches), sum(len(x[0].ravel()) for x in train_batches),
            sum(len(x[1].ravel()) for x in train_batches)))
        train_batches = None

        model = Model(args,
                      embedding_layer,
                      weights=weights if args.reweight else None)
        model.ready()

        # set parameters using pre-trained network
        if args.load_pretrain:
            model.encoder.load_pretrained_parameters(args)

        model.train(ids_corpus, train, (dev, dev_raw) if args.dev else None,
                    (test, test_raw) if args.test else None)
Example #5
def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
                raw_corpus,
                n_d = args.hidden_dim,
                cut_off = args.cut_off,
                embs = load_embedding_iterator(args.embeddings) if args.embeddings else None
            )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(
            embedding_layer.n_V,
            len(raw_corpus)
        ))
    padding_id = embedding_layer.vocab_map["<padding>"]
    bos_id = embedding_layer.vocab_map["<s>"]
    eos_id = embedding_layer.vocab_map["</s>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev = myio.read_annotations(args.dev, K_neg=20, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=20, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id)

    if args.heldout:
        with open(args.heldout) as fin:
            heldout_ids = fin.read().split()
        heldout_corpus = dict((id, ids_corpus[id]) for id in heldout_ids if id in ids_corpus)
        train_corpus = dict((id, ids_corpus[id]) for id in ids_corpus
                                                if id not in heldout_corpus)
        heldout = myio.create_batches(heldout_corpus, [ ], args.batch_size,
                    padding_id, bos_id, eos_id, auto_encode=True)
        heldout = [ myio.create_one_batch(b1, t2, padding_id) for t1, b1, t2 in heldout ]
        say("heldout examples={}\n".format(len(heldout_corpus)))

    if args.train:
        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)

        start_time = time.time()
        train = myio.read_annotations(args.train)
        if not args.use_anno: train = [ ]
        train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                    model.padding_id, model.bos_id, model.eos_id, auto_encode=True)
        say("{} to create batches\n".format(time.time()-start_time))

        model.ready()
        model.train(
                ids_corpus if not args.heldout else train_corpus,
                train,
                dev if args.dev else None,
                test if args.test else None,
                heldout if args.heldout else None
            )
Example #6
def create_embedding_layer(path):
    embedding_layer = EmbeddingLayer(
        n_d=200,
        vocab=["<unk>", "<padding>"],
        embs=load_embedding_iterator(path),
        oov="<unk>",
        #fix_init_embs = True
        fix_init_embs=False)
    return embedding_layer
Example #7
def create_embedding_layer(path):
    embedding_layer = EmbeddingLayer(
            n_d = 200,
            vocab = [ "<unk>", "<padding>" ],
            embs = load_embedding_iterator(path),
            oov = "<unk>",
            #fix_init_embs = True
            fix_init_embs = False
        )
    return embedding_layer
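Examples #6 and #7 are the same helper drawn from two different projects. A short usage sketch, assuming the EmbeddingLayer interface visible elsewhere on this page (map_to_ids, vocab_map, n_V) and a placeholder embeddings path:

embedding_layer = create_embedding_layer("vectors.txt.gz")  # hypothetical path

# Tokens missing from the embedding file fall back to the oov entry, "<unk>".
ids = embedding_layer.map_to_ids(["hello", "world", "zzzunseen"])
padding_id = embedding_layer.vocab_map["<padding>"]
print "vocab size:", embedding_layer.n_V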
Example #8
def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
                raw_corpus,
                n_d = args.hidden_dim,
                embs = load_embedding_iterator(args.embeddings) if args.embeddings else None
            )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, max_len=args.max_seq_len)
    say("vocab size={}, corpus size={}\n".format(
            embedding_layer.n_V,
            len(raw_corpus)
        ))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev_raw = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev_raw, padding_id,
                    pad_left=not args.average, merge=args.merge)
    if args.test:
        test_raw = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test_raw, padding_id,
                    pad_left=not args.average, merge=args.merge)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                                padding_id, pad_left = not args.average, merge=args.merge)
        say("{} to create batches\n".format(time.time()-start_time))
        say("{} batches, {} tokens in total, {} triples in total\n".format(
                len(train_batches),
                sum(len(x[0].ravel()) for x in train_batches),
                sum(len(x[1].ravel()) for x in train_batches)
            ))
        train_batches = None

        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)
        model.ready()

        # set parameters using pre-trained network
        if args.load_pretrain:
            model.encoder.load_pretrained_parameters(args)

        model.train(
                ids_corpus,
                train,
                (dev, dev_raw) if args.dev else None,
                (test, test_raw) if args.test else None
            )
Example #9
def create_embedding_layer(emb_filename,
                           n_d=100,
                           vocab_dict=None,
                           unk="<unk>",
                           padding="<padding>",
                           fix_init_embs=True):
    embs = load_embedding_iterator(emb_filename, vocab_dict,
                                   skip_head=True) if emb_filename else None
    embedding_layer = EmbeddingLayer(n_d=n_d,
                                     vocab=[padding, unk] +
                                     (vocab_dict.keys() if not embs else []),
                                     embs=embs,
                                     fix_init_embs=fix_init_embs)
    return embedding_layer
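Example #9's variant decides the vocabulary source at call time: with an embedding file, the words come from the file itself (vocab_dict is passed to the iterator, presumably to filter it); without one, the layer is randomly initialized over vocab_dict. A hedged sketch of both call patterns (file name and dictionary are placeholders):

vocab = {"hello": 0, "world": 1}  # placeholder vocabulary

# With a pre-trained file whose first line is a header row (skip_head=True).
layer_pre = create_embedding_layer("glove.txt", n_d=300, vocab_dict=vocab)

# Without a file: random n_d-dimensional vectors over the explicit vocabulary.
layer_rand = create_embedding_layer(None, n_d=100, vocab_dict=vocab)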
Example #10
def main(args):
    assert args.train, "Training set required"
    assert args.dev, "Dev set required"
    assert args.test, "Test set required"
    assert args.emb, "Pre-trained word embeddings required."
    assert args.aspect_seeds, "Aspect seeds required."

    print args

    seeds = load_lis(args.aspect_seeds)
    say("loaded {} aspect seeds\n".format(len(seeds)))

    embedding_layer = EmbeddingLayer(
                n_d = 100,
                vocab = [ "<unk>" ],
                pre_embs = load_embedding_iterator(args.emb),
            )

    seeds_id = np.array(map(lambda seed: embedding_layer.map_to_ids(seed.strip().split()).tolist(), seeds), dtype = np.int32)

    if args.train:
        train_x, train_y = load_doc_corpus(embedding_layer, args.train)

    if args.dev:
        dev_x, dev_y = load_doc_corpus(embedding_layer, args.dev)

    if args.test:
        test_x, test_y = load_doc_corpus(embedding_layer, args.test)

    if args.train:
        model = Model(
                    args = args,
                    embedding_layer = embedding_layer,
                    num_aspects = len(seeds_id),
                    query = seeds_id
            )
        if args.load:
            print 'loading model...'
            model.load_model(args.load)
        else:
            model.ready()

        print 'training...'
        model.train(
                (train_x, train_y),
                (dev_x, dev_y) if args.dev else None,
                (test_x, test_y) if args.test else None
            )
Example #11
def main(args):
    print(args)

    model = None

    assert args.embedding, "Pre-trained word embeddings required."

    embedding_layer = EmbeddingLayer(
                n_d = args.hidden_dim,
                vocab = [ "<unk>" ],
                embs = load_embedding_iterator(args.embedding)
            )

    if args.train:
        train_x, train_y = read_corpus(args.train)
        train_x = [ embedding_layer.map_to_ids(x) for x in train_x ]

    if args.dev:
        dev_x, dev_y = read_corpus(args.dev)
        dev_x = [ embedding_layer.map_to_ids(x) for x in dev_x ]

    if args.test:
        test_x, test_y = read_corpus(args.test)
        test_x = [ embedding_layer.map_to_ids(x) for x in test_x ]

    if args.train:
        model = Model(
                    args = args,
                    embedding_layer = embedding_layer,
                    nclasses = max(train_y)+1
            )
        model.ready()
        model.train(
                (train_x, train_y),
                (dev_x, dev_y) if args.dev else None,
                (test_x, test_y) if args.test else None,
            )

    if args.load and args.test and not args.train:
        # model.args and model.nclasses will be loaded from file
        model = Model(
                    args = None,
                    embedding_layer = embedding_layer,
                    nclasses = -1
            )
        model.load_model(args.load)
        accuracy = model.evaluate_set(test_x, test_y)
        print accuracy
Example #12
def main(args):
    print args

    model = None

    assert args.embedding, "Pre-trained word embeddings required."

    embedding_layer = EmbeddingLayer(
                n_d = args.hidden_dim,
                vocab = [ "<unk>" ],
                embs = load_embedding_iterator(args.embedding)
            )

    if args.train:
        train_x, train_y = read_corpus(args.train)
        train_x = [ embedding_layer.map_to_ids(x) for x in train_x ]

    if args.dev:
        dev_x, dev_y = read_corpus(args.dev)
        dev_x = [ embedding_layer.map_to_ids(x) for x in dev_x ]

    if args.test:
        test_x, test_y = read_corpus(args.test)
        test_x = [ embedding_layer.map_to_ids(x) for x in test_x ]

    if args.train:
        model = Model(
                    args = args,
                    embedding_layer = embedding_layer,
                    nclasses = max(train_y)+1
            )
        model.ready()
        model.train(
                (train_x, train_y),
                (dev_x, dev_y) if args.dev else None,
                (test_x, test_y) if args.test else None,
            )

    if args.load and args.test and not args.train:
        # model.args and model.nclasses will be loaded from file
        model = Model(
                    args = None,
                    embedding_layer = embedding_layer,
                    nclasses = -1
            )
        model.load_model(args.load)
        accuracy = model.evaluate_set(test_x, test_y)
        print accuracy
Example #13
    def __init__(self,
                 model_path,
                 corpus_path,
                 emb_path,
                 session,
                 layer='lstm'):
        raw_corpus = myio.read_corpus(corpus_path)
        embedding_layer = create_embedding_layer(n_d=10,
                                                 embs=load_embedding_iterator(
                                                     emb_path),
                                                 only_words=False)
        # weights = myio.create_idf_weights(corpus_path, embedding_layer) # todo
        say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                     len(raw_corpus)))

        if layer.lower() == "lstm":
            from models import LstmQR as Model
        elif layer.lower() in ["bilstm", "bigru"]:
            from models import BiRNNQR as Model
        elif layer.lower() == "cnn":
            from models import CnnQR as Model
        elif layer.lower() == "gru":
            from models import GruQR as Model

        model = Model(args={"layer": args.layer},
                      embedding_layer=embedding_layer,
                      weights=None)

        model.load_n_set_model(model_path, session)
        say("model initialized\n")

        self.model = model

        def score_func(titles, bodies, cur_sess):
            feed_dict = {
                self.model.titles_words_ids_placeholder:
                titles.T,  # note: the placeholder expects the transposed matrix
                self.model.bodies_words_ids_placeholder:
                bodies.T,  # note: the placeholder expects the transposed matrix
                self.model.dropout_prob: 0.,
            }
            _scores = cur_sess.run(self.model.scores, feed_dict)
            return _scores

        self.score_func = score_func
        say("scoring function compiled\n")
Example #14
def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=args.hidden_dim,
        cut_off=args.cut_off,
        embs=load_embedding_iterator(args.embeddings)
        if args.embeddings else None)
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                 len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]
    bos_id = embedding_layer.vocab_map["<s>"]
    eos_id = embedding_layer.vocab_map["</s>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev = myio.read_annotations(args.dev, K_neg=20, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=20, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id)

    if args.heldout:
        with open(args.heldout) as fin:
            heldout_ids = fin.read().split()
        heldout_corpus = dict(
            (id, ids_corpus[id]) for id in heldout_ids if id in ids_corpus)
        train_corpus = dict((id, ids_corpus[id]) for id in ids_corpus
                            if id not in heldout_corpus)
        heldout = myio.create_batches(heldout_corpus, [],
                                      args.batch_size,
                                      padding_id,
                                      bos_id,
                                      eos_id,
                                      auto_encode=True)
        heldout = [
            myio.create_one_batch(b1, t2, padding_id) for t1, b1, t2 in heldout
        ]
        say("heldout examples={}\n".format(len(heldout_corpus)))

    if args.train:
        model = Model(args,
                      embedding_layer,
                      weights=weights if args.reweight else None)

        start_time = time.time()
        train = myio.read_annotations(args.train)
        if not args.use_anno: train = []
        train_batches = myio.create_batches(ids_corpus,
                                            train,
                                            args.batch_size,
                                            model.padding_id,
                                            model.bos_id,
                                            model.eos_id,
                                            auto_encode=True)
        say("{} to create batches\n".format(time.time() - start_time))
        model.ready()

        model.train(ids_corpus if not args.heldout else train_corpus, train,
                    dev if args.dev else None, test if args.test else None,
                    heldout if args.heldout else None)
Example #15
def main():
    print 'Starting at: {}\n'.format(datetime.now())
    raw_corpus = qaio.read_corpus(args.corpus)
    embedding_layer = create_embedding_layer(
        n_d=200,
        embs=load_embedding_iterator(args.embeddings),
        only_words=False if args.use_embeddings else True,
        trainable=args.trainable)
    print("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                   len(raw_corpus)))
    if args.reweight:
        weights = qaio.create_idf_weights(args.corpus, embedding_layer)

    label_tags = pickle.load(open(args.tags_file, 'rb'))
    print '\nloaded {} tags'.format(len(label_tags))

    raw_corpus_tags = tpio.read_corpus(args.corpus_w_tags, with_tags=True)
    ids_corpus_tags = tpio.map_corpus(raw_corpus_tags,
                                      embedding_layer,
                                      label_tags,
                                      max_len=args.max_seq_len)

    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.dev:
        dev = qaio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus_tags,
                                       dev,
                                       padding_id,
                                       N_neg=args.n_neg,
                                       samples_file=args.samples_file)

    if args.test:
        test = qaio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus_tags,
                                        test,
                                        padding_id,
                                        N_neg=args.n_neg,
                                        samples_file=args.samples_file)

    if args.train:
        train = qaio.read_annotations(args.train)

        if args.layer.lower() == "lstm":
            from models import LstmQRTP as Model
        elif args.layer.lower() in ["bilstm", "bigru"]:
            from models import BiRNNQRTP as Model
        elif args.layer.lower() == "cnn":
            from models import CnnQRTP as Model
        elif args.layer.lower() == "gru":
            from models import GruQRTP as Model
        else:
            raise Exception("no correct layer given")

        model = Model(args,
                      embedding_layer,
                      len(label_tags),
                      weights=weights if args.reweight else None)
        model.ready()
        print 'total (non) trainable params: ', model.num_parameters()

        if args.load_pre_trained_part:
            # need to remove the old assigns to embeddings
            model.init_assign_ops = model.load_pre_trained_part(
                args.load_pre_trained_part)
        print '\nmodel init_assign_ops: {}\n'.format(model.init_assign_ops)

        model.train_model(ids_corpus_tags,
                          train,
                          dev=dev if args.dev else None,
                          test=test if args.test else None)
    print '\nEnded at: {}'.format(datetime.now())
Example #16
train_cors = train_processor.loadSrc()
test_cors = test_processor.loadSrc()

print 'Constructing word and character list...'
word_lis = words_load(train_cors, test_cors)
char_lis = chars_load(word_lis)
char_lis.append('<unk>')
rel_lis = rels_load(train_cors, test_cors)
rel_lis.append('<unk>')
print 'Found ' + str(len(word_lis)) + ' unique words!'
print 'Found ' + str(len(char_lis)) + ' unique chars!'
print 'Found ' + str(len(rel_lis)) + ' unique dep relations!'

word_embedding_layer = EmbeddingLayer(n_d=args.word_dim,
                                      vocab=['<unk>'],
                                      embs=load_embedding_iterator(
                                          args.pre_emb),
                                      fix_init_embs=False)

char_embedding_layer = EmbeddingLayer(n_d=args.char_dim,
                                      vocab=char_lis,
                                      fix_init_embs=False)

rel_embedding_layers = []
rel_matrix_layers = []
for i in range(args.clayer):
    rel_embedding_layers.append(
        EmbeddingLayer(n_d=args.word_dim, vocab=rel_lis, fix_init_embs=False))

if args.model == 4:
    for i in range(args.clayer):
        rel_matrix_layers.append(
Example #17
def main():
    print 'Starting at: {}\n'.format(datetime.now())
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = create_embedding_layer(
        n_d=200,
        embs=load_embedding_iterator(args.embeddings),
        only_words=False if args.use_embeddings else True,
        trainable=args.trainable
    )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, max_len=args.max_seq_len)
    print("vocab size={}, corpus size={}\n".format(
            embedding_layer.n_V,
            len(raw_corpus)
        ))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.layer.lower() == "lstm":
        from models import LstmQR as Model
    elif args.layer.lower() in ["bilstm", "bigru"]:
        from models import BiRNNQR as Model
    elif args.layer.lower() == "cnn":
        from models import CnnQR as Model
    elif args.layer.lower() == "gru":
        from models import GruQR as Model
    else:
        raise Exception("no correct layer given")

    if args.dev:
        dev = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id, pad_left=False)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id, pad_left=False)

    model = Model(args, embedding_layer, weights=weights if args.reweight else None)
    model.ready()

    print 'total (non) trainable params: ', model.num_parameters()

    if args.load_pre_trained_part:
        # need to remove the old assigns to embeddings
        model.init_assign_ops = model.load_pre_trained_part(args.load_pre_trained_part)
    print '\nmodel init_assign_ops: {}\n'.format(model.init_assign_ops)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(
            ids_corpus, train, args.batch_size, padding_id, pad_left=False
        )

        print("{} to create batches\n".format(time.time()-start_time))
        print("{} batches, {} tokens in total, {} triples in total\n".format(
                len(train_batches),
                sum(len(x[0].ravel())+len(x[1].ravel()) for x in train_batches),
                sum(len(x[2].ravel()) for x in train_batches)
            ))

        model.train_model(
            ids_corpus,
            train,
            dev=dev if args.dev else None,
            test=test if args.test else None
        )
    print '\nEnded at: {}'.format(datetime.now())
Example #18
def main(args):
    raw_corpus = myio.read_corpus(args.corpus, args.translations or None,
                                  args.translatable_ids or None,
                                  args.generated_questions_train or None)

    generated_questions_eval = myio.read_generated_questions(
        args.generated_questions)

    embedding_layer = None
    if args.trainable_embeddings == 1:
        embedding_layer = myio.create_embedding_layer(
            raw_corpus,
            n_d=args.hidden_dim,
            cut_off=args.cut_off,
            embs=load_embedding_iterator(args.embeddings)
            if args.embeddings else None,
            fix_init_embs=False)
    else:
        embedding_layer = myio.create_embedding_layer(
            raw_corpus,
            n_d=args.hidden_dim,
            cut_off=args.cut_off,
            embs=load_embedding_iterator(args.embeddings)
            if args.embeddings else None)
    ids_corpus = myio.map_corpus(raw_corpus,
                                 embedding_layer,
                                 max_len=args.max_seq_len,
                                 generated_questions=generated_questions_eval)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                 len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        # dev = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.read_annotations(args.dev,
                                    K_neg=args.dev_pool_size,
                                    prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus,
                                       dev,
                                       padding_id,
                                       pad_left=not args.average)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus,
                                        test,
                                        padding_id,
                                        pad_left=not args.average)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(
            args.train, training_data_percent=args.training_data_percent)
        train_batches = myio.create_batches(ids_corpus,
                                            train,
                                            args.batch_size,
                                            padding_id,
                                            pad_left=not args.average,
                                            include_generated_questions=True)
        say("{} to create batches\n".format(time.time() - start_time))
        say("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(x[0].ravel()) + len(x[1].ravel()) for x in train_batches),
            sum(len(x[2].ravel()) for x in train_batches)))
        train_batches = None

        model = Model(args,
                      embedding_layer,
                      weights=weights if args.reweight else None)
        # print('args.average: '+args.average)
        model.ready()

        # set parameters using pre-trained network
        if args.do_train == 1:
            if args.load_pretrain:
                model.load_pretrained_parameters(args)

            model.train(ids_corpus, train, dev if args.dev else None,
                        test if args.test else None)

        # average the predictions obtained by running the model repeatedly (a single run here)
        if args.do_evaluate == 1:
            model.load_pretrained_parameters(args)
            # model.set_model(model.load_model(args.load_pretrain))
            for i in range(1):
                r = model.just_eval(dev if args.dev else None,
                                    test if args.test else None)

        # ANALYZE the results
        if len(args.analyze_file.strip()) > 0:
            model.load_pretrained_parameters(args)
            file_name = args.analyze_file.strip()  # e.g. 'AskUbuntu.Rcnn_analysis3.gt(es)-gt.txt'
            model.analyze(file_name, embedding_layer, dev)
Example #19
                           default="")  # to write in
    argparser.add_argument("--results_file", type=str,
                           default="")  # to write in

    args = argparser.parse_args()
    print '\n', args, '\n'

    df = read_df(args.df_corpus)
    df = df.fillna(u'')

    label_tags = pickle.load(open(args.tags_file, 'rb'))

    raw_corpus = myio.read_corpus(args.corpus_w_tags, with_tags=True)

    embedding_layer = create_embedding_layer(n_d=10,
                                             embs=load_embedding_iterator(
                                                 args.embeddings),
                                             only_words=False)

    with tf.Session() as sess:

        myqrapi = TPAPI(args.model, embedding_layer, sess, len(label_tags),
                        args.layer)

        embedding_layer = myqrapi.model.embedding_layer

        ids_corpus = myio.map_corpus(raw_corpus,
                                     embedding_layer,
                                     label_tags,
                                     max_len=args.max_seq_len)

        print("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
Example #20
def main(args):
    print(args)

    model = None

    assert args.embedding, "Pre-trained word embeddings required."

    embedding_layer = EmbeddingLayer(
        n_d=args.hidden_dim,
        vocab=["<unk>"],
        embs=load_embedding_iterator(args.embedding),
        fix_init_embs=args.fix_emb
    )

    user_embedding_layer = None

    fix_user_embs = True
    vocab_lis = ["<unk>"]

    if args.user_embs:
        user_embedding_layer = EmbeddingLayer(
            n_d=args.hidden_dim,
            vocab=vocab_lis,
            embs=load_embedding_iterator(args.user_embs),
            fix_init_embs=fix_user_embs,
            norm=args.norm_user_embs
        )
        n_d = user_embedding_layer.n_d
    else:
        user_embedding_layer = EmbeddingLayer(
            n_d=args.hidden_dim,  # assumption: fall back to the hidden size; n_d was unassigned in this branch
            vocab=vocab_lis,
            embs=None,
            fix_init_embs=fix_user_embs,
            norm=args.norm_user_embs
        )

    if args.train:
        train_x, train_y, train_usr = load_doc_corpus(
            embedding_layer, user_embedding_layer, args.train, hieri=args.hierarchical)

    if args.dev:
        dev_x, dev_y, dev_usr = load_doc_corpus(
            embedding_layer, user_embedding_layer, args.dev, hieri=args.hierarchical)

    if args.test:
        test_x, test_y, test_usr = load_doc_corpus(
            embedding_layer, user_embedding_layer, args.test, hieri=args.hierarchical)

    if args.train:
        model = Model(
            args=args,
            embedding_layer=embedding_layer,
            nclasses=max(train_y) + 1,
            user_embedding_layer=user_embedding_layer,
        )
        if args.load:
            print('loading model...')
            model.load_model(args.load)
        else:
            model.ready()

        print('training...')
        model.train(
            (train_x, train_y, train_usr),
            (dev_x, dev_y, dev_usr) if args.dev else None,
            (test_x, test_y, test_usr) if args.test else None,
        )

    if args.load and args.test and not args.train:
        # model.args and model.nclasses will be loaded from file
        model = Model(
            args=None,
            embedding_layer=embedding_layer,
            nclasses=-1
        )
        model.load_model(args.load)
        accuracy = model.evaluate_set(test_x, test_y)
        print(accuracy)