Esempio n. 1
0
def main():
    print args
    assert args.embedding, "Pre-trained word embeddings required."

    embedding_layer = myio.create_embedding_layer(args.embedding)

    max_len = args.max_len

    if args.train:
        train_x, train_y = myio.read_annotations(args.train)
        train_x = [embedding_layer.map_to_ids(x)[:max_len] for x in train_x]

    if args.dev:
        dev_x, dev_y = myio.read_annotations(args.dev)
        dev_x = [embedding_layer.map_to_ids(x)[:max_len] for x in dev_x]

    if args.test:
        test_x, test_y = myio.read_annotations(args.test)
        test_x = [embedding_layer.map_to_ids(x)[:max_len] for x in test_x]

    if args.train:
        model = Model(args=args,
                      embedding_layer=embedding_layer,
                      nclasses=len(train_y[0]))
        model.ready()

        #debug_func2 = theano.function(
        #        inputs = [ model.x, model.z ],
        #        outputs = model.generator.logpz
        #    )
        #theano.printing.debugprint(debug_func2)
        #return

        model.train((train_x, train_y), (dev_x, dev_y) if args.dev else None,
                    (test_x, test_y) if args.test else None)
Esempio n. 2
0
File: api.py Progetto: Sundayxr/rcnn
    def __init__(self, model_path, corpus_path, emb_path):
        """Build the embedding layer, restore a saved model and compile a scorer.

        model_path  -- passed to ``Model.load_model``.
        corpus_path -- passed to ``myio.read_corpus`` and
                       ``myio.create_idf_weights``.
        emb_path    -- passed to ``load_embedding_iterator``.

        Sets ``self.model`` and ``self.score_func``.
        """
        corpus = myio.read_corpus(corpus_path)
        pretrained = load_embedding_iterator(emb_path)
        emb_layer = myio.create_embedding_layer(
            corpus, n_d=10, cut_off=1, embs=pretrained)
        idf_weights = myio.create_idf_weights(corpus_path, emb_layer)

        summary = "vocab size={}, corpus size={}\n".format(
            emb_layer.n_V, len(corpus))
        say(summary)

        restored = Model(args=None, embedding_layer=emb_layer,
                         weights=idf_weights)
        restored.set_model(restored.load_model(model_path))
        # Dropout is set to zero before the scoring function is compiled.
        restored.dropout.set_value(0.0)
        say("model initialized\n")

        scorer = theano.function(
            inputs=[restored.idts, restored.idbs],
            outputs=restored.scores,
            on_unused_input='ignore',
        )
        self.model, self.score_func = restored, scorer
        say("scoring function compiled\n")
Esempio n. 3
0
    def __init__(self, model_path, corpus_path, emb_path):
        """Restore a saved model and expose a compiled scoring function.

        The corpus at ``corpus_path`` is read to build the embedding layer
        (with embeddings from ``emb_path``) and the IDF weights; the model
        state is then loaded from ``model_path``.  The results are stored as
        ``self.model`` and ``self.score_func``.
        """
        raw_corpus = myio.read_corpus(corpus_path)
        layer_options = dict(n_d=10,
                             cut_off=1,
                             embs=load_embedding_iterator(emb_path))
        embedding_layer = myio.create_embedding_layer(raw_corpus,
                                                      **layer_options)
        weights = myio.create_idf_weights(corpus_path, embedding_layer)
        say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                     len(raw_corpus)))

        net = Model(args=None, embedding_layer=embedding_layer,
                    weights=weights)
        saved_state = net.load_model(model_path)
        net.set_model(saved_state)
        # Turn dropout off for scoring.
        net.dropout.set_value(0.0)
        say("model initialized\n")

        compiled = theano.function(inputs=[net.idts, net.idbs],
                                   outputs=net.scores,
                                   on_unused_input='ignore')
        self.model = net
        self.score_func = compiled
        say("scoring function compiled\n")
Esempio n. 4
0
def main(args):
    """Prepare the corpus and evaluation batches, then train if requested.

    ``args`` supplies every path and option read below (corpus, embeddings,
    hidden_dim, max_seq_len, reweight, dev, test, train, batch_size, average,
    merge, load_pretrain).  Training only happens when ``args.train`` is set.
    """
    raw_corpus = myio.read_corpus(args.corpus)
    pretrained = None
    if args.embeddings:
        pretrained = load_embedding_iterator(args.embeddings)
    embedding_layer = myio.create_embedding_layer(raw_corpus,
                                                  n_d=args.hidden_dim,
                                                  embs=pretrained)
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer,
                                 max_len=args.max_seq_len)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                 len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    # IDF weights are only computed when re-weighting is requested.
    weights = None
    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    def load_eval(path):
        # Return (evaluation batches, raw annotations) for one file.
        raw = myio.read_annotations(path, K_neg=-1, prune_pos_cnt=-1)
        batches = myio.create_eval_batches(ids_corpus, raw, padding_id,
                                           pad_left=not args.average,
                                           merge=args.merge)
        return batches, raw

    dev_data = load_eval(args.dev) if args.dev else None
    test_data = load_eval(args.test) if args.test else None

    if not args.train:
        return

    start_time = time.time()
    train = myio.read_annotations(args.train)
    train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                                        padding_id,
                                        pad_left=not args.average,
                                        merge=args.merge)
    say("{} to create batches\n".format(time.time() - start_time))

    batch_count = len(train_batches)
    token_count = sum(len(batch[0].ravel()) for batch in train_batches)
    triple_count = sum(len(batch[1].ravel()) for batch in train_batches)
    say("{} batches, {} tokens in total, {} triples in total\n".format(
        batch_count, token_count, triple_count))
    # The batches were only needed for the statistics above; drop them.
    train_batches = None

    model = Model(args, embedding_layer, weights=weights)
    model.ready()

    # set parameters using pre-trained network
    if args.load_pretrain:
        model.encoder.load_pretrained_parameters(args)

    model.train(ids_corpus, train, dev_data, test_data)
Esempio n. 5
0
def main(args):
    """Build corpus, evaluation and held-out data, then train if requested.

    ``args`` supplies every path and option read below.  When
    ``args.heldout`` is set, the ids listed in that file are removed from the
    corpus handed to ``model.train`` and turned into held-out batches.
    """
    raw_corpus = myio.read_corpus(args.corpus)
    pretrained = None
    if args.embeddings:
        pretrained = load_embedding_iterator(args.embeddings)
    embedding_layer = myio.create_embedding_layer(raw_corpus,
                                                  n_d=args.hidden_dim,
                                                  cut_off=args.cut_off,
                                                  embs=pretrained)
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                 len(raw_corpus)))
    padding_id, bos_id, eos_id = [
        embedding_layer.vocab_map[token]
        for token in ("<padding>", "<s>", "</s>")
    ]

    weights = None
    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    def load_eval(path):
        # Read one annotation file and turn it into evaluation batches.
        annotations = myio.read_annotations(path, K_neg=20, prune_pos_cnt=-1)
        return myio.create_eval_batches(ids_corpus, annotations, padding_id)

    dev = load_eval(args.dev) if args.dev else None
    test = load_eval(args.test) if args.test else None

    # Without a held-out file the whole corpus is used for training.
    train_corpus = ids_corpus
    heldout = None
    if args.heldout:
        with open(args.heldout) as fin:
            heldout_ids = fin.read().split()
        heldout_corpus = {key: ids_corpus[key]
                          for key in heldout_ids if key in ids_corpus}
        train_corpus = {key: ids_corpus[key]
                        for key in ids_corpus if key not in heldout_corpus}
        heldout_triples = myio.create_batches(heldout_corpus, [],
                                              args.batch_size, padding_id,
                                              bos_id, eos_id,
                                              auto_encode=True)
        # Only the second and third element of each triple are used.
        heldout = [myio.create_one_batch(second, third, padding_id)
                   for _first, second, third in heldout_triples]
        say("heldout examples={}\n".format(len(heldout_corpus)))

    if not args.train:
        return

    model = Model(args, embedding_layer, weights=weights)

    start_time = time.time()
    train = myio.read_annotations(args.train) if args.use_anno else (
        myio.read_annotations(args.train) and [] or [])
    # The batches built here are not used afterwards; only the elapsed time
    # of building them is reported.
    train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                                        model.padding_id, model.bos_id,
                                        model.eos_id, auto_encode=True)
    say("{} to create batches\n".format(time.time() - start_time))

    model.ready()
    model.train(train_corpus, train, dev, test, heldout)
Esempio n. 6
0
def main(args):
    """Build the corpus and training batches, report statistics, then train.

    ``args`` supplies every path and option read below.  This variant uses no
    pre-trained embeddings (``embs=None``) and no dev/test evaluation:
    ``model.train`` is called with ``dev=None`` and ``test=None``.
    """
    raw_corpus = myio.read_corpus(args.corpus)
    print("raw corpus:", args.corpus, "len:", len(raw_corpus))

    embedding_layer = myio.create_embedding_layer(raw_corpus,
                                                  n_d=args.hidden_dim,
                                                  cut_off=args.cut_off,
                                                  embs=None)
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer,
                                 max_len=args.max_seq_len)
    myio.say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                      len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    weights = None
    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if not args.train:
        return

    start_time = time.time()
    train = myio.read_annotations(args.train)
    print("training data:", args.train, "len:", len(train))
    train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                                        padding_id,
                                        pad_left=not args.average)
    elapsed = time.time() - start_time
    myio.say("{:.2f} secs to create {} batches of size {}\n".format(
        elapsed, len(train_batches), args.batch_size))

    # Token count sums the first two arrays of each batch; the triple count
    # comes from the third.
    token_count = sum(len(batch[0].ravel()) + len(batch[1].ravel())
                      for batch in train_batches)
    triple_count = sum(len(batch[2].ravel()) for batch in train_batches)
    myio.say("{} batches, {} tokens in total, {} triples in total\n".format(
        len(train_batches), token_count, triple_count))

    model = Model(args, embedding_layer, weights=weights)
    model.ready()

    model.train(ids_corpus, train, dev=None, test=None)
Esempio n. 7
0
def main(args):
    """Prepare corpus and evaluation data and train a model when asked to.

    ``args`` supplies every path and option read below.  The dev and test
    sets are each passed to ``model.train`` as a ``(batches, raw)`` pair, or
    as ``None`` when the corresponding path is not set.
    """
    raw_corpus = myio.read_corpus(args.corpus)
    if args.embeddings:
        embs = load_embedding_iterator(args.embeddings)
    else:
        embs = None
    embedding_layer = myio.create_embedding_layer(raw_corpus,
                                                  n_d=args.hidden_dim,
                                                  embs=embs)
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer,
                                 max_len=args.max_seq_len)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                 len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    weights = (myio.create_idf_weights(args.corpus, embedding_layer)
               if args.reweight else None)

    # Dev is processed before test, as separate (batches, raw) pairs.
    eval_sets = {}
    for split, path in (("dev", args.dev), ("test", args.test)):
        if not path:
            continue
        raw = myio.read_annotations(path, K_neg=-1, prune_pos_cnt=-1)
        batches = myio.create_eval_batches(ids_corpus, raw, padding_id,
                                           pad_left=not args.average,
                                           merge=args.merge)
        eval_sets[split] = (batches, raw)

    if not args.train:
        return

    start_time = time.time()
    train = myio.read_annotations(args.train)
    train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                                        padding_id,
                                        pad_left=not args.average,
                                        merge=args.merge)
    say("{} to create batches\n".format(time.time() - start_time))

    stats = (
        len(train_batches),
        sum(len(b[0].ravel()) for b in train_batches),
        sum(len(b[1].ravel()) for b in train_batches),
    )
    say("{} batches, {} tokens in total, {} triples in total\n".format(*stats))
    # Only the statistics were needed; release the batches.
    train_batches = None

    model = Model(args, embedding_layer, weights=weights)
    model.ready()

    # set parameters using pre-trained network
    if args.load_pretrain:
        model.encoder.load_pretrained_parameters(args)

    model.train(ids_corpus, train,
                eval_sets.get("dev"), eval_sets.get("test"))
Esempio n. 8
0
def main():
    print args

    embedding_layer = None
    if args.embedding:
        assert args.embedding, "Pre-trained word embeddings required."

        embedding_layer = myio.create_embedding_layer(args.embedding)

    max_len = args.max_len

    if args.train:
        train_x, train_y = myio.read_annotations(args.train)
        train_words = set([word for x in train_x for word in x])
        embedding_layer = EmbeddingLayer(n_d=args.hidden_dimension,
                                         vocab=["<unk>", "<padding>"] +
                                         list(train_words),
                                         oov="<unk>",
                                         fix_init_embs=False)
        train_x = [embedding_layer.map_to_ids(x)[:max_len] for x in train_x]

    if args.dev:
        dev_x, dev_y = myio.read_annotations(args.dev)
        dev_x = [embedding_layer.map_to_ids(x)[:max_len] for x in dev_x]

    if args.load_rationale:
        rationale_data = myio.read_rationales(args.load_rationale)
        for x in rationale_data:
            x["xids"] = embedding_layer.map_to_ids(x["x"])

    if args.train:
        model = Model(args=args,
                      embedding_layer=embedding_layer,
                      nclasses=len(train_y[0]))
        model.ready()

        #debug_func2 = theano.function(
        #        inputs = [ model.x, model.z ],
        #        outputs = model.generator.logpz
        #    )
        #theano.printing.debugprint(debug_func2)
        #return

        model.train(
            (train_x, train_y),
            (dev_x, dev_y) if args.dev else None,
            None,  #(test_x, test_y),
            rationale_data if args.load_rationale else None)
Esempio n. 9
0
def main():
    print args
    assert args.embedding, "Pre-trained word embeddings required."

    embedding_layer = myio.create_embedding_layer(
                        args.embedding
                    )

    max_len = args.max_len

    if args.train:
        train_x, train_y = myio.read_annotations(args.train)
        train_x = [ embedding_layer.map_to_ids(x)[:max_len] for x in train_x ]

    if args.dev:
        dev_x, dev_y = myio.read_annotations(args.dev)
        dev_x = [ embedding_layer.map_to_ids(x)[:max_len] for x in dev_x ]

    if args.load_rationale:
        rationale_data = myio.read_rationales(args.load_rationale)
        for x in rationale_data:
            x["xids"] = embedding_layer.map_to_ids(x["x"])

    if args.train:
        model = Model(
                    args = args,
                    embedding_layer = embedding_layer,
                    nclasses = len(train_y[0])
                )
        model.ready()

        #debug_func2 = theano.function(
        #        inputs = [ model.x, model.z ],
        #        outputs = model.generator.logpz
        #    )
        #theano.printing.debugprint(debug_func2)
        #return

        model.train(
                (train_x, train_y),
                (dev_x, dev_y) if args.dev else None,
                None, #(test_x, test_y),
                rationale_data if args.load_rationale else None
            )
Esempio n. 10
0
def main():
    """Build the model and run exactly one of train, dev or test mode.

    Reads its configuration from the module-level ``args``.  The modes are
    checked in the order train, dev, test; the first one that is set wins.
    """
    assert args.embedding, "Pre-trained word embeddings required."

    vocab = myio.get_vocab(args)
    embedding_layer = myio.create_embedding_layer(
        args, args.embedding, vocab, args.embedding_dim, '<unk>')
    position_emb_layer = myio.create_posit_embedding_layer(args.inp_len, 30)

    model = Model(args=args,
                  embedding_layer=embedding_layer,
                  embedding_layer_posit=position_emb_layer,
                  nclasses=args.nclasses)

    if args.train:
        if args.pretrain:
            model.ready_pretrain()
            model.pretrain()
            return

        if args.load_model_pretrain:
            # Pre-training checkpoints are looked up under a 'pretrain/'
            # prefix appended to args.save_model.
            checkpoint = args.save_model + 'pretrain/' + args.load_model
            model.load_model_pretrain(checkpoint, inference=False)
        else:
            model.ready()
        model.train()
        return

    if args.dev:
        model.load_model(args.save_model + args.load_model)
        model.dev_full()
        return

    if args.test:
        model.load_model(args.save_model + args.load_model, True)
        model.test()
Esempio n. 11
0
def main():
    print args
    assert args.embedding, "Pre-trained word embeddings required."

    embedding_layer = myio.create_embedding_layer(
                        args.embedding
                    )

    max_len = args.max_len

    if args.train:
        train_x, train_y = myio.read_annotations(args.train)
        train_x = [ embedding_layer.map_to_ids(x)[:max_len] for x in train_x ]

    if args.dev:
        dev_x, dev_y = myio.read_annotations(args.dev)
        dev_x = [ embedding_layer.map_to_ids(x)[:max_len] for x in dev_x ]

    if args.load_rationale:
        rationale_data = myio.read_rationales(args.load_rationale)
        for x in rationale_data:
            x["xids"] = embedding_layer.map_to_ids(x["x"])

    if args.train:
        model = Model(
                    args = args,
                    embedding_layer = embedding_layer,
                    nclasses = len(train_y[0])
                )
        model.ready()

        #debug_func2 = theano.function(
        #        inputs = [ model.x, model.z ],
        #        outputs = model.generator.logpz
        #    )
        #theano.printing.debugprint(debug_func2)
        #return

        model.train(
                (train_x, train_y),
                (dev_x, dev_y) if args.dev else None,
                None, #(test_x, test_y),
                rationale_data if args.load_rationale else None
            )

    if args.load_model and args.dev and not args.train:
        model = Model(
                    args = None,
                    embedding_layer = embedding_layer,
                    nclasses = -1
                )
        model.load_model(args.load_model)
        say("model loaded successfully.\n")

        # compile an evaluation function
        eval_func = theano.function(
                inputs = [ model.x, model.y ],
                outputs = [ model.z, model.encoder.obj, model.encoder.loss,
                                model.encoder.pred_diff ],
                updates = model.generator.sample_updates
            )

        # compile a predictor function
        pred_func = theano.function(
                inputs = [ model.x ],
                outputs = [ model.z, model.encoder.preds ],
                updates = model.generator.sample_updates
            )

        # batching data
        padding_id = embedding_layer.vocab_map["<padding>"]
        dev_batches_x, dev_batches_y = myio.create_batches(
                        dev_x, dev_y, args.batch, padding_id
                    )

        # disable dropout
        model.dropout.set_value(0.0)
        dev_obj, dev_loss, dev_diff, dev_p1 = model.evaluate_data(
                dev_batches_x, dev_batches_y, eval_func, sampling=True)
        say("{} {} {} {}\n".format(dev_obj, dev_loss, dev_diff, dev_p1))
Esempio n. 12
0
def main():
    print args
    set_default_rng_seed(args.seed)
    assert args.embedding, "Pre-trained word embeddings required."

    embedding_layer = myio.create_embedding_layer(args.embedding)

    max_len = args.max_len

    if args.train:
        train_x, train_y = myio.read_annotations(args.train)
        if args.debug:
            len_ = len(train_x) * args.debug
            len_ = int(len_)
            train_x = train_x[:len_]
            train_y = train_y[:len_]
        print 'train size: ', len(train_x)  #, train_x[0], len(train_x[0])
        #exit()
        train_x = [embedding_layer.map_to_ids(x)[:max_len] for x in train_x]

    if args.dev:
        dev_x, dev_y = myio.read_annotations(args.dev)
        if args.debug:
            len_ = len(dev_x) * args.debug
            len_ = int(len_)
            dev_x = dev_x[:len_]
            dev_x = dev_y[:len_]
        print 'train size: ', len(train_x)
        dev_x = [embedding_layer.map_to_ids(x)[:max_len] for x in dev_x]

    if args.load_rationale:
        rationale_data = myio.read_rationales(args.load_rationale)
        for x in rationale_data:
            x["xids"] = embedding_layer.map_to_ids(x["x"])

    #print 'in main: ', args.seed
    if args.train:
        model = Model(args=args,
                      embedding_layer=embedding_layer,
                      nclasses=len(train_y[0]))
        if args.load_model:
            model.load_model(args.load_model,
                             seed=args.seed,
                             select_all=args.select_all)
            say("model loaded successfully.\n")
        else:
            model.ready()
        #say(" ready time nedded {} \n".format(time.time()-start_ready_time))

        #debug_func2 = theano.function(
        #        inputs = [ model.x, model.z ],
        #        outputs = model.generator.logpz
        #    )
        #theano.printing.debugprint(debug_func2)
        #return

        model.train(
            (train_x, train_y),
            (dev_x, dev_y) if args.dev else None,
            None,  #(test_x, test_y),
            rationale_data if args.load_rationale else None,
            trained_max_epochs=args.trained_max_epochs)

    if args.load_model and not args.dev and not args.train:
        model = Model(args=args, embedding_layer=embedding_layer, nclasses=-1)
        model.load_model(args.load_model,
                         seed=args.seed,
                         select_all=args.select_all)
        say("model loaded successfully.\n")

        sample_generator = theano.function(
            inputs=[model.x],
            outputs=model.z,
            #updates = model.generator.sample_updates
        )
        sample_encoder = theano.function(
            inputs=[model.x, model.y, model.z],
            outputs=[
                model.encoder.obj, model.encoder.loss, model.encoder.pred_diff
            ],
            #updates = model.generator.sample_updates
        )
        # compile an evaluation function
        eval_func = theano.function(
            inputs=[model.x, model.y],
            outputs=[
                model.z, model.encoder.obj, model.encoder.loss,
                model.encoder.pred_diff
            ],
            #updates = model.generator.sample_updates
        )
        debug_func_enc = theano.function(
            inputs=[model.x, model.y],
            outputs=[
                model.z, model.encoder.obj, model.encoder.loss,
                model.encoder.pred_diff
            ],
            #updates = model.generator.sample_updates
        )
        debug_func_gen = theano.function(
            inputs=[model.x, model.y],
            outputs=[
                model.z, model.encoder.obj, model.encoder.loss,
                model.encoder.pred_diff
            ],
            #updates = model.generator.sample_updates
        )

        # compile a predictor function
        pred_func = theano.function(
            inputs=[model.x],
            outputs=[model.z, model.encoder.preds],
            #updates = model.generator.sample_updates
        )

        # batching data
        padding_id = embedding_layer.vocab_map["<padding>"]
        if rationale_data is not None:
            valid_batches_x, valid_batches_y = myio.create_batches(
                [u["xids"] for u in rationale_data],
                [u["y"] for u in rationale_data],
                args.batch,
                padding_id,
                sort=False)

        # disable dropout
        model.dropout.set_value(0.0)
        if rationale_data is not None:
            #model.dropout.set_value(0.0)
            start_rational_time = time.time()
            r_mse, r_p1, r_prec1, r_prec2, gen_time, enc_time, prec_cal_time = model.evaluate_rationale(
                rationale_data, valid_batches_x, valid_batches_y,
                sample_generator, sample_encoder, eval_func)
            #valid_batches_y, eval_func)

            #model.dropout.set_value(dropout_prob)
            #say(("\ttest rationale mser={:.4f}  p[1]r={:.2f}  prec1={:.4f}" +
            #            "  prec2={:.4f} generator time={:.4f} encoder time={:.4f} total test time={:.4f}\n").format(
            #        r_mse,
            #        r_p1,
            #        r_prec1,
            #        r_prec2,
            #        gen_time,
            #        enc_time,
            #        time.time() - start_rational_time
            #))

            data = str('%.5f' % r_mse) + "\t" + str(
                '%4.2f' % r_p1) + "\t" + str('%4.4f' % r_prec1) + "\t" + str(
                    '%4.4f' %
                    r_prec2) + "\t" + str('%4.2f' % gen_time) + "\t" + str(
                        '%4.2f' % enc_time) + "\t" + str(
                            '%4.2f' % prec_cal_time) + "\t" + str(
                                '%4.2f' % (time.time() - start_rational_time)
                            ) + "\t" + str(args.sparsity) + "\t" + str(
                                args.coherent) + "\t" + str(
                                    args.max_epochs) + "\t" + str(
                                        args.cur_epoch)

            with open(args.graph_data_path, 'a') as g_f:
                print 'writning to file: ', data
                g_f.write(data + "\n")
Esempio n. 13
0
def main():
    """Train a model, or evaluate a saved one on the dev set.

    Reads its configuration from the module-level ``args``.  In the
    evaluation branch (``args.load_model`` and ``args.dev`` set,
    ``args.train`` unset) the compiled functions substitute
    ``model.generator.z_pred`` for ``model.z`` through ``givens``.
    """
    print(args)
    assert args.embedding, "Pre-trained word embeddings required."

    embedding_layer = myio.create_embedding_layer(args.embedding)
    max_len = args.max_len

    def to_ids(sequences):
        # Map each sequence to ids and slice the result to max_len.
        return [embedding_layer.map_to_ids(seq)[:max_len]
                for seq in sequences]

    if args.train:
        train_x, train_y = myio.read_annotations(args.train)
        train_x = to_ids(train_x)

    if args.dev:
        dev_x, dev_y = myio.read_annotations(args.dev)
        dev_x = to_ids(dev_x)

    rationale_data = None
    if args.load_rationale:
        rationale_data = myio.read_rationales(args.load_rationale)
        for entry in rationale_data:
            entry["xids"] = embedding_layer.map_to_ids(entry["x"])

    if args.train:
        model = Model(args=args,
                      embedding_layer=embedding_layer,
                      nclasses=len(train_y[0]))
        model.ready()
        model.train((train_x, train_y),
                    (dev_x, dev_y) if args.dev else None,
                    None,
                    rationale_data)
        return

    if not (args.load_model and args.dev):
        return

    model = Model(args=None, embedding_layer=embedding_layer, nclasses=-1)
    model.load_model(args.load_model)
    say("model loaded successfully.\n")

    # compile an evaluation function
    eval_func = theano.function(
        inputs=[model.x, model.y],
        outputs=[model.z, model.generator.obj, model.generator.loss,
                 model.encoder.pred_diff],
        givens={model.z: model.generator.z_pred},
    )

    # compile a predictor function (compiled but not called below)
    pred_func = theano.function(
        inputs=[model.x],
        outputs=[model.z, model.encoder.preds],
        givens={model.z: model.generator.z_pred},
    )

    # batching data
    padding_id = embedding_layer.vocab_map["<padding>"]
    dev_batches_x, dev_batches_y = myio.create_batches(
        dev_x, dev_y, args.batch, padding_id)

    # disable dropout
    model.dropout.set_value(0.0)
    metrics = model.evaluate_data(dev_batches_x, dev_batches_y, eval_func,
                                  sampling=True)
    dev_obj, dev_loss, dev_diff, dev_p1 = metrics
    say("{} {} {} {}\n".format(dev_obj, dev_loss, dev_diff, dev_p1))
Esempio n. 14
0
def main(args):
    """Prepare corpus, evaluation and held-out data; train when requested.

    ``args`` supplies every path and option read below.  If ``args.heldout``
    names a file, the whitespace-separated ids in it that occur in the corpus
    form the held-out set and are excluded from the corpus passed to
    ``model.train``.
    """
    raw_corpus = myio.read_corpus(args.corpus)
    if args.embeddings:
        embs = load_embedding_iterator(args.embeddings)
    else:
        embs = None
    embedding_layer = myio.create_embedding_layer(raw_corpus,
                                                  n_d=args.hidden_dim,
                                                  cut_off=args.cut_off,
                                                  embs=embs)
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                 len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]
    bos_id = embedding_layer.vocab_map["<s>"]
    eos_id = embedding_layer.vocab_map["</s>"]

    weights = (myio.create_idf_weights(args.corpus, embedding_layer)
               if args.reweight else None)

    dev = None
    if args.dev:
        dev_annotations = myio.read_annotations(args.dev, K_neg=20,
                                                prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev_annotations,
                                       padding_id)
    test = None
    if args.test:
        test_annotations = myio.read_annotations(args.test, K_neg=20,
                                                 prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test_annotations,
                                        padding_id)

    # Defaults used when no held-out file is given.
    corpus_for_training = ids_corpus
    heldout = None
    if args.heldout:
        with open(args.heldout) as fin:
            heldout_ids = fin.read().split()

        heldout_corpus = {}
        for key in heldout_ids:
            if key in ids_corpus:
                heldout_corpus[key] = ids_corpus[key]

        corpus_for_training = {}
        for key in ids_corpus:
            if key not in heldout_corpus:
                corpus_for_training[key] = ids_corpus[key]

        triples = myio.create_batches(heldout_corpus, [], args.batch_size,
                                      padding_id, bos_id, eos_id,
                                      auto_encode=True)
        # Only the second and third element of each triple are used.
        heldout = [myio.create_one_batch(triple[1], triple[2], padding_id)
                   for triple in map(tuple, triples) if len(triple) == 3]
        say("heldout examples={}\n".format(len(heldout_corpus)))

    if not args.train:
        return

    model = Model(args, embedding_layer, weights=weights)

    start_time = time.time()
    train = myio.read_annotations(args.train)
    if not args.use_anno:
        train = []
    # The batches built here are not used afterwards; only the time taken to
    # build them is reported.
    train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                                        model.padding_id, model.bos_id,
                                        model.eos_id, auto_encode=True)
    say("{} to create batches\n".format(time.time() - start_time))
    model.ready()

    model.train(corpus_for_training, train, dev, test, heldout)
Esempio n. 15
0
def main():
    """Train a model, or score a saved one on the dev set, as chosen by ``args``.

    Everything is driven by the module-level ``args`` namespace:

    * ``args.train`` / ``args.dev``: documents are read with
      ``myio.read_docs``, mapped to ids and truncated.
    * ``args.load_rationale``: rationale records are read and given an
      ``"xids"`` entry.
    * ``args.train`` set: a ``Model`` is built, readied and trained.
    * ``args.load_model`` and ``args.dev`` set, ``args.train`` unset: a
      saved model is loaded and four dev-set values returned by
      ``model.evaluate_data`` are written with ``say``.
    """
    assert args.embedding, "Pre-trained word embeddings required."

    # Two separate layer objects are built from the same embedding source;
    # one is used for the x side of the data and one for the y side.
    embedding_layer = myio.create_embedding_layer(args.embedding)
    embedding_layer_y = myio.create_embedding_layer(args.embedding)

    # Maximum number of ids kept per x / y sequence.
    max_len_x = args.sentence_length * args.max_sentences
    max_len_y = args.sentence_length_hl * args.max_sentences_hl

    def encode(layer, sequences, limit):
        # Map each sequence to ids and keep at most `limit` of them.
        return [layer.map_to_ids(seq)[:limit] for seq in sequences]

    if args.train:
        train_x, train_y = myio.read_docs(args.train)
        train_x = encode(embedding_layer, train_x, max_len_x)
        train_y = encode(embedding_layer_y, train_y, max_len_y)

    if args.dev:
        dev_x, dev_y = myio.read_docs(args.dev)
        dev_x = encode(embedding_layer, dev_x, max_len_x)
        dev_y = encode(embedding_layer_y, dev_y, max_len_y)

    if args.load_rationale:
        rationale_data = myio.read_rationales(args.load_rationale)
        for record in rationale_data:
            # Not truncated, unlike the train/dev sequences above.
            record["xids"] = embedding_layer.map_to_ids(record["x"])

    if args.train:
        model = Model(args=args,
                      embedding_layer=embedding_layer,
                      embedding_layer_y=embedding_layer_y,
                      nclasses=len(train_y[0]))
        model.ready()

        dev_data = (dev_x, dev_y) if args.dev else None
        rationales = rationale_data if args.load_rationale else None
        # The third argument (the test-set slot in the original call) is
        # deliberately None.
        model.train((train_x, train_y), dev_data, None, rationales)

    evaluate_only = args.load_model and args.dev and not args.train
    if evaluate_only:
        # NOTE(review): unlike the training path, no embedding_layer_y is
        # passed here -- confirm Model accepts that.
        model = Model(args=None, embedding_layer=embedding_layer, nclasses=-1)
        model.load_model(args.load_model)
        say("model loaded successfully.\n")

        encoder = model.encoder

        # Evaluation function: (x, y) -> z, objective, loss, pred_diff.
        eval_outputs = [model.z, encoder.obj, encoder.loss, encoder.pred_diff]
        eval_func = theano.function(inputs=[model.x, model.y],
                                    outputs=eval_outputs,
                                    updates=model.generator.sample_updates)

        # Predictor function: x -> z, preds.
        # NOTE(review): compiled but not used anywhere in this function.
        pred_func = theano.function(inputs=[model.x],
                                    outputs=[model.z, encoder.preds],
                                    updates=model.generator.sample_updates)

        # Batch the dev data, padding with the "<padding>" vocabulary id.
        padding_id = embedding_layer.vocab_map["<padding>"]
        dev_batches_x, dev_batches_y = myio.create_batches(
            dev_x, dev_y, args.batch, padding_id)

        # Dropout is switched off before evaluating.
        model.dropout.set_value(0.0)
        obj, loss, diff, p1 = model.evaluate_data(
            dev_batches_x, dev_batches_y, eval_func, sampling=True)
        say("{} {} {} {}\n".format(obj, loss, diff, p1))
Esempio n. 16
0
def _gather_baseline_batches(split, num_files):
    """Collect the ``bm`` and ``rx`` batches from every batch file of *split*.

    Args:
        split: suffix appended to ``args.batch_dir + args.source`` to form
            the path prefix handed to ``myio.load_batches`` (``'dev'`` or
            ``'test'`` in this file).
        num_files: number of batch files to load, indexed ``0..num_files-1``.

    Returns:
        ``(bm_ls, rx_ls)``: two flat lists with one entry per batch.
    """
    prefix = args.batch_dir + args.source + split
    bm_ls = []
    rx_ls = []

    for file_index in range(num_files):
        loaded = tuple(myio.load_batches(prefix, file_index))

        # The dev path used to unpack six values (x, _, _, bm, sha, rx) and
        # the test path four (x, bm, sha, rx) from this same loader, so one
        # of them could not match the loader's actual return.  Taking the
        # first and the last three entries is correct for either layout.
        # NOTE(review): confirm load_batches' actual return arity.
        batches_x = loaded[0]
        batches_bm, _, batches_rx = loaded[-3:]

        # The number of batches in a file is defined by its x batches.
        for j in range(len(batches_x)):
            rx_ls.append(batches_rx[j])
            bm_ls.append(batches_bm[j])

    return bm_ls, rx_ls


def main():
    """Run exactly one mode selected by the module-level ``args``.

    A ``Model`` is always constructed first.  Then the first truthy flag in
    this order decides what happens:

    * ``args.dev_baseline`` / ``args.test_baseline``: gather the stored
      batches and call ``myio.eval_baseline``.
    * ``args.train``: pre-train, or train (optionally starting from a
      pre-trained checkpoint).
    * ``args.dev``: load a checkpoint and run the dev evaluation.
    * ``args.test``: load a checkpoint and run the test evaluation.
    """
    assert args.embedding, "Pre-trained word embeddings required."

    vocab = myio.get_vocab(args)
    embedding_layer = myio.create_embedding_layer(args, args.embedding, vocab)

    n_classes = args.nclasses

    model = Model(args=args,
                  embedding_layer=embedding_layer,
                  nclasses=n_classes)

    if args.dev_baseline:
        bm_ls, rx_ls = _gather_baseline_batches('dev', args.num_files_dev)
        myio.eval_baseline(args, bm_ls, rx_ls, 'dev')

    elif args.test_baseline:
        bm_ls, rx_ls = _gather_baseline_batches('test', args.num_files_test)
        myio.eval_baseline(args, bm_ls, rx_ls, 'test')

    elif args.train:
        if args.pretrain:
            model.ready_pretrain()
            model.pretrain()
        else:
            if args.load_model_pretrain:
                # Pre-trained checkpoints live under the 'pretrain/'
                # sub-directory of the save location.
                model.load_model_pretrain(args.save_model + 'pretrain/' +
                                          args.load_model)
            else:
                model.ready()

            model.train()

    elif args.dev:
        if args.pretrain:
            model.load_model_pretrain(args.save_model + 'pretrain/' +
                                      args.load_model)
            model.dev()
        else:
            model.load_model(args.save_model + args.load_model)
            model.dev_full()

    elif args.test:
        model.load_model(args.save_model + args.load_model, True)
        model.test()
def main(args):
    """Prepare data as described by *args*, then train / evaluate / analyze.

    Steps, in order:

    1. Read the raw corpus and the generated questions, build the embedding
       layer and map the corpus to ids.
    2. Optionally build IDF weights (``args.reweight``) and dev / test
       evaluation batches (``args.dev`` / ``args.test``).
    3. When ``args.train`` is set, build and ready a ``Model`` and then,
       independently of each other:
       * ``args.do_train == 1``: train it,
       * ``args.do_evaluate == 1``: load pre-trained parameters and evaluate,
       * non-blank ``args.analyze_file``: load pre-trained parameters and
         run the analysis into that file.

    ``weights``, ``dev`` and ``test`` are ``None`` whenever their option is
    not given, so every consumer (including ``model.analyze``) receives
    ``None`` instead of hitting an unbound local.

    Args:
        args: parsed command-line namespace.
    """
    raw_corpus = myio.read_corpus(args.corpus, args.translations or None,
                                  args.translatable_ids or None,
                                  args.generated_questions_train or None)

    generated_questions_eval = myio.read_generated_questions(
        args.generated_questions)

    # With trainable embeddings the only difference is the explicit
    # fix_init_embs=False; otherwise the keyword is left to its default.
    layer_options = {}
    if args.trainable_embeddings == 1:
        layer_options["fix_init_embs"] = False
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=args.hidden_dim,
        cut_off=args.cut_off,
        embs=load_embedding_iterator(args.embeddings)
        if args.embeddings else None,
        **layer_options)

    ids_corpus = myio.map_corpus(raw_corpus,
                                 embedding_layer,
                                 max_len=args.max_seq_len,
                                 generated_questions=generated_questions_eval)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                 len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    weights = None
    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    dev = None
    if args.dev:
        dev = myio.read_annotations(args.dev,
                                    K_neg=args.dev_pool_size,
                                    prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus,
                                       dev,
                                       padding_id,
                                       pad_left=not args.average)

    test = None
    if args.test:
        test = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus,
                                        test,
                                        padding_id,
                                        pad_left=not args.average)

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(
            args.train, training_data_percent=args.training_data_percent)
        train_batches = myio.create_batches(ids_corpus,
                                            train,
                                            args.batch_size,
                                            padding_id,
                                            pad_left=not args.average,
                                            include_generated_questions=True)
        say("{} to create batches\n".format(time.time() - start_time))
        say("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(x[0].ravel()) + len(x[1].ravel()) for x in train_batches),
            sum(len(x[2].ravel()) for x in train_batches)))
        # These batches were only needed for the statistics above; drop the
        # reference before the model is built.
        train_batches = None

        model = Model(args, embedding_layer, weights=weights)
        model.ready()

        if args.do_train == 1:
            # Optionally start from a pre-trained network.
            if args.load_pretrain:
                model.load_pretrained_parameters(args)

            model.train(ids_corpus, train, dev, test)

        if args.do_evaluate == 1:
            # A single evaluation pass over the pre-trained parameters.
            model.load_pretrained_parameters(args)
            model.just_eval(dev, test)

        # Analyze the results when an output file name was supplied.
        file_name = args.analyze_file.strip()
        if file_name:
            model.load_pretrained_parameters(args)
            model.analyze(file_name, embedding_layer, dev)