Example #1
# NB: project helpers (read_df, myio, create_embedding_layer,
# load_embedding_iterator) and the parsed `args` are assumed to be
# imported/defined elsewhere in the module.
import pickle
from datetime import datetime


def main():
    print('Starting at: {}\n'.format(datetime.now()))
    df = read_df(args.df_path)
    df = df.fillna(u'')

    with open(args.tags_file, 'rb') as f:
        label_tags = pickle.load(f)
    print('\nloaded {} tags'.format(len(label_tags)))

    raw_corpus = myio.read_corpus(args.corpus_w_tags, with_tags=True)

    embedding_layer = create_embedding_layer(
        n_d=200,
        embs=load_embedding_iterator(args.embeddings),
        # only_words=True keeps the vocabulary from the embedding file
        # but gives the words random initial vectors instead of the
        # pre-trained ones
        only_words=not args.use_embeddings,
        trainable=args.trainable
    )

    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, label_tags, max_len=args.max_seq_len)

    print("vocab size={}, corpus size={}\n".format(embedding_layer.n_V, len(raw_corpus)))

    padding_id = embedding_layer.vocab_map["<padding>"]

    weights = myio.create_idf_weights(args.corpus_w_tags, embedding_layer,
                                      with_tags=True) if args.reweight else None

    if args.layer.lower() == "lstm":
        from models import LstmMultiTagsClassifier as Model
    elif args.layer.lower() in ["bilstm", "bigru"]:
        from models import BiRNNMultiTagsClassifier as Model
    elif args.layer.lower() == "cnn":
        from models import CnnMultiTagsClassifier as Model
    elif args.layer.lower() == "gru":
        from models import GruMultiTagsClassifier as Model
    else:
        raise ValueError("unknown layer: {}".format(args.layer))

    if args.cross_val:
        train, dev, test = myio.create_cross_val_batches(df, ids_corpus, args.batch_size, padding_id)
    else:
        dev = list(myio.create_batches(
            df, ids_corpus, 'dev', args.batch_size, padding_id, N_neg=args.n_neg, samples_file=args.samples_file))
        test = list(myio.create_batches(
            df, ids_corpus, 'test', args.batch_size, padding_id, N_neg=args.n_neg, samples_file=args.samples_file))
    # baselines_eval(train, dev, test)

    model = Model(args, embedding_layer, len(label_tags), weights=weights)
    model.ready()

    print('total (non-)trainable params:', model.num_parameters())

    if args.load_pre_trained_part:
        # loading the pre-trained part replaces the old embedding assign ops
        model.init_assign_ops = model.load_pre_trained_part(args.load_pre_trained_part)
    print('\nmodel init_assign_ops: {}\n'.format(model.init_assign_ops))

    model.train_model(df, ids_corpus, dev=dev, test=test)
    print('\nEnded at: {}'.format(datetime.now()))
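
Example #1 reads everything from a module-level `args`. A minimal argparse sketch covering the attributes the function touches; the flag names are taken from the code above, while the types and defaults are illustrative guesses rather than the script's real values:

import argparse

argparser = argparse.ArgumentParser()
argparser.add_argument("--df_path", type=str)        # dataframe with the question data
argparser.add_argument("--tags_file", type=str)      # pickled list of label tags
argparser.add_argument("--corpus_w_tags", type=str)  # corpus annotated with tags
argparser.add_argument("--embeddings", type=str)     # pre-trained word vectors
argparser.add_argument("--samples_file", type=str, default="")
argparser.add_argument("--load_pre_trained_part", type=str, default="")
argparser.add_argument("--layer", type=str, default="lstm")
argparser.add_argument("--use_embeddings", type=int, default=1)
argparser.add_argument("--trainable", type=int, default=1)
argparser.add_argument("--reweight", type=int, default=0)
argparser.add_argument("--cross_val", type=int, default=0)
argparser.add_argument("--max_seq_len", type=int, default=100)
argparser.add_argument("--batch_size", type=int, default=40)
argparser.add_argument("--n_neg", type=int, default=20)
args = argparser.parse_args()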
Example #2
    def __init__(self,
                 model_path,
                 corpus_path,
                 emb_path,
                 session,
                 layer='lstm'):
        raw_corpus = myio.read_corpus(corpus_path)
        embedding_layer = create_embedding_layer(n_d=10,
                                                 embs=load_embedding_iterator(emb_path),
                                                 only_words=False)
        # weights = myio.create_idf_weights(corpus_path, embedding_layer) # todo
        say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                     len(raw_corpus)))

        if layer.lower() == "lstm":
            from models import LstmQR as Model
        elif layer.lower() in ["bilstm", "bigru"]:
            from models import BiRNNQR as Model
        elif layer.lower() == "cnn":
            from models import CnnQR as Model
        elif layer.lower() == "gru":
            from models import GruQR as Model
        else:
            raise ValueError("unknown layer: {}".format(layer))

        model = Model(args={"layer": layer},
                      embedding_layer=embedding_layer,
                      weights=None)

        model.load_n_set_model(model_path, session)
        say("model initialized\n")

        self.model = model

        def score_func(titles, bodies, cur_sess):
            feed_dict = {
                # note the transpose: the model expects the word-id
                # matrices transposed (likely time-major layout)
                self.model.titles_words_ids_placeholder: titles.T,
                self.model.bodies_words_ids_placeholder: bodies.T,
                self.model.dropout_prob: 0.,  # disable dropout at inference
            }
            return cur_sess.run(self.model.scores, feed_dict)

        self.score_func = score_func
        say("scoring function compiled\n")
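
A hypothetical usage sketch for the wrapper above; `QRAPI` stands in for the enclosing class, whose name the snippet does not show, and the paths and matrix shapes are placeholders:

import numpy as np
import tensorflow as tf

with tf.Session() as sess:
    api = QRAPI("saved/model.ckpt",     # model_path
                "data/corpus.txt",      # corpus_path
                "data/embeddings.txt",  # emb_path
                sess,
                layer="lstm")
    # batches of word-id matrices (the exact shape convention is an assumption)
    titles = np.zeros((8, 20), dtype=np.int32)
    bodies = np.zeros((8, 100), dtype=np.int32)
    scores = api.score_func(titles, bodies, sess)  # one relevance score per question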
Example #3
def main():
    print('Starting at: {}\n'.format(datetime.now()))
    raw_corpus = qaio.read_corpus(args.corpus)
    embedding_layer = create_embedding_layer(
        n_d=200,
        embs=load_embedding_iterator(args.embeddings),
        only_words=not args.use_embeddings,
        trainable=args.trainable)
    print("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                   len(raw_corpus)))
    weights = qaio.create_idf_weights(args.corpus, embedding_layer) if args.reweight else None

    with open(args.tags_file, 'rb') as f:
        label_tags = pickle.load(f)
    print('\nloaded {} tags'.format(len(label_tags)))

    raw_corpus_tags = tpio.read_corpus(args.corpus_w_tags, with_tags=True)
    ids_corpus_tags = tpio.map_corpus(raw_corpus_tags,
                                      embedding_layer,
                                      label_tags,
                                      max_len=args.max_seq_len)

    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.dev:
        dev = qaio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus_tags,
                                       dev,
                                       padding_id,
                                       N_neg=args.n_neg,
                                       samples_file=args.samples_file)

    if args.test:
        test = qaio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus_tags,
                                        test,
                                        padding_id,
                                        N_neg=args.n_neg,
                                        samples_file=args.samples_file)

    if args.train:
        train = qaio.read_annotations(args.train)

        if args.layer.lower() == "lstm":
            from models import LstmQRTP as Model
        elif args.layer.lower() in ["bilstm", "bigru"]:
            from models import BiRNNQRTP as Model
        elif args.layer.lower() == "cnn":
            from models import CnnQRTP as Model
        elif args.layer.lower() == "gru":
            from models import GruQRTP as Model
        else:
            raise ValueError("unknown layer: {}".format(args.layer))

        model = Model(args,
                      embedding_layer,
                      len(label_tags),
                      weights=weights)
        model.ready()
        print('total (non-)trainable params:', model.num_parameters())

        if args.load_pre_trained_part:
            # loading the pre-trained part replaces the old embedding assign ops
            model.init_assign_ops = model.load_pre_trained_part(
                args.load_pre_trained_part)
        print('\nmodel init_assign_ops: {}\n'.format(model.init_assign_ops))

        model.train_model(ids_corpus_tags,
                          train,
                          dev=dev if args.dev else None,
                          test=test if args.test else None)
    print('\nEnded at: {}'.format(datetime.now()))
Example #4
def main():
    print('Starting at: {}\n'.format(datetime.now()))
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = create_embedding_layer(
        n_d=200,
        embs=load_embedding_iterator(args.embeddings),
        only_words=not args.use_embeddings,
        trainable=args.trainable
    )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer, max_len=args.max_seq_len)
    print("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                   len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    weights = myio.create_idf_weights(args.corpus, embedding_layer) if args.reweight else None

    if args.layer.lower() == "lstm":
        from models import LstmQR as Model
    elif args.layer.lower() in ["bilstm", "bigru"]:
        from models import BiRNNQR as Model
    elif args.layer.lower() == "cnn":
        from models import CnnQR as Model
    elif args.layer.lower() == "gru":
        from models import GruQR as Model
    else:
        raise ValueError("unknown layer: {}".format(args.layer))

    if args.dev:
        dev = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id, pad_left=False)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id, pad_left=False)

    model = Model(args, embedding_layer, weights=weights)
    model.ready()

    print('total (non-)trainable params:', model.num_parameters())

    if args.load_pre_trained_part:
        # loading the pre-trained part replaces the old embedding assign ops
        model.init_assign_ops = model.load_pre_trained_part(args.load_pre_trained_part)
    print('\nmodel init_assign_ops: {}\n'.format(model.init_assign_ops))

    if args.train:
        start_time = time.time()
        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(
            ids_corpus, train, args.batch_size, padding_id, pad_left=False
        )

        print("{:.1f}s to create batches\n".format(time.time() - start_time))
        print("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(x[0].ravel()) + len(x[1].ravel()) for x in train_batches),
            sum(len(x[2].ravel()) for x in train_batches)
        ))

        model.train_model(
            ids_corpus,
            train,
            dev=dev if args.dev else None,
            test=test if args.test else None
        )
    print('\nEnded at: {}'.format(datetime.now()))
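
This script likewise presumes a module-level `args`. Relative to the sketch after Example #1, the retrieval-specific inputs below replace the dataframe and tag files; the names come from the attribute accesses above and the defaults are illustrative:

argparser.add_argument("--corpus", type=str)             # raw question corpus
argparser.add_argument("--train", type=str, default="")  # annotation files with
argparser.add_argument("--dev", type=str, default="")    # positive/negative pairs
argparser.add_argument("--test", type=str, default="")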
Example #5
                           default="")  # output file to write to
    argparser.add_argument("--results_file", type=str,
                           default="")  # output file to write to

    args = argparser.parse_args()
    print('\n', args, '\n')

    df = read_df(args.df_corpus)
    df = df.fillna(u'')

    with open(args.tags_file, 'rb') as f:
        label_tags = pickle.load(f)

    raw_corpus = myio.read_corpus(args.corpus_w_tags, with_tags=True)

    embedding_layer = create_embedding_layer(n_d=10,
                                             embs=load_embedding_iterator(
                                                 args.embeddings),
                                             only_words=False)

    with tf.Session() as sess:

        myqrapi = TPAPI(args.model, embedding_layer, sess, len(label_tags),
                        args.layer)

        embedding_layer = myqrapi.model.embedding_layer

        ids_corpus = myio.map_corpus(raw_corpus,
                                     embedding_layer,
                                     label_tags,
                                     max_len=args.max_seq_len)

        print("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                       len(raw_corpus)))