Example #1
def main(_):

    # Do what you need to load datasets from FLAGS.data_dir
    train_data, val_data = load_preprocess_data(FLAGS.data_dir,
                                                FLAGS.max_context_len,
                                                FLAGS.max_question_len)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    embeddings = tf.constant(load_embeddings(embed_path), tf.float32)

    encoder = Encoder(FLAGS.state_size, FLAGS.summary_flag,
                      FLAGS.max_context_len, FLAGS.max_question_len)
    decoder = Decoder(FLAGS.state_size, FLAGS.summary_flag)

    qa = QASystem(encoder, decoder, FLAGS, embeddings, rev_vocab)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, train_data, val_data, save_train_dir)
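
None of the examples on this page include the body of load_embeddings itself. For reference, here is a minimal sketch of the variant used in Examples #1, #2 and #4, which pass a trimmed GloVe .npz path and wrap the result in tf.constant so the embeddings are not updated during training. The key name "glove" and the exact signature are assumptions, not taken from the original code:

import numpy as np

def load_embeddings(embed_path, key="glove"):
    """Load a pre-trained embedding matrix from an .npz archive.

    Assumes the archive stores one float matrix of shape
    [vocab_size, embedding_size] under `key`; adjust the key to match
    however the trimming script wrote the file.
    """
    with np.load(embed_path) as archive:
        return archive[key].astype(np.float32)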
Example #2
def main(_):
    #======Fill the model name=============
    train_dir = "train/test"
    #======================================
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)
    embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))

    # ========= Load Dataset =========
    train_data, val_data = load_and_preprocess_data(FLAGS.data_dir, FLAGS.max_context_len, FLAGS.max_question_len, size=FLAGS.train_size)
    # ========= Model-specific =========
    # You must change the following code to adjust to your model

    embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    embedding = tf.constant(load_embeddings(embed_path), dtype=tf.float32)
    encoder = Encoder(FLAGS.state_size, FLAGS.max_context_len, FLAGS.max_question_len, FLAGS.embedding_size, FLAGS.summary_flag, FLAGS.filter_flag)
    decoder = Decoder(FLAGS.state_size, FLAGS.max_context_len, FLAGS.max_question_len, FLAGS.output_size, FLAGS.summary_flag)
    qa = QASystem(encoder, decoder, FLAGS, embedding, rev_vocab)

    with tf.Session() as sess:
        train_dir = get_normalized_train_dir(train_dir)
        qa = initialize_model(sess, qa, train_dir)
        output_list, output_dict = generate_answers(sess, qa, val_data, rev_vocab)
        store_result(output_list, output_dict, train_dir)
Example #3
def main():

    # read pre-trained embeddings
    embeddings = load_embeddings(embedding_path, 'word2vec')

    test_accus = []  # Collect test accuracy for each fold
    for i in xrange(n_folds):
        fold = i + 1
        logging.info('Fold {} of {}...'.format(fold, n_folds))
        # read data
        train_data, train_labels, test_data, test_labels, seq_len, vocab_size = load_data_MR(
            data_path, fold=fold)

        # update train directory according to fold number
        train_dir = base_train_dir + '/' + str(fold)
        # create train directory if not exist
        if not os.path.exists(train_dir):
            os.makedirs(train_dir)
        # create log file handler
        file_handler = logging.FileHandler(pjoin(train_dir, "log.txt"))
        logging.getLogger().addHandler(file_handler)

        # check whether the model has been trained, if not, create a new one
        if os.path.exists(train_dir + '/model.json'):
            # load json and create model
            json_file = open(train_dir + '/model.json', 'r')
            loaded_model_json = json_file.read()
            json_file.close()
            model = model_from_json(loaded_model_json)
            # load weights into new model
            model.load_weights(train_dir + "/model.h5")
            model.compile(loss={'output': 'binary_crossentropy'},
                          optimizer=Adadelta(lr=base_lr,
                                             epsilon=1e-6,
                                             decay=decay_rate),
                          metrics=["accuracy"])
            print("Loaded model from disk!")
        else:
            model = setup_model(embeddings, seq_len, vocab_size)
            print("Created a new model!")

        # train the model
        test_accu = train(model, train_data, train_labels, test_data,
                          test_labels, embeddings, train_dir)

        # log test accuracy result
        logging.info("\nTest Accuracy for fold {}: {}".format(fold, test_accu))
        test_accus.append(test_accu)

    # write log of test accuracy for all folds (folds are 1-indexed)
    with open(base_train_dir + "/final_test_accuracy.txt", 'w') as test_accu_log:
        test_accu_log.write('\n'.join([
            'Fold {} Test Accuracy: {}'.format(fold, test_accu)
            for fold, test_accu in enumerate(test_accus, start=1)
        ]))
        test_accu_log.write('\nAvg test acc: {}'.format(np.mean(test_accus)))
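
Example #3 only shows the loading half of the checkpoint round trip; the training code is presumably responsible for writing model.json and model.h5 in the first place. A minimal sketch of that counterpart using the standard Keras serialization calls (the helper name and where it is called from are assumptions):

def save_model(model, train_dir):
    """Persist the model in the layout the loader above expects."""
    # architecture as JSON
    with open(train_dir + '/model.json', 'w') as json_file:
        json_file.write(model.to_json())
    # weights in HDF5
    model.save_weights(train_dir + '/model.h5')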
Example #4
def main(_):

    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way

    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)
    dataset = (context_data, question_data, question_uuid_data)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    embedding = tf.constant(load_embeddings(embed_path), dtype=tf.float32)
    encoder = Encoder(FLAGS.state_size, FLAGS.max_context_len,
                      FLAGS.max_question_len, FLAGS.embedding_size,
                      FLAGS.summary_flag, FLAGS.filter_flag)
    decoder = Decoder(FLAGS.state_size, FLAGS.max_context_len,
                      FLAGS.max_question_len, FLAGS.output_size,
                      FLAGS.summary_flag)
    qa = QASystem(encoder, decoder, FLAGS, embedding, rev_vocab)

    with tf.Session() as sess:
        train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, train_dir)
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # write to json file to root dir
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
Example #5
    def __init__(self, token_vocab: Vocabulary, tag_vocab: Vocabulary,
                 embeddings: Dict, encoder: Dict, tag_projection: Dict):
        super(NeuralCrf, self).__init__()
        self._embeddings = load_embeddings(**embeddings,
                                           token_vocab=token_vocab)
        self._encoder = load_object_from_dict(encoder)
        self._tag_projection = load_object_from_dict(tag_projection)

        self.token_vocab = token_vocab
        self.tag_vocab = tag_vocab
        self.num_tags = len(self.tag_vocab)
        assert self.num_tags == self._tag_projection.out_features
        self.crf = ConditionalRandomField(self.num_tags)

        self.metrics = {
            'accuracy': Accuracy(),
            'accuracy_per_label': AccuracyPerLabel(self.num_tags,
                                                   self.tag_vocab),
            'loss': Average()
        }
Example #6
    def __init__(self, token_vocab: Vocabulary, tag_vocab: Vocabulary,
                 embeddings: Dict, encoder: Dict, tag_projection: Dict):
        super(SimpleTagger, self).__init__()
        self._embeddings = load_embeddings(**embeddings,
                                           token_vocab=token_vocab)
        self._encoder = load_object_from_dict(encoder)
        self._tag_projection = load_object_from_dict(tag_projection)

        self.token_vocab = token_vocab
        self.tag_vocab = tag_vocab
        self.num_tags = len(self.tag_vocab)
        assert self.num_tags == self._tag_projection.out_features

        self.loss = torch.nn.CrossEntropyLoss(
            ignore_index=self.tag_vocab.pad_token_id)
        self.metrics = {
            'accuracy': Accuracy(),
            'accuracy_per_label': AccuracyPerLabel(self.num_tags,
                                                   self.tag_vocab),
            'loss': Average()
        }
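
In Examples #5 and #6, load_embeddings is expected to return something that behaves like a PyTorch embedding module built from the embeddings config dict. A minimal sketch of such a helper, assuming the config simply points at a weight tensor saved with torch.save (all names and parameters below are illustrative, not taken from the original project):

import torch
import torch.nn as nn

def load_embeddings(weights_path, freeze=False, token_vocab=None):
    """Build an nn.Embedding from pre-trained weights.

    `token_vocab` is accepted to mirror the call sites above; only its
    padding index is used here (an assumption about the Vocabulary API).
    """
    weights = torch.load(weights_path)  # float tensor of shape [vocab_size, dim]
    padding_idx = getattr(token_vocab, 'pad_token_id', None) if token_vocab is not None else None
    return nn.Embedding.from_pretrained(weights, freeze=freeze, padding_idx=padding_idx)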
Example #7
def main(args):
    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    embeddings = util.load_embeddings(args)

    # Get model
    log.info('Building model...')
    model = BiDAF(embeddings=embeddings,
                  hidden_size=args.hidden_size) if not args.use_slqa else SLQA(
                      embeddings=embeddings, hidden_size=args.hidden_size)
    model = nn.DataParallel(model, gpu_ids)
    log.info('Loading checkpoint from {}...'.format(args.load_path))
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info('Building dataset...')
    record_file = vars(args)['{}_record_file'.format(args.split)]
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluate
    log.info('Evaluating on {} split...'.format(args.split))
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    eval_file = vars(args)['{}_eval_file'.format(args.split)]
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)
            if args.use_char_emb:
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
            else:
                cc_idxs = None
                qc_idxs = None

            # Forward
            log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [('NLL', nll_meter.avg), ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                for k, v in results.items())
        log.info('{} {}'.format(args.split.title(), results_str))

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info('Writing submission file to {}...'.format(sub_path))
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])
Example #8
    tf.app.flags.DEFINE_float("keep_prob", 1.0, "Keep probability for dropout.")
    tf.app.flags.DEFINE_integer('checkpoint', 1000, 'number of batches until checkpoint.')
    tf.app.flags.DEFINE_integer('num_copies', 1, 'number of copies for associative RNN.')
    tf.app.flags.DEFINE_integer('num_read_keys', 0, 'number of additional read keys for associative RNN.')
    tf.app.flags.DEFINE_string("result_file", None, "Where to write results.")
    tf.app.flags.DEFINE_string("moru_ops", 'max,mul,keep,replace,diff,min,forget', "operations of moru cell.")
    tf.app.flags.DEFINE_string("moru_op_biases", None, "biases of moru operations at beginning of training. "
                                                       "Defaults to 0 for each.")
    tf.app.flags.DEFINE_integer("moru_op_ctr", None, "Size of op ctr. By default ops are controlled by current input"
                                                     "and previous state. Given a positive integer, an additional"
                                                     "recurrent op ctr is introduced in MORUCell.")
    tf.app.flags.DEFINE_boolean('eval', False, 'only evaluation')
    tf.app.flags.DEFINE_string('model_path', '/tmp/snli-model', 'path to save/load the model')
    tf.app.flags.DEFINE_string('device', '/gpu:0', 'device to run on')

    FLAGS = tf.app.flags.FLAGS

    kwargs = None
    if FLAGS.embedding_format == "glove":
        kwargs = {"vocab_size": 2196017, "dim": 300}

    print("Loading embeddings...")
    e = util.load_embeddings(FLAGS.embedding_file, FLAGS.embedding_format)
    print("Done.")


    import json
    print("Configuration: ")
    print(json.dumps(FLAGS.__flags, sort_keys=True, indent=2, separators=(',', ': ')))
    training(e, FLAGS)
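
The snippet in Example #8 reads FLAGS.embedding_format and FLAGS.embedding_file without showing their definitions; presumably they are declared further up with the same tf.app.flags API. A hypothetical sketch of those two declarations (defaults and help strings are invented for illustration):

import tensorflow as tf

tf.app.flags.DEFINE_string("embedding_file", None, "Path to the pre-trained embedding file.")
tf.app.flags.DEFINE_string("embedding_format", "glove", "Format of the embedding file, e.g. 'glove' or 'word2vec'.")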
Example #9
def main(args):
    """Main entry point for training"""

    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: %s', dumps(vars(args), indent=4, sort_keys=True))
    args.batch_size *= max(1, len(args.gpu_ids))

    _init_random_seed(args, log)

    # Get embeddings
    log.info('Loading embeddings...')
    embeddings = util.load_embeddings(args)

    # Get model
    log.info('Building model...')

    model = BiDAF(embeddings=embeddings,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob) if not args.use_slqa else SLQA(
                      embeddings=embeddings,
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from %s...', args.load_path)
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch %s...', epoch)
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, dummy_ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                if args.use_char_emb:
                    cc_idxs = cc_idxs.to(device)
                    qc_idxs = qc_idxs.to(device)
                else:
                    cc_idxs = None
                    qc_idxs = None

                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step %s...', step)
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2,
                                                  args.use_char_emb)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev %s', results_str)

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Example #10
def parse_args():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--prepare', action='store_true')
    parser.add_argument('--train', action='store_true')
    parser.add_argument('--test', action='store_true')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    if args.prepare:
        filter_glove()

    if args.train or args.test:
        weight_matrix, word_idx = load_embeddings(
            PathConfig.filtered_glove_path)

    if args.train:
        train_trees = load_trees(PathConfig.train_path)
        dev_trees = load_trees(PathConfig.dev_path)

    if args.test:
        test_trees = load_trees(PathConfig.test_path)

    if args.train:
        tree_lstm_model = TreeLSTMModel(weight_matrix, word_idx, ModelConfig)

        train_set = tree_lstm_model.compiler.build_loom_inputs(train_trees)

        dev_feed_dict = tree_lstm_model.compiler.build_feed_dict(dev_trees)
Example #11
            state = np.zeros((1, model.config.state_size * model.config.n_layers))
            sent = 0
            with open('output/generated_{}_{:%Y%m%d_%H%M%S}.txt'.format(country_code, datetime.now()), 'w') as f:
                f.write(word + ' ')
                while sent < n_sents:
                    w_input, state = model.random_search(sess, w_input, state, top_n, word_in_vocab)
                    w_char = helper.id2tok[w_input]
                    print(w_char, end=' ')
                    if w_char == '<EOS>':
                        f.write('\n')
                        sent += 1
                    else:
                        f.write(w_char + ' ')
                    w_input = np.array([[w_input]])
                    word_in_vocab = True
                print('"')
            print('Generated text saved to output')


if __name__ == "__main__":
    country_code = 'FRA'
    helper, data_raw = util.load_and_preprocess_data(data_path='../input/un-general-debates.csv', country_code=country_code)
    embedding = util.load_embeddings(country_code)
    assert embedding is not None, 'No pretrained embeddings found. Use skipgram.py to train word embeddings'
    # do_train(country_code, embedding, helper, data_raw)
    generate_w_random_search(country_code, embedding, helper, 'palestine', 20, 5)
Example #12
def do_train(train_bodies, train_stances, dimension, embedding_path, config, 
    max_headline_len=None, max_body_len=None, verbose=False, 
    include_stopwords=True, similarity_metric_feature=None, 
    weight_embeddings=False, idf=False):
    logging.info("Loading training and dev data ...")
    fnc_data, fnc_data_train, fnc_data_dev = util.load_and_preprocess_fnc_data(
        train_bodies, train_stances, include_stopwords, 
        similarity_metric_feature)
    logging.info("%d training examples", len(fnc_data_train.headlines))
    logging.info("%d dev examples", len(fnc_data_dev.headlines))
    if max_headline_len is None:
        max_headline_len = fnc_data_train.max_headline_len
    if max_body_len is None:
        max_body_len = fnc_data_train.max_body_len
    logging.info("Max headline length: %d", max_headline_len)
    logging.info("Max body length: %d", max_body_len)

    # For convenience, create the word indices map over the entire dataset
    logging.info("Building word-to-index map ...")
    corpus = ([w for bod in fnc_data.bodies for w in bod] +
        [w for headline in fnc_data.headlines for w in headline])
    word_indices = util.process_corpus(corpus)
    logging.info("Building embedding matrix ...")
    embeddings, known_words = util.load_embeddings(word_indices=word_indices,
        dimension=dimension, embedding_path=embedding_path,
        weight_embeddings=weight_embeddings)

    logging.info("Vectorizing data ...")
    # Vectorize and assemble the training data
    headline_vectors = util.vectorize(fnc_data_train.headlines, word_indices,
        known_words, max_headline_len)
    body_vectors = util.vectorize(fnc_data_train.bodies, word_indices,
        known_words, max_body_len)

    headlines_pc = bodies_pc = None
    if config.method == "arora":
        headlines_pc = util.arora_embeddings_pc(headline_vectors,
            embeddings)
        bodies_pc = util.arora_embeddings_pc(body_vectors,
            embeddings)
    else:
        headlines_pc = None
        bodies_pc = None

    if config.method == "vanilla_bag_of_words":
        logging.info("Precomputing training sentence embeddings ...")
        train_emb = embeddings
        if idf:
            train_emb = util.idf_embeddings(word_indices,
                headline_vectors + body_vectors, train_emb)
        headlines_emb = util.sentence_embeddings(headline_vectors, dimension,
            max_headline_len, train_emb)
        bodies_emb = util.sentence_embeddings(body_vectors, dimension,
            max_body_len, train_emb)
        training_data = [headlines_emb, bodies_emb, fnc_data_train.stances]
    else:
        training_data = [headline_vectors, body_vectors, fnc_data_train.stances]

    if similarity_metric_feature:
        training_data.append(fnc_data_train.sim_scores)
    training_data = zip(*training_data)

    # Vectorize and assemble the dev data; note that we use the training
    # maximum length
    dev_headline_vectors = util.vectorize(fnc_data_dev.headlines, word_indices,
        known_words, max_headline_len)
    dev_body_vectors = util.vectorize(fnc_data_dev.bodies, word_indices,
        known_words, max_body_len)

    if config.method == "vanilla_bag_of_words":
        logging.info("Precomputing dev sentence embeddings ...")
        test_emb = embeddings
        if idf:
            # TODO(akshayka): Experiment with using whole corpus as
            # documents vs just training vs just testing
            test_emb = util.idf_embeddings(word_indices,
                headline_vectors + dev_headline_vectors + body_vectors +
                dev_body_vectors, test_emb)
        dev_headlines_emb = util.sentence_embeddings(dev_headline_vectors,
            dimension, max_headline_len, test_emb)
        dev_bodies_emb = util.sentence_embeddings(dev_body_vectors,
            dimension, max_body_len, test_emb)
        dev_data = [dev_headlines_emb, dev_bodies_emb, fnc_data_dev.stances]
    else:
        dev_data = [dev_headline_vectors, dev_body_vectors,
            fnc_data_dev.stances]

    if similarity_metric_feature:
        dev_data.append(fnc_data_dev.sim_scores)
    dev_data = zip(*dev_data)

    with tf.Graph().as_default():
        logger.info("Building model...",)
        start = time.time()
        model = FNCModel(config, max_headline_len, max_body_len, embeddings,
            headlines_pc=headlines_pc, bodies_pc=bodies_pc, verbose=verbose)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        with tf.Session() as session:
            session.run(init)
            logging.info('Fitting ...')
            model.fit(session, saver, training_data, dev_data)
            logging.info('Outputting ...')
            output = model.output(session, dev_data)

    indices_to_words = {word_indices[w] : w for w in word_indices}
    # TODO(akshayka): Please code-review this. In particular,
    # please validate whether dev_headline_vectors is an equivalent 
    # representation of output[0][0], and dev_body_vectors for output[0][1]
    headlines = [' '.join(
        util.word_indices_to_words(h, indices_to_words))
        for h in dev_headline_vectors]
    bodies = [' '.join(
        util.word_indices_to_words(b, indices_to_words))
        for b in dev_body_vectors]
    output = zip(headlines, bodies, output[1], output[2])

    with open(model.config.eval_output, 'w') as f, open(
        model.config.error_output, "w") as g:
        for headline, body, label, prediction in output:
            f.write("%s\t%s\tgold:%d\tpred:%d\n\n" % (
                headline, body, label, prediction))
            if label != prediction:
                g.write("%s\t%s\tgold:%d\tpred:%d\n\n" % (
                    headline, body, label, prediction))
Example #13
    def _add_seq2seq(self):
        """Add the whole sequence-to-sequence model to the graph."""
        hps = self._hps
        vsize = self._vocab.size()  # size of the vocabulary

        with tf.variable_scope('seq2seq'):
            # Some initializers
            self.rand_unif_init = tf.random_uniform_initializer(
                -hps.rand_unif_init_mag, hps.rand_unif_init_mag, seed=123)
            self.trunc_norm_init = tf.truncated_normal_initializer(
                stddev=hps.trunc_norm_init_std)

            with tf.variable_scope('embedding'):
                if hps.pretrained_embeddings:
                    word2vec = load_embeddings(hps.embeddings_path,
                                               self._vocab.word2id,
                                               hps.rand_unif_init_mag)
                    self.embedding = tf.get_variable(
                        'embedding', [vsize, hps.emb_dim],
                        dtype=tf.float32,
                        initializer=tf.constant_initializer(word2vec))
                    # self.assign_embedding = tf.assign(self.embedding, word2vec)
                else:
                    self.embedding = tf.get_variable(
                        'embedding', [vsize, hps.emb_dim],
                        dtype=tf.float32,
                        initializer=self.trunc_norm_init)
                if hps.mode == "train":
                    self._add_emb_vis(self.embedding)  # add to tensorboard

                # tensor with shape (batch_size, max_enc_steps, emb_size)
                emb_enc_inputs = tf.nn.embedding_lookup(
                    self.embedding, self._enc_batch)
                if self._hps.hier:
                    enc_batch_sections = tf.unstack(self._enc_batch_sections,
                                                    axis=1)
                    sec_emb_enc_inputs = [
                        tf.nn.embedding_lookup(self.embedding, section)
                        for section in enc_batch_sections
                    ]
                # list length max_dec_steps containing shape (batch_size, emb_size)
                emb_dec_inputs = [
                    tf.nn.embedding_lookup(self.embedding, x)
                    for x in tf.unstack(self._dec_batch, axis=1)
                ]

            # Hierarchical attention model
            if self._hps.hier:
                with tf.variable_scope('encoder'), tf.device(
                        self._next_device()):
                    sec_enc_outs = []
                    states_fw = []
                    states_bw = []
                    states = []

                    # level 1, encode words to sections
                    with tf.variable_scope("word_level_encoder",
                                           reuse=tf.AUTO_REUSE) as scope:
                        encoder_outputs_words = []
                        cell_fw = tf.contrib.rnn.LSTMCell(
                            self._hps.hidden_dim,
                            initializer=self.rand_unif_init,
                            state_is_tuple=True)
                        cell_bw = tf.contrib.rnn.LSTMCell(
                            self._hps.hidden_dim,
                            initializer=self.rand_unif_init,
                            state_is_tuple=True)
                        fw_st, bw_st = None, None
                        if self._hps.use_do:  # DropOut
                            cell_fw = tf.contrib.rnn.DropoutWrapper(
                                cell_fw,
                                output_keep_prob=1.0 - self._hps.do_prob)
                            cell_bw = tf.contrib.rnn.DropoutWrapper(
                                cell_bw,
                                output_keep_prob=1.0 - self._hps.do_prob)
                        for i in range(self._hps.num_sections):
                            encoder_tmp_output, (
                                fw_st, bw_st
                            ) = tf.nn.bidirectional_dynamic_rnn(
                                cell_fw,
                                cell_bw,
                                inputs=sec_emb_enc_inputs[i],
                                dtype=tf.float32,
                                sequence_length=self._batch_sections_len[:, i],
                                swap_memory=True,
                                initial_state_bw=bw_st,
                                initial_state_fw=fw_st)
                            # concatenate the forwards and backwards states
                            encoder_tmp_output = tf.concat(
                                axis=2, values=encoder_tmp_output
                            )  #shape=[batch x seq_len x hidden_size]

                            encoder_outputs_words.append(encoder_tmp_output)
                            # instead of concating the fw and bw states, we use a ff network
                            combined_state = self._reduce_states(fw_st, bw_st)
                            states.append(combined_state)
                            scope.reuse_variables()

                    # level 2, encode sections to doc
                    encoder_outputs_words = tf.stack(
                        encoder_outputs_words, axis=1
                    )  # shape [batch x num_sections x seq_len x hidden_size]
                    shapes = encoder_outputs_words.shape
                    encoder_outputs_words = tf.reshape(
                        encoder_outputs_words,
                        (shapes[0].value, -1, shapes[-1].value)
                    )  #shape=[batch x (seq_len * num_sections) x hidden_size]

                    doc_sections_h = tf.stack(
                        [s.h for s in states],
                        axis=1)  # [batch x num_sections x hidden_size]
                    doc_sections_c = tf.stack(
                        [s.c for s in states],
                        axis=1)  # [batch x num_sections x hidden_size]

                    with tf.variable_scope("section_level_encoder"):
                        if FLAGS.section_level_encoder == 'RNN':
                            cell_fw_1 = tf.contrib.rnn.LSTMCell(
                                self._hps.hidden_dim,
                                initializer=self.rand_unif_init,
                                state_is_tuple=True)
                            cell_bw_1 = tf.contrib.rnn.LSTMCell(
                                self._hps.hidden_dim,
                                initializer=self.rand_unif_init,
                                state_is_tuple=True)
                            if self._hps.use_do:
                                cell_fw_1 = tf.contrib.rnn.DropoutWrapper(
                                    cell_fw_1,
                                    output_keep_prob=1.0 - self._hps.do_prob)
                                cell_bw_1 = tf.contrib.rnn.DropoutWrapper(
                                    cell_bw_1,
                                    output_keep_prob=1.0 - self._hps.do_prob)
                            encoder_output_sections, (fw_st_2, bw_st_2) =\
                                tf.nn.bidirectional_dynamic_rnn(cell_fw_1, cell_bw_1, inputs=doc_sections_h, sequence_length=self._doc_sec_lens, dtype=tf.float32, swap_memory=True)
                            encoder_output_sections = tf.concat(
                                axis=2, values=encoder_output_sections)
                            doc_sections_state = self._reduce_states(
                                fw_st_2, bw_st_2)
                        else:
                            if FLAGS.section_level_encoder == 'AVG':  # average section cells
                                doc_sections_state_h = tf.reduce_mean(
                                    doc_sections_h, axis=1)
                                doc_sections_state_c = tf.reduce_mean(
                                    doc_sections_c, axis=1)
                            elif FLAGS.section_level_encoder == 'FF':  # use a feedforward network to combine section cells
                                # flatten section states to [batch, num_sections * hidden_dim]
                                doc_sections_state_h = tf.reshape(
                                    doc_sections_h,
                                    [-1, self._hps.num_sections * self._hps.hidden_dim])
                                doc_sections_state_h = tf.layers.dense(
                                    inputs=doc_sections_state_h,
                                    units=self._hps.hidden,
                                    activation=tf.nn.relu)
                                doc_sections_state_c = tf.reshape(
                                    doc_sections_c,
                                    [-1, self._hps.num_sections * self._hps.hidden_dim])
                                doc_sections_state_c = tf.layers.dense(
                                    inputs=doc_sections_state_c,
                                    units=self._hps.hidden,
                                    activation=tf.nn.relu)
                            else:
                                raise AttributeError(
                                    'FLAGS.section_level_encoder={} is not a valid option'
                                    .format(FLAGS.section_level_encoder))
                            doc_sections_state = tf.contrib.rnn.LSTMStateTuple(
                                doc_sections_state_c, doc_sections_state_h)
                            encoder_output_sections = doc_sections_h

            elif not self._hps.multi_layer_encoder:
                with tf.variable_scope('encoder'):
                    with tf.variable_scope('word_level_encoder'):
                        cell_fw = tf.contrib.rnn.LSTMCell(
                            self._hps.hidden_dim,
                            initializer=self.rand_unif_init,
                            state_is_tuple=True)
                        cell_bw = tf.contrib.rnn.LSTMCell(
                            self._hps.hidden_dim,
                            initializer=self.rand_unif_init,
                            state_is_tuple=True)
                        (encoder_outputs, (fw_st, bw_st)) =\
                          tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs=emb_enc_inputs, dtype=tf.float32, sequence_length=self._enc_lens, swap_memory=True)
                        # concatenate the forwards and backwards states
                        encoder_outputs = tf.concat(axis=2,
                                                    values=encoder_outputs)

            # stack n layers of lstms for encoder
            elif self._hps.multi_layer_encoder:
                # TODO: check
                for layer_i in xrange(self._hps.enc_layers):
                    with tf.variable_scope('encoder%d' % layer_i), tf.device(
                            self._next_device()):
                        cell_fw = tf.contrib.rnn.LSTMCell(
                            self._hps.hidden_dim,
                            initializer=self.rand_unif_init,
                            state_is_tuple=True)
                        cell_bw = tf.contrib.rnn.LSTMCell(
                            self._hps.hidden_dim,
                            initializer=self.rand_unif_init,
                            state_is_tuple=True)
                        if self._hps.use_do:  # add dropout
                            cell_fw = tf.contrib.rnn.DropoutWrapper(
                                cell_fw,
                                output_keep_prob=1.0 - self._hps.do_prob)
                            cell_bw = tf.contrib.rnn.DropoutWrapper(
                                cell_bw,
                                output_keep_prob=1.0 - self._hps.do_prob)
                        emb_enc_inputs, (fw_st, bw_st) =\
                          tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs=emb_enc_inputs, dtype=tf.float32, sequence_length=self._enc_lens, swap_memory=True)
                        emb_enc_inputs = tf.concat(axis=2,
                                                   values=emb_enc_inputs)
                encoder_outputs = emb_enc_inputs

            if self._hps.hier:
                self._enc_sec_states = encoder_output_sections
                self._enc_states = encoder_outputs_words
            else:
                self._enc_states = encoder_outputs
                self._enc_sec_states = None

            # convert the encoder bidirectional hidden state to the decoder state
            # (unidirectional) by an MLP
            if self._hps.hier:
                self._dec_in_state = doc_sections_state
            else:
                with tf.variable_scope('encoder'):
                    with tf.variable_scope('word_level_encoder'):
                        self._dec_in_state = self._reduce_states(fw_st, bw_st)

            # Add the decoder

            with tf.variable_scope('decoder'), tf.device(self._next_device()):
                cell = tf.contrib.rnn.LSTMCell(self._hps.hidden_dim,
                                               state_is_tuple=True,
                                               initializer=self.rand_unif_init)

                # We need to pass in the previous step's coverage vector each time
                prev_coverage = self.prev_coverage\
                                 if hps.mode=="decode" and self._hps.coverage \
                                 else None

                if self._hps.hier:
                    decoder_outputs, self._dec_out_state, self.attn_dists, self.p_gens, self.coverage, self.attn_dists_sec =\
                      self.attn_decoder(emb_dec_inputs,
                                        self._dec_in_state,
                                        self._enc_states,
                                        cell,
                                        self._enc_sec_states,
                                        num_words_section=self._batch_sections_len,
                                        enc_padding_mask=self._enc_padding_mask,
                                        enc_section_padding_mask=self._enc_section_padding_mask,
                                        initial_state_attention=(self._hps.mode=="decode"),
                                        pointer_gen=self._hps.pointer_gen,
                                        use_coverage=self._hps.coverage,
                                        prev_coverage=prev_coverage,
                                        temperature=self._hps.temperature
                                        )

                else:
                    decoder_outputs, self._dec_out_state, self.attn_dists, self.p_gens, self.coverage, _ =\
                      self.attn_decoder(emb_dec_inputs,
                                        self._dec_in_state,
                                        self._enc_states,
                                        cell,
                                        encoder_section_states=None,
                                        num_words_section=None,
                                        enc_padding_mask=self._enc_padding_mask,
                                        initial_state_attention=(self._hps.mode=="decode"),
                                        pointer_gen=self._hps.pointer_gen,
                                        use_coverage=self._hps.coverage,
                                        prev_coverage=prev_coverage,
                                        )

            # Project decoder output to vocabulary
            with tf.variable_scope('output_projection'), tf.device(
                    self._next_device()):
                if self._hps.output_weight_sharing:
                    # share weights of embedding layer with projection
                    # self.embedding is in shape [vsize, hps.emb_dim]
                    w_proj = tf.get_variable(
                        'w_proj', [self._hps.emb_dim, self._hps.hidden_dim],
                        dtype=tf.float32,
                        initializer=self.trunc_norm_init)
                    w = tf.tanh(tf.transpose(
                        tf.matmul(self.embedding,
                                  w_proj)))  # shape = [vsize, hps.hidden_dim]

                    #         w_t = tf.transpose(w)
                    b = tf.get_variable('b', [vsize],
                                        dtype=tf.float32,
                                        initializer=self.trunc_norm_init)
                else:
                    w = tf.get_variable('w', [self._hps.hidden_dim, vsize],
                                        dtype=tf.float32,
                                        initializer=self.trunc_norm_init)
                    #         w_t = tf.transpose(w)
                    b = tf.get_variable('b', [vsize],
                                        dtype=tf.float32,
                                        initializer=self.trunc_norm_init)
                # vocabulary score at each decoder step
                vocab_scores = []
                for i, output in enumerate(decoder_outputs):
                    if i > 0:
                        tf.get_variable_scope().reuse_variables()
                    vocab_scores.append(tf.nn.xw_plus_b(
                        output, w, b))  # apply the linear layer

                # the final vocab distribution for each decoder time step
                # shape of each element is [batch_size, vsize]
                vocab_dists = [tf.nn.softmax(s) for s in vocab_scores]

            # pointing / generating
            if FLAGS.pointer_gen:
                final_dists = self._calc_final_dist(vocab_dists,
                                                    self.attn_dists)
#         log_dists = [tf.log(dist) for dist in final_dists]
            else:
                #         log_dists = [tf.log(dist) for dist in vocab_dists]
                final_dists = vocab_dists

            # Calculate Losses:

            if self._hps.mode in ['train', 'eval']:
                # Calculate the loss
                with tf.variable_scope('loss'), tf.device(self._next_device()):
                    if FLAGS.pointer_gen:
                        # Calculate the loss per step
                        # This is fiddly; we use tf.gather_nd to pick out the gold target words
                        # will be list length max_dec_steps containing shape (batch_size)
                        loss_per_step = []
                        batch_nums = tf.range(
                            0, limit=hps.batch_size)  # shape (batch_size)
                        for dec_step, dist in enumerate(final_dists):
                            # The indices of the target words. shape (batch_size)
                            targets = self._target_batch[:, dec_step]
                            indices = tf.stack((batch_nums, targets),
                                               axis=1)  # shape (batch_size, 2)
                            # shape (batch_size). loss on this step for each batch
                            gold_probs = tf.gather_nd(dist, indices)
                            losses = -tf.log(gold_probs)
                            loss_per_step.append(losses)

                        # Apply dec_padding_mask mask and get loss
                        self._loss = _mask_and_avg(loss_per_step,
                                                   self._dec_padding_mask)

                    else:  # baseline model
                        # this applies softmax internally
                        self._loss = tf.contrib.seq2seq.sequence_loss(
                            tf.stack(vocab_scores, axis=1), self._target_batch,
                            self._dec_padding_mask
                        )  # this applies softmax internally

                    tf.summary.scalar('loss', self._loss)

                    # Calculate coverage loss from the attention distributions
                    if self._hps.coverage:
                        with tf.variable_scope('coverage_loss'):
                            self._coverage_loss = _coverage_loss(
                                self.attn_dists, self._dec_padding_mask)
                            tf.summary.scalar('coverage_loss',
                                              self._coverage_loss)
                        self._total_loss = self._loss + self._hps.cov_loss_wt * self._coverage_loss
                        tf.summary.scalar('total_loss', self._total_loss)

                # ---------------------------/

        if self._hps.mode == "decode":
            assert len(
                final_dists
            ) == 1  # final_dists is a singleton list containing shape (batch_size, extended_vsize)
            final_dists = final_dists[0]
            topk_probs, self._topk_ids = tf.nn.top_k(
                final_dists, hps.batch_size * 2
            )  # take the k largest probs. note batch_size=beam_size in decode mode
            self._topk_log_probs = tf.log(topk_probs)