Example #1
0
def main():
    parser = argparse.ArgumentParser("Train our ELMo model on SQuAD")
    parser.add_argument("loss_mode", choices=['default', 'confidence'])
    parser.add_argument("output_dir")
    parser.add_argument("--dim", type=int, default=90)
    parser.add_argument("--l2", type=float, default=0)
    parser.add_argument("--mode",
                        choices=["input", "output", "both", "none"],
                        default="both")
    parser.add_argument("--top_layer_only", action="store_true")
    parser.add_argument("--no-tfidf",
                        action='store_true',
                        help="Don't add TF-IDF negative examples")
    args = parser.parse_args()

    out = args.output_dir + "-" + datetime.now().strftime("%m%d-%H%M%S")

    dim = args.dim
    recurrent_layer = CudnnGru(dim, w_init=TruncatedNormal(stddev=0.05))

    if args.loss_mode == 'default':
        n_epochs = 24
        answer_encoder = SingleSpanAnswerEncoder()
        predictor = BoundsPredictor(
            ChainBiMapper(first_layer=recurrent_layer,
                          second_layer=recurrent_layer))
        batcher = ClusteredBatcher(45, ContextLenKey(), False, False)
        data = DocumentQaTrainingData(SquadCorpus(), None, batcher, batcher)
    elif args.loss_mode == 'confidence':
        if args.no_tfidf:
            prepro = SquadDefault()
            n_epochs = 15
        else:
            prepro = SquadTfIdfRanker(NltkPlusStopWords(True), 4, True)
            n_epochs = 50
        answer_encoder = DenseMultiSpanAnswerEncoder()
        predictor = ConfidencePredictor(ChainBiMapper(
            first_layer=recurrent_layer,
            second_layer=recurrent_layer,
        ),
                                        AttentionEncoder(),
                                        FullyConnected(80, activation="tanh"),
                                        aggregate="sum")
        eval_dataset = RandomParagraphSetDatasetBuilder(
            100, 'flatten', True, 0)
        train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True,
                                          False)
        data = PreprocessedData(SquadCorpus(),
                                prepro,
                                StratifyParagraphsBuilder(train_batching, 1),
                                eval_dataset,
                                eval_on_verified=False)
        data.preprocess(1)

    params = trainer.TrainParams(trainer.SerializableOptimizer(
        "Adadelta", dict(learning_rate=1.0)),
                                 ema=0.999,
                                 max_checkpoints_to_keep=2,
                                 async_encoding=10,
                                 num_epochs=n_epochs,
                                 log_period=30,
                                 eval_period=1200,
                                 save_period=1200,
                                 best_weights=("dev", "b17/text-f1"),
                                 eval_samples=dict(dev=None, train=8000))

    lm_reduce = MapperSeq(
        ElmoLayer(args.l2,
                  layer_norm=False,
                  top_layer_only=args.top_layer_only),
        DropoutLayer(0.5),
    )
    model = AttentionWithElmo(
        encoder=DocumentAndQuestionEncoder(answer_encoder),
        lm_model=SquadContextConcatSkip(),
        append_before_atten=(args.mode == "both" or args.mode == "output"),
        append_embed=(args.mode == "both" or args.mode == "input"),
        max_batch_size=128,
        word_embed=FixedWordEmbedder(vec_name="glove.840B.300d",
                                     word_vec_init_scale=0,
                                     learn_unk=False,
                                     cpu=True),
        char_embed=CharWordEmbedder(LearnedCharEmbedder(word_size_th=14,
                                                        char_th=49,
                                                        char_dim=20,
                                                        init_scale=0.05,
                                                        force_cpu=True),
                                    MaxPool(Conv1d(100, 5, 0.8)),
                                    shared_parameters=True),
        embed_mapper=SequenceMapperSeq(
            VariationalDropoutLayer(0.8),
            recurrent_layer,
            VariationalDropoutLayer(0.8),
        ),
        lm_reduce=None,
        lm_reduce_shared=lm_reduce,
        per_sentence=False,
        memory_builder=NullBiMapper(),
        attention=BiAttention(TriLinear(bias=True), True),
        match_encoder=SequenceMapperSeq(
            FullyConnected(dim * 2, activation="relu"),
            ResidualLayer(
                SequenceMapperSeq(
                    VariationalDropoutLayer(0.8),
                    recurrent_layer,
                    VariationalDropoutLayer(0.8),
                    StaticAttentionSelf(TriLinear(bias=True),
                                        ConcatWithProduct()),
                    FullyConnected(dim * 2, activation="relu"),
                )), VariationalDropoutLayer(0.8)),
        predictor=predictor)

    with open(__file__, "r") as f:
        notes = f.read()
        notes = str(sorted(args.__dict__.items(),
                           key=lambda x: x[0])) + "\n" + notes

    trainer.start_training(
        data, model, params,
        [LossEvaluator(),
         SpanEvaluator(bound=[17], text_eval="squad")], ModelDir(out), notes)
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument("squad_path", help="path to squad dev data file")
    parser.add_argument("output_path",
                        help="path where evaluation json file will be written")
    parser.add_argument("--model-path",
                        default="model",
                        help="path to model directory")
    parser.add_argument("--n", type=int, default=None)
    parser.add_argument("-b", "--batch_size", type=int, default=100)
    parser.add_argument("--ema", action="store_true")
    args = parser.parse_args()

    squad_path = args.squad_path
    output_path = args.output_path
    model_dir = ModelDir(args.model_path)
    nltk.data.path.append("nltk_data")

    print("Loading data")
    docs = parse_squad_data(squad_path, "", NltkAndPunctTokenizer(), False)
    pairs = split_docs(docs)
    dataset = ParagraphAndQuestionDataset(
        pairs, ClusteredBatcher(args.batch_size, ContextLenKey(), False, True))

    print("Done, init model")
    model = model_dir.get_model()
    loader = ResourceLoader(lambda a, b: load_word_vector_file(
        join(VEC_DIR, "glove.840B.300d.txt"), b))
    lm_model = model.lm_model
    basedir = join(LM_DIR, "squad-context-concat-skip")
    lm_model.lm_vocab_file = join(basedir,
                                  "squad_train_dev_all_unique_tokens.txt")
    lm_model.options_file = join(
        basedir, "options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json")
    lm_model.weight_file = join(
        basedir,
        "squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5")
    lm_model.embed_weights_file = None

    model.set_inputs([dataset], loader)

    print("Done, building graph")
    sess = tf.Session()
    with sess.as_default():
        pred = model.get_prediction()
    best_span = pred.get_best_span(17)[0]

    all_vars = tf.global_variables() + tf.get_collection(
        tf.GraphKeys.SAVEABLE_OBJECTS)
    dont_restore_names = {
        x.name
        for x in all_vars if x.name.startswith("bilm")
    }
    print(sorted(dont_restore_names))
    vars = [x for x in all_vars if x.name not in dont_restore_names]

    print("Done, loading weights")
    checkpoint = model_dir.get_best_weights()
    if checkpoint is None:
        print("Loading most recent checkpoint")
        checkpoint = model_dir.get_latest_checkpoint()
    else:
        print("Loading best weights")

    saver = tf.train.Saver(vars)
    saver.restore(sess, checkpoint)

    if args.ema:
        ema = tf.train.ExponentialMovingAverage(0)
        saver = tf.train.Saver(
            {ema.average_name(x): x
             for x in tf.trainable_variables()})
        saver.restore(sess, checkpoint)

    sess.run(
        tf.variables_initializer(
            [x for x in all_vars if x.name in dont_restore_names]))

    print("Done, starting evaluation")
    out = {}
    for i, batch in enumerate(dataset.get_epoch()):
        if args.n is not None and i == args.n:
            break
        print("On batch: %d" % (i + 1))
        enc = model.encode(batch, False)
        spans = sess.run(best_span, feed_dict=enc)
        for (s, e), point in zip(spans, batch):
            out[point.question_id] = point.get_original_text(s, e)

    sess.close()

    print("Done, saving")
    with open(output_path, "w") as f:
        json.dump(out, f)

    print("Mission accomplished!")
def main():
    parser = argparse.ArgumentParser(description='Train a model on TriviaQA web')
    parser.add_argument('mode', choices=["paragraph-level", "confidence", "merge",
                                         "shared-norm", "sigmoid", "shared-norm-600"])
    parser.add_argument("name", help="Where to store the model")
    parser.add_argument('-n', '--n_processes', type=int, default=2,
                        help="Number of processes (i.e., select which paragraphs to train on) "
                             "the data with")
    args = parser.parse_args()
    mode = args.mode

    out = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S")

    model = get_model(100, 140, mode, WithIndicators())

    stop = NltkPlusStopWords(True)

    if mode == "paragraph-level":
        extract = ExtractSingleParagraph(MergeParagraphs(400), TopTfIdf(stop, 1), model.preprocessor, intern=True)
    elif mode == "shared-norm-600":
        extract = ExtractMultiParagraphs(MergeParagraphs(600), TopTfIdf(stop, 4), model.preprocessor, intern=True)
    else:
        extract = ExtractMultiParagraphs(MergeParagraphs(400), TopTfIdf(stop, 4), model.preprocessor, intern=True)
    
    if mode == "paragraph-level":
        n_epochs = 16
        train = ParagraphAndQuestionsBuilder(ClusteredBatcher(60, ContextLenBucketedKey(3), True))
        test = ParagraphAndQuestionsBuilder(ClusteredBatcher(60, ContextLenKey(), False))
        n_dev, n_train = 21000, 12000
        eval = [LossEvaluator(), SpanEvaluator([4, 8], "triviaqa")]
    else:
        eval = [LossEvaluator(), MultiParagraphSpanEvaluator(8, "triviaqa", mode != "merge")]
        # we sample two paragraphs per a (question, doc) pair, so evaluate on fewer questions
        n_dev, n_train = 15000, 8000

        if mode == "confidence" or mode == "sigmoid":
            if mode == "sigmoid":
                # Trains very slowly, do this at your own risk
                n_epochs = 71
            else:
                n_epochs = 28
            test = RandomParagraphSetDatasetBuilder(120, "flatten", True, 1)
            train = StratifyParagraphsBuilder(ClusteredBatcher(60, ContextLenBucketedKey(3), True), 0, 1)
        else:
            n_epochs = 14
            test = RandomParagraphSetDatasetBuilder(120, "merge" if mode == "merge" else "group", True, 1)
            train = StratifyParagraphSetsBuilder(35, mode == "merge", True, 1)

    data = TriviaQaWebDataset()

    params = get_triviaqa_train_params(n_epochs, n_dev, n_train)

    data = PreprocessedData(data, extract, train, test, eval_on_verified=False)

    data.preprocess(args.n_processes, 1000)

    with open(__file__, "r") as f:
        notes = f.read()
    notes = "*" * 10 + "\nMode: " + args.mode + "\n" + "*"*10 + "\n" + notes

    trainer.start_training(data, model, params, eval, model_dir.ModelDir(out), notes)
Example #4
0
def main():
    parser = argparse.ArgumentParser(
        description='Train a model on document-level SQuAD')
    parser.add_argument(
        'mode',
        choices=["paragraph", "confidence", "shared-norm", "merge", "sigmoid"])
    parser.add_argument("name", help="Output directory")
    args = parser.parse_args()
    mode = args.mode
    out = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S")

    corpus = SquadCorpus()
    if mode == "merge":
        # Adds paragraph start tokens, since we will be concatenating paragraphs together
        pre = WithIndicators(True, para_tokens=False, doc_start_token=False)
    else:
        pre = None

    model = get_model(50, 100, args.mode, pre)

    if mode == "paragraph":
        # Run in the "standard" known-paragraph setting
        if model.preprocessor is not None:
            raise NotImplementedError()
        n_epochs = 26
        train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True,
                                          False)
        eval_batching = ClusteredBatcher(45, ContextLenKey(), False, False)
        data = DocumentQaTrainingData(corpus, None, train_batching,
                                      eval_batching)
        eval = [LossEvaluator(), SpanEvaluator(bound=[17], text_eval="squad")]
    else:
        eval_set_mode = {
            "confidence": "flatten",
            "sigmoid": "flatten",
            "shared-norm": "group",
            "merge": "merge"
        }[mode]
        eval_dataset = RandomParagraphSetDatasetBuilder(
            100, eval_set_mode, True, 0)

        if mode == "confidence" or mode == "sigmoid":
            if mode == "sigmoid":
                # needs to be trained for a really long time for reasons unknown, even this might be too small
                n_epochs = 100
            else:
                n_epochs = 50  # more epochs since we only "see" the label very other epoch-osh
            train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3),
                                              True, False)
            data = PreprocessedData(
                SquadCorpus(),
                SquadTfIdfRanker(NltkPlusStopWords(True), 4, True,
                                 model.preprocessor),
                StratifyParagraphsBuilder(train_batching, 1),
                eval_dataset,
                eval_on_verified=False,
            )
        else:
            n_epochs = 26
            data = PreprocessedData(
                SquadCorpus(),
                SquadTfIdfRanker(NltkPlusStopWords(True), 4, True,
                                 model.preprocessor),
                StratifyParagraphSetsBuilder(25, args.mode == "merge", True,
                                             1),
                eval_dataset,
                eval_on_verified=False,
            )

        eval = [LossEvaluator(), MultiParagraphSpanEvaluator(17, "squad")]
        data.preprocess(1)

    with open(__file__, "r") as f:
        notes = f.read()
        notes = args.mode + "\n" + notes

    trainer.start_training(data, model, train_params(n_epochs), eval,
                           model_dir.ModelDir(out), notes)
Example #5
0
def main():
    """
    A close-as-possible impelemntation of BiDaF, its based on the `dev` tensorflow 1.1 branch of Ming's repo
    which, in particular, uses Adam not Adadelta. I was not able to replicate the results in paper using Adadelta,
    but with Adam i was able to get to 78.0 F1 on the dev set with this scripts. I believe this approach is
    an exact reproduction up the code in the repo, up to initializations.

    Notes: Exponential Moving Average is very important, as is early stopping. This is also in particualr best run
    on a GPU due to the large number of parameters and batch size involved.
    """
    out = get_output_name_from_cli()

    train_params = TrainParams(SerializableOptimizer(
        "Adam", dict(learning_rate=0.001)),
                               num_epochs=12,
                               ema=0.999,
                               async_encoding=10,
                               log_period=30,
                               eval_period=1000,
                               save_period=1000,
                               eval_samples=dict(dev=None, train=8000))

    # recurrent_layer = BiRecurrentMapper(LstmCellSpec(100, keep_probs=0.8))
    # recurrent_layer = FusedLstm()
    recurrent_layer = SequenceMapperSeq(DropoutLayer(0.8), CudnnLstm(100))

    model = Attention(
        encoder=DocumentAndQuestionEncoder(SingleSpanAnswerEncoder()),
        word_embed=FixedWordEmbedder(vec_name="glove.6B.100d",
                                     word_vec_init_scale=0,
                                     learn_unk=False),
        char_embed=CharWordEmbedder(embedder=LearnedCharEmbedder(16, 49, 8),
                                    layer=ReduceLayer("max",
                                                      Conv1d(100, 5, 0.8),
                                                      mask=False),
                                    shared_parameters=True),
        word_embed_layer=None,
        embed_mapper=SequenceMapperSeq(HighwayLayer(activation="relu"),
                                       HighwayLayer(activation="relu"),
                                       recurrent_layer),
        preprocess=None,
        question_mapper=None,
        context_mapper=None,
        memory_builder=NullBiMapper(),
        attention=BiAttention(TriLinear(bias=True), True),
        match_encoder=NullMapper(),
        predictor=BoundsPredictor(
            ChainConcat(start_layer=SequenceMapperSeq(recurrent_layer,
                                                      recurrent_layer),
                        end_layer=recurrent_layer)),
    )

    with open(__file__, "r") as f:
        notes = f.read()

    eval = [LossEvaluator(), SpanEvaluator(bound=[17], text_eval="squad")]

    corpus = SquadCorpus()
    train_batching = ClusteredBatcher(60, ContextLenBucketedKey(3), True,
                                      False)
    eval_batching = ClusteredBatcher(60, ContextLenKey(), False, False)
    data = DocumentQaTrainingData(corpus, None, train_batching, eval_batching)

    trainer.start_training(data, model, train_params, eval,
                           model_dir.ModelDir(out), notes)
def main():
    parser = argparse.ArgumentParser("Train rejection model on SQuAD")

    parser.add_argument("--corpus_dir", type=str, default="~/data/document-qa")
    parser.add_argument("--output_dir",
                        type=str,
                        default="~/model/document-qa/squad")
    parser.add_argument("--lm_dir", type=str, default="~/data/lm")
    parser.add_argument("--exp_id", type=str, default="rejection")

    parser.add_argument("--lr", type=float, default=0.5)
    parser.add_argument("--epoch", type=int, default=20)

    parser.add_argument("--dim", type=int, default=100)
    parser.add_argument("--batch_size", type=int, default=45)

    parser.add_argument("--l2", type=float, default=0)
    parser.add_argument("--mode",
                        choices=["input", "output", "both", "none"],
                        default="both")
    parser.add_argument("--top_layer_only", action="store_true")

    args = parser.parse_args()

    print("Arguments : ", args)

    out = args.output_dir + "_" + args.exp_id + "_lr" + str(
        args.lr) + "-" + datetime.now().strftime("%m%d-%H%M%S")
    dim = args.dim
    batch_size = args.batch_size
    out = expanduser(out)
    lm_dir = expanduser(args.lm_dir)
    corpus_dir = expanduser(args.corpus_dir)

    print("Make global recurrent_layer...")
    recurrent_layer = CudnnGru(
        dim, w_init=tf.keras.initializers.TruncatedNormal(stddev=0.05))
    params = trainer.TrainParams(trainer.SerializableOptimizer(
        "Adadelta", dict(learning_rate=args.lr)),
                                 ema=0.999,
                                 max_checkpoints_to_keep=2,
                                 async_encoding=10,
                                 num_epochs=args.epoch,
                                 log_period=30,
                                 eval_period=1200,
                                 save_period=1200,
                                 best_weights=("dev", "b17/text-f1"),
                                 eval_samples=dict(dev=None, train=8000))

    lm_reduce = MapperSeq(
        ElmoLayer(args.l2,
                  layer_norm=False,
                  top_layer_only=args.top_layer_only),
        DropoutLayer(0.5),
    )

    model = AttentionWithElmo(
        encoder=DocumentAndQuestionEncoder(SingleSpanAnswerEncoder()),
        lm_model=SquadContextConcatSkip(lm_dir=lm_dir),
        append_before_atten=(args.mode == "both" or args.mode == "output"),
        append_embed=(args.mode == "both" or args.mode == "input"),
        max_batch_size=128,
        word_embed=FixedWordEmbedder(vec_name="glove.840B.300d",
                                     word_vec_init_scale=0,
                                     learn_unk=False,
                                     cpu=True),
        char_embed=CharWordEmbedder(LearnedCharEmbedder(word_size_th=14,
                                                        char_th=49,
                                                        char_dim=20,
                                                        init_scale=0.05,
                                                        force_cpu=True),
                                    MaxPool(Conv1d(100, 5, 0.8)),
                                    shared_parameters=True),
        embed_mapper=SequenceMapperSeq(
            VariationalDropoutLayer(0.8),
            recurrent_layer,
            VariationalDropoutLayer(0.8),
        ),
        lm_reduce=None,
        lm_reduce_shared=lm_reduce,
        per_sentence=False,
        memory_builder=NullBiMapper(),
        attention=BiAttention(TriLinear(bias=True), True),
        match_encoder=SequenceMapperSeq(
            FullyConnected(dim * 2, activation="relu"),
            ResidualLayer(
                SequenceMapperSeq(
                    VariationalDropoutLayer(0.8),
                    recurrent_layer,
                    VariationalDropoutLayer(0.8),
                    StaticAttentionSelf(TriLinear(bias=True),
                                        ConcatWithProduct()),
                    FullyConnected(dim * 2, activation="relu"),
                )), VariationalDropoutLayer(0.8)),
        predictor=BoundsPredictor(
            ChainBiMapper(first_layer=recurrent_layer,
                          second_layer=recurrent_layer)))

    batcher = ClusteredBatcher(batch_size, ContextLenKey(), False, False)
    data = DocumentQaTrainingData(SquadCorpus(corpus_dir), None, batcher,
                                  batcher)

    with open(__file__, "r") as f:
        notes = f.read()
        notes = str(sorted(args.__dict__.items(),
                           key=lambda x: x[0])) + "\n" + notes

    trainer.start_training(
        data, model, params,
        [LossEvaluator(),
         SpanEvaluator(bound=[17], text_eval="squad")], ModelDir(out), notes)
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument("input_data")
    parser.add_argument("output_data")

    parser.add_argument("--plot_dir", type=str, default=None)

    parser.add_argument("--model_dir", type=str, default="/tmp/model/document-qa")
    parser.add_argument("--lm_dir", type=str, default="/home/castle/data/lm/squad-context-concat-skip")
    parser.add_argument("--glove_dir", type=str, default="/home/castle/data/glove")

    parser.add_argument("--n", type=int, default=None)
    parser.add_argument("-b", "--batch_size", type=int, default=30)
    parser.add_argument("--ema", action="store_true")
    args = parser.parse_args()

    input_data = args.input_data
    output_path = args.output_data
    model_dir = ModelDir(args.model_dir)
    nltk.data.path.append("nltk_data")

    print("Loading data")
    docs = parse_squad_data(input_data, "", NltkAndPunctTokenizer(), False)
    pairs = split_docs(docs)
    dataset = ParagraphAndQuestionDataset(pairs, ClusteredBatcher(args.batch_size, ContextLenKey(), False, True))

    print("Done, init model")
    model = model_dir.get_model()
    # small hack, just load the vector file at its expected location rather then using the config location
    loader = ResourceLoader(lambda a, b: load_word_vector_file(join(args.glove_dir, "glove.840B.300d.txt"), b))
    lm_model = model.lm_model
    basedir = args.lm_dir
    plotdir = args.plot_dir

    lm_model.lm_vocab_file = join(basedir, "squad_train_dev_all_unique_tokens.txt")
    lm_model.options_file = join(basedir, "options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json")
    lm_model.weight_file = join(basedir, "squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5")
    lm_model.embed_weights_file = None

    model.set_inputs([dataset], loader)

    print("Done, building graph")
    sess = tf.Session()
    with sess.as_default():
        pred = model.get_prediction()
    best_span = pred.get_best_span(17)[0]

    if plotdir != None:
        start_logits_op, end_logits_op = pred.get_logits()

    all_vars = tf.global_variables() + tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
    dont_restore_names = {x.name for x in all_vars if x.name.startswith("bilm")}
    print(sorted(dont_restore_names))
    vars = [x for x in all_vars if x.name not in dont_restore_names]

    print("Done, loading weights")
    checkpoint = model_dir.get_best_weights()
    if checkpoint is None:
        print("Loading most recent checkpoint")
        checkpoint = model_dir.get_latest_checkpoint()
    else:
        print("Loading best weights")

    saver = tf.train.Saver(vars)
    saver.restore(sess, checkpoint)

    if args.ema:
        ema = tf.train.ExponentialMovingAverage(0)
        saver = tf.train.Saver({ema.average_name(x): x for x in tf.trainable_variables()})
        saver.restore(sess, checkpoint)

    sess.run(tf.variables_initializer([x for x in all_vars if x.name in dont_restore_names]))

    print("Done, starting evaluation")
    out = {}
    for i, batch in enumerate(dataset.get_epoch()):
        if args.n is not None and i == args.n:
            break
        print("On batch size [%d], now in %d th batch" % (args.batch_size, i +1))
        enc = model.encode(batch, False)
        if plotdir != None:
            spans, start_logits, end_logits = sess.run([best_span, start_logits_op, end_logits_op], feed_dict=enc)
            for bi, point in enumerate(batch):
                q = ' '.join(point.question)
                c = point.paragraph.get_context()
                gt = ' | '.join(point.answer.answer_text)
                s, e = spans[bi]
                pred = point.get_original_text(s, e)
                start_dist = start_logits[bi]
                end_dist = end_logits[bi]
                c_interval = np.arange(0.0, start_dist.shape[0], 1)
                c_label = c
                plt.figure(1)
                plt.subplot(211)
                plt.plot(c_interval, start_dist, color='r')
                plt.title("Q : " + q + " // A : " + gt, fontsize=9)
                plt.text(0, 0, r'Predict : %s [%d:%d]' % (pred, s, e), color='b')
                axes = plt.gca()
                axes.set_ylim([-20, 20])

                plt.subplot(212)
                plt.plot(c_interval, end_dist, color='g')
                plt.xticks(c_interval, c_label, rotation=90, fontsize=5)
                axes = plt.gca()
                axes.set_ylim([-20, 20])
                plt.show()

            break
        else:
            spans = sess.run(best_span, feed_dict=enc)

        for (s, e), point in zip(spans, batch):
            out[point.question_id] = point.get_original_text(s, e)

    sess.close()

    print("Done, saving")
    with open(output_path, "w") as f:
        json.dump(out, f)

    print("Mission accomplished!")
Example #8
0
def main():
    parser = argparse.ArgumentParser(description='Evaluate a model on SQuAD')
    parser.add_argument('model', help='model directory to evaluate')
    parser.add_argument("-o", "--official_output", type=str, help="where to output an official result file")
    parser.add_argument('-n', '--sample_questions', type=int, default=None,
                        help="(for testing) run on a subset of questions")
    parser.add_argument('--answer_bounds', nargs='+', type=int, default=[17],
                        help="Max size of answer")
    parser.add_argument('-b', '--batch_size', type=int, default=45,
                        help="Batch size, larger sizes can be faster but uses more memory")
    parser.add_argument('-s', '--step', default=None,
                        help="Weights to load, can be a checkpoint step or 'latest'")
    parser.add_argument('-c', '--corpus', choices=["dev", "train"], default="dev")
    parser.add_argument('--no_ema', action="store_true", help="Don't use EMA weights even if they exist")
    args = parser.parse_args()

    num_choices = 4

    model_dir = ModelDir(args.model)

    corpus = SquadCorpus()
    if args.corpus == "dev":
        questions = corpus.get_dev()
    else:
        questions = corpus.get_train()
    questions = split_docs(questions)

    if args.sample_questions:
        np.random.RandomState(0).shuffle(sorted(questions, key=lambda x: x.question_id))
        questions = questions[:args.sample_questions]


    questions.sort(key=lambda x:x.n_context_words, reverse=True)
    #pdb.set_trace()
    #print(args.batch_size)
    #dataset = ParagraphAndQuestionDataset(questions, FixedOrderBatcher(args.batch_size, False),None,num_choices)


    dataset = ParagraphAndQuestionDataset(questions, ClusteredBatcher(45, ContextLenKey(), False, False),None,num_choices)
    
    #ClusteredBatcher(45, ContextLenKey(), False, False)

    evaluators = [MultiChoiceEvaluator(num_choices)]
    #if args.official_output is not None:
        #evaluators.append(RecordSpanPrediction(args.answer_bounds[0]))
    #pdb.set_trace()
    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    model = model_dir.get_model()
    #pdb.set_trace()
    evaluation = trainer.test(model, evaluators, {args.corpus: dataset},
                              corpus.get_resource_loader(), checkpoint, not args.no_ema)[args.corpus]
    
    #pdb.set_trace()
    
    # Print the scalar results in a two column table
    scalars = evaluation.scalars
    cols = list(sorted(scalars.keys()))
    table = [cols]
    header = ["Metric", ""]
    table.append([("%s" % scalars[x] if x in scalars else "-") for x in cols])
    print_table([header] + transpose_lists(table))

    # Save the official output
    if args.official_output is not None:
        data_to_dump = {}

        list_of_choices = ['A','B','C','D']

        q_ids = evaluation.per_sample["question_id"]
        correct_ans = evaluation.per_sample["correct answer"]
        correct_ids = evaluation.per_sample["correct index"]
        pred_ids = evaluation.per_sample["predictied index"]
        pred_ans = evaluation.per_sample["predictied answer"]
        is_correct  = evaluation.per_sample["is correct"]
        #pdb.set_trace()
        for ix, q_ids in enumerate(q_ids):
            if(is_correct[ix]):
                data_to_dump[q_ids] = {'Is Correct' : 'True',
                 'predictied' : [' '.join(pred_ans[ix]),list_of_choices[pred_ids[ix]]],
                 'correct' : [' '.join(correct_ans[ix]),list_of_choices[correct_ids[ix]]]
                } 
            else:
                data_to_dump[q_ids] = {'Is Correct' : 'False',
                 'predictied' : [' '.join(pred_ans[ix]),list_of_choices[pred_ids[ix]]],
                 'correct' : [' '.join(correct_ans[ix]),list_of_choices[correct_ids[ix]]]
                } 
        #pdb.set_trace()
        with open(args.official_output, "w") as f:
            json.dump(data_to_dump , f)