Example #1
def main():
    parser = argparse.ArgumentParser(description='Evaluate a model on SQuAD')
    parser.add_argument('model', help='model directory to evaluate')
    parser.add_argument("-o", "--official_output", type=str,
                        help="where to output an official result file")
    parser.add_argument('-n', '--sample_questions', type=int, default=None,
                        help="(for testing) run on a subset of questions")
    parser.add_argument('--answer_bounds', nargs='+', type=int, default=[17],
                        help="Max size of answer")
    parser.add_argument('-b', '--batch_size', type=int, default=200,
                        help="Batch size, larger sizes can be faster but uses more memory")
    parser.add_argument('-s', '--step', default=None,
                        help="Weights to load, can be a checkpoint step or 'latest'")
    # Add ja_test choice to test Multilingual QA dataset.
    parser.add_argument(
        '-c', '--corpus', choices=["dev", "train", "ja_test", "pred"], default="dev")
    parser.add_argument('--no_ema', action="store_true",
                        help="Don't use EMA weights even if they exist")
    # Add ja_test choice to test Multilingual QA pipeline.
    parser.add_argument('-p', '--pred_filepath', default=None,
                        help="The csv file path if you try pred mode")
    args = parser.parse_args()

    model_dir = ModelDir(args.model)

    corpus = SquadCorpus()
    if args.corpus == "dev":
        questions = corpus.get_dev()
    # Add ja_test choice to test Multilingual QA pipeline.
    elif args.corpus == "ja_test":
        questions = corpus.get_ja_test()
    # This is for prediction mode for MLQA pipeline.
    elif args.corpus == "pred":
        questions = create_pred_dataset(args.pred_filepath)
    else:
        questions = corpus.get_train()
    questions = split_docs(questions)

    if args.sample_questions:
        # Sort for a stable order, then shuffle with a fixed seed so the sampled
        # subset is reproducible across runs.
        questions = sorted(questions, key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(questions)
        questions = questions[:args.sample_questions]

    questions.sort(key=lambda x: x.n_context_words, reverse=True)
    dataset = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    evaluators = [SpanEvaluator(args.answer_bounds, text_eval="squad")]
    if args.official_output is not None:
        evaluators.append(RecordSpanPrediction(args.answer_bounds[0]))

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    model = model_dir.get_model()

    evaluation = trainer.test(model, evaluators, {args.corpus: dataset},
                              corpus.get_resource_loader(), checkpoint, not args.no_ema)[args.corpus]

    # Print the scalar results in a two column table
    scalars = evaluation.scalars
    cols = list(sorted(scalars.keys()))
    table = [cols]
    header = ["Metric", ""]
    table.append([("%s" % scalars[x] if x in scalars else "-") for x in cols])
    print_table([header] + transpose_lists(table))

    # Save the official output
    if args.official_output is not None:
        quid_to_para = {}
        for x in questions:
            quid_to_para[x.question_id] = x.paragraph

        q_id_to_answers = {}
        q_ids = evaluation.per_sample["question_id"]
        spans = evaluation.per_sample["predicted_span"]
        for q_id, (start, end) in zip(q_ids, spans):
            text = quid_to_para[q_id].get_original_text(start, end)
            q_id_to_answers[q_id] = text

        with open(args.official_output, "w") as f:
            json.dump(q_id_to_answers, f)
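
main() above is meant to be run as a command-line script; the excerpt omits the module-level entry point, so here is a minimal guard plus an illustrative invocation (the script name and paths are assumptions, not taken from the source):

if __name__ == "__main__":
    main()

# Illustrative command line (script name and paths are assumptions):
#   python squad_eval.py /path/to/model-dir -c dev -b 200 -o official_output.json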

Example #2
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument("squad_path", help="path to squad dev data file")
    parser.add_argument("output_path",
                        help="path where evaluation json file will be written")
    parser.add_argument("--model-path",
                        default="model",
                        help="path to model directory")
    parser.add_argument("--n", type=int, default=None)
    parser.add_argument("-b", "--batch_size", type=int, default=100)
    parser.add_argument("--ema", action="store_true")
    args = parser.parse_args()

    squad_path = args.squad_path
    output_path = args.output_path
    model_dir = ModelDir(args.model_path)
    nltk.data.path.append("nltk_data")

    print("Loading data")
    docs = parse_squad_data(squad_path, "", NltkAndPunctTokenizer(), False)
    pairs = split_docs(docs)
    dataset = ParagraphAndQuestionDataset(
        pairs, ClusteredBatcher(args.batch_size, ContextLenKey(), False, True))

    print("Done, init model")
    model = model_dir.get_model()
    loader = ResourceLoader(lambda a, b: load_word_vector_file(
        join(VEC_DIR, "glove.840B.300d.txt"), b))
    lm_model = model.lm_model
    basedir = join(LM_DIR, "squad-context-concat-skip")
    lm_model.lm_vocab_file = join(basedir,
                                  "squad_train_dev_all_unique_tokens.txt")
    lm_model.options_file = join(
        basedir, "options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json")
    lm_model.weight_file = join(
        basedir,
        "squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5")
    lm_model.embed_weights_file = None

    model.set_inputs([dataset], loader)

    print("Done, building graph")
    sess = tf.Session()
    with sess.as_default():
        pred = model.get_prediction()
    best_span = pred.get_best_span(17)[0]

    all_vars = tf.global_variables() + tf.get_collection(
        tf.GraphKeys.SAVEABLE_OBJECTS)
    dont_restore_names = {
        x.name
        for x in all_vars if x.name.startswith("bilm")
    }
    print(sorted(dont_restore_names))
    vars = [x for x in all_vars if x.name not in dont_restore_names]

    print("Done, loading weights")
    checkpoint = model_dir.get_best_weights()
    if checkpoint is None:
        print("Loading most recent checkpoint")
        checkpoint = model_dir.get_latest_checkpoint()
    else:
        print("Loading best weights")

    saver = tf.train.Saver(vars)
    saver.restore(sess, checkpoint)

    if args.ema:
        ema = tf.train.ExponentialMovingAverage(0)
        saver = tf.train.Saver(
            {ema.average_name(x): x
             for x in tf.trainable_variables()})
        saver.restore(sess, checkpoint)

    sess.run(
        tf.variables_initializer(
            [x for x in all_vars if x.name in dont_restore_names]))

    print("Done, starting evaluation")
    out = {}
    for i, batch in enumerate(dataset.get_epoch()):
        if args.n is not None and i == args.n:
            break
        print("On batch: %d" % (i + 1))
        enc = model.encode(batch, False)
        spans = sess.run(best_span, feed_dict=enc)
        for (s, e), point in zip(spans, batch):
            out[point.question_id] = point.get_original_text(s, e)

    sess.close()

    print("Done, saving")
    with open(output_path, "w") as f:
        json.dump(out, f)

    print("Mission accomplished!")

Example #3
def main():
    data = split_docs(SquadCorpus().get_train())
    np.random.shuffle(data)
    for point in data:
        print(" ".join(point.question))

Example #4
def main():
    parser = argparse.ArgumentParser(description='Evaluate a model on SQuAD')
    parser.add_argument('model', help='model directory to evaluate')
    parser.add_argument("-o", "--official_output", type=str, help="where to output an official result file")
    parser.add_argument('-n', '--sample_questions', type=int, default=None,
                        help="(for testing) run on a subset of questions")
    parser.add_argument('--answer_bounds', nargs='+', type=int, default=[17],
                        help="Max size of answer")
    parser.add_argument('-b', '--batch_size', type=int, default=200,
                        help="Batch size, larger sizes can be faster but uses more memory")
    parser.add_argument('-s', '--step', default=None,
                        help="Weights to load, can be a checkpoint step or 'latest'")
    parser.add_argument('-c', '--corpus', choices=["dev", "train"], default="dev")
    parser.add_argument('--no_ema', action="store_true", help="Don't use EMA weights even if they exist")
    parser.add_argument('--none_prob', action="store_true", help="Output none probability for samples")
    parser.add_argument('--elmo', action="store_true", help="Use elmo model")
    parser.add_argument('--per_question_loss_file', type=str, default=None,
                        help="Run question by question and write a question_id -> per-question result json to this file")
    args = parser.parse_known_args()[0]

    model_dir = ModelDir(args.model)

    corpus = SquadCorpus()
    if args.corpus == "dev":
        questions = corpus.get_dev()
    else:
        questions = corpus.get_train()
    questions = split_docs(questions)

    if args.sample_questions:
        # Sort for a stable order, then shuffle with a fixed seed so the sampled subset is reproducible.
        questions = sorted(questions, key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(questions)
        questions = questions[:args.sample_questions]

    questions.sort(key=lambda x: x.n_context_words, reverse=True)
    dataset = ParagraphAndQuestionDataset(questions, FixedOrderBatcher(args.batch_size, True))

    evaluators = [SpanEvaluator(args.answer_bounds, text_eval="squad")]
    if args.official_output is not None:
        evaluators.append(RecordSpanPrediction(args.answer_bounds[0]))
    if args.per_question_loss_file is not None:
        evaluators.append(RecordSpanPredictionScore(args.answer_bounds[0], args.batch_size, args.none_prob))

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    model = model_dir.get_model()
    if args.elmo:
        model.lm_model.lm_vocab_file = './elmo-params/squad_train_dev_all_unique_tokens.txt'
        model.lm_model.options_file = './elmo-params/options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json'
        model.lm_model.weight_file = './elmo-params/squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5'
        model.lm_model.embed_weights_file = None


    evaluation = trainer.test(model, evaluators, {args.corpus: dataset},
                              corpus.get_resource_loader(), checkpoint, not args.no_ema)[args.corpus]

    # Print the scalar results in a two column table
    scalars = evaluation.scalars
    cols = list(sorted(scalars.keys()))
    table = [cols]
    header = ["Metric", ""]
    table.append([("%s" % scalars[x] if x in scalars else "-") for x in cols])
    print_table([header] + transpose_lists(table))

    # Save the official output
    if args.official_output is not None:
        quid_to_para = {}
        for x in questions:
            quid_to_para[x.question_id] = x.paragraph

        q_id_to_answers = {}
        q_ids = evaluation.per_sample["question_id"]
        spans = evaluation.per_sample["predicted_span"]
        for q_id, (start, end) in zip(q_ids, spans):
            text = quid_to_para[q_id].get_original_text(start, end)
            q_id_to_answers[q_id] = text

        with open(args.official_output, "w") as f:
            json.dump(q_id_to_answers, f)

    if args.per_question_loss_file is not None:
        print("Saving result")
        output_file = args.per_question_loss_file
        ids = evaluation.per_sample["question_ids"]
        f1s = evaluation.per_sample["text_f1"]
        ems = evaluation.per_sample["text_em"]
        losses = evaluation.per_sample["loss"]

        if args.none_prob:
            none_probs = evaluation.per_sample["none_probs"]
            # Alternatively, the full per-question metrics could be stored alongside the none probability:
            # results = {question_id: {'f1': float(f1), 'em': float(em), 'loss': float(loss),
            #                          'none_prob': float(none_prob)}
            #            for question_id, f1, em, loss, none_prob in zip(ids, f1s, ems, losses, none_probs)}
            results = {question_id: float(none_prob)
                       for question_id, none_prob in zip(ids, none_probs)}
        else:
            results = {question_id: {'f1': float(f1), 'em': float(em), 'loss': float(loss)}
                       for question_id, f1, em, loss in zip(ids, f1s, ems, losses)}


        with open(output_file, 'w') as f:
            json.dump(results, f)
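
As a quick check of the per-question file written above, the sketch below (the file name is illustrative) reloads it and averages the F1/EM scores; it assumes the run was made without --none_prob, so each value is a metrics dict:

import json

with open("per_question_loss.json") as f:  # whatever was passed to --per_question_loss_file
    results = json.load(f)

mean_f1 = sum(r["f1"] for r in results.values()) / len(results)
mean_em = sum(r["em"] for r in results.values()) / len(results)
print("mean F1: %.4f, mean EM: %.4f" % (mean_f1, mean_em))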

Example #5
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument("input_data")
    parser.add_argument("output_data")

    parser.add_argument("--plot_dir", type=str, default=None)

    parser.add_argument("--model_dir", type=str, default="/tmp/model/document-qa")
    parser.add_argument("--lm_dir", type=str, default="/home/castle/data/lm/squad-context-concat-skip")
    parser.add_argument("--glove_dir", type=str, default="/home/castle/data/glove")

    parser.add_argument("--n", type=int, default=None)
    parser.add_argument("-b", "--batch_size", type=int, default=30)
    parser.add_argument("--ema", action="store_true")
    args = parser.parse_args()

    input_data = args.input_data
    output_path = args.output_data
    model_dir = ModelDir(args.model_dir)
    nltk.data.path.append("nltk_data")

    print("Loading data")
    docs = parse_squad_data(input_data, "", NltkAndPunctTokenizer(), False)
    pairs = split_docs(docs)
    dataset = ParagraphAndQuestionDataset(pairs, ClusteredBatcher(args.batch_size, ContextLenKey(), False, True))

    print("Done, init model")
    model = model_dir.get_model()
    # small hack, just load the vector file at its expected location rather than using the config location
    loader = ResourceLoader(lambda a, b: load_word_vector_file(join(args.glove_dir, "glove.840B.300d.txt"), b))
    lm_model = model.lm_model
    basedir = args.lm_dir
    plotdir = args.plot_dir

    lm_model.lm_vocab_file = join(basedir, "squad_train_dev_all_unique_tokens.txt")
    lm_model.options_file = join(basedir, "options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json")
    lm_model.weight_file = join(basedir, "squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5")
    lm_model.embed_weights_file = None

    model.set_inputs([dataset], loader)

    print("Done, building graph")
    sess = tf.Session()
    with sess.as_default():
        pred = model.get_prediction()
    best_span = pred.get_best_span(17)[0]

    if plotdir is not None:
        start_logits_op, end_logits_op = pred.get_logits()

    all_vars = tf.global_variables() + tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
    dont_restore_names = {x.name for x in all_vars if x.name.startswith("bilm")}
    print(sorted(dont_restore_names))
    vars = [x for x in all_vars if x.name not in dont_restore_names]

    print("Done, loading weights")
    checkpoint = model_dir.get_best_weights()
    if checkpoint is None:
        print("Loading most recent checkpoint")
        checkpoint = model_dir.get_latest_checkpoint()
    else:
        print("Loading best weights")

    saver = tf.train.Saver(vars)
    saver.restore(sess, checkpoint)

    if args.ema:
        ema = tf.train.ExponentialMovingAverage(0)
        saver = tf.train.Saver({ema.average_name(x): x for x in tf.trainable_variables()})
        saver.restore(sess, checkpoint)

    sess.run(tf.variables_initializer([x for x in all_vars if x.name in dont_restore_names]))

    print("Done, starting evaluation")
    out = {}
    for i, batch in enumerate(dataset.get_epoch()):
        if args.n is not None and i == args.n:
            break
        print("On batch size [%d], now in %d th batch" % (args.batch_size, i +1))
        enc = model.encode(batch, False)
        if plotdir is not None:
            spans, start_logits, end_logits = sess.run([best_span, start_logits_op, end_logits_op], feed_dict=enc)
            for bi, point in enumerate(batch):
                q = ' '.join(point.question)
                c = point.paragraph.get_context()
                gt = ' | '.join(point.answer.answer_text)
                s, e = spans[bi]
                pred = point.get_original_text(s, e)
                start_dist = start_logits[bi]
                end_dist = end_logits[bi]
                c_interval = np.arange(0.0, start_dist.shape[0], 1)
                c_label = c
                plt.figure(1)
                plt.subplot(211)
                plt.plot(c_interval, start_dist, color='r')
                plt.title("Q : " + q + " // A : " + gt, fontsize=9)
                plt.text(0, 0, r'Predict : %s [%d:%d]' % (pred, s, e), color='b')
                axes = plt.gca()
                axes.set_ylim([-20, 20])

                plt.subplot(212)
                plt.plot(c_interval, end_dist, color='g')
                plt.xticks(c_interval, c_label, rotation=90, fontsize=5)
                axes = plt.gca()
                axes.set_ylim([-20, 20])
                plt.show()

            break
        else:
            spans = sess.run(best_span, feed_dict=enc)

        for (s, e), point in zip(spans, batch):
            out[point.question_id] = point.get_original_text(s, e)

    sess.close()

    print("Done, saving")
    with open(output_path, "w") as f:
        json.dump(out, f)

    print("Mission accomplished!")