Example #1
def show_squad_errors(answers):
    print("Loading answers..")
    answer_df = pd.read_csv(answers)

    print("Loading questions..")
    corpus = SquadCorpus()
    questions = {}
    docs = {}
    for doc in corpus.get_dev():
        for para in doc.paragraphs:
            for q in para.questions:
                questions[q.question_id] = q
                docs[q.question_id] = doc

    answer_df.sort_values(["question_id", "rank"], inplace=True)
    grouped = list(answer_df.groupby(["question_id"]))
    np.random.shuffle(grouped)

    for question_id, group in grouped:
        q = questions[question_id]
        doc = docs[question_id]
        cur_best_score = group.text_f1.iloc[0]
        cur_best_conf = group.predicted_score.iloc[0]
        cur_best_ix = group.index[0]
        for i in range(1, len(group)):
            ix = group.index[i]
            conf = group.predicted_score[ix]
            if conf > cur_best_conf:
                score = group.text_f1[ix]
                if score < cur_best_score:
                    # We hurt ourselves!
                    print("Oh no!")
                    print(" ".join(q.words))
                    print(q.answer.answer_text)
                    print("Best score was %.4f (conf=%.4f), but not is %.4f (conf=%.4f)" % (
                        cur_best_score, cur_best_conf, score, conf
                    ))
                    cur_para = doc.paragraphs[group.para_number[cur_best_ix]]
                    new_para = doc.paragraphs[group.para_number[ix]]

                    p1_s, p1_e = group.predicted_start[cur_best_ix], group.predicted_end[cur_best_ix]
                    p2_s, p2_e = group.predicted_start[ix], group.predicted_end[ix]

                    print(" ".join(display_para(cur_para.get_context(), None, q.words, p1_s, p1_e)))
                    print()
                    print(" ".join(display_para(new_para.get_context(), None, q.words, p2_s, p2_e)))
                    input()
                else:
                    cur_best_score = score
                    cur_best_ix = ix
                    cur_best_conf = conf
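show_squad_errors above walks a ranked-answer dump and pauses on every case where a higher-confidence paragraph produced a worse answer. Below is a minimal sketch of the CSV it expects; the column names are exactly the ones the function reads, while the file name and the example values are invented for illustration.

# Hypothetical sketch of the answer CSV consumed by show_squad_errors().
# Column names come from the function above; values and the file name are invented.
import pandas as pd

rows = [
    {"question_id": "q1", "rank": 0, "para_number": 3,
     "predicted_start": 12, "predicted_end": 14,
     "predicted_score": 7.1, "text_f1": 1.0},
    {"question_id": "q1", "rank": 1, "para_number": 0,
     "predicted_start": 40, "predicted_end": 41,
     "predicted_score": 8.3, "text_f1": 0.0},
]
pd.DataFrame(rows).to_csv("dev_answers.csv", index=False)
# show_squad_errors("dev_answers.csv")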
Example #2
def main():
    corpus = SquadCorpus()
    prepro = SquadTfIdfRanker(NltkPlusStopWords(True), OPTS.num_per_orig, True)
    orig_data = corpus.get_train() if OPTS.split == 'train' else corpus.get_dev()
    orig_lens = [
        len(p.text[0]) for doc in orig_data for p in doc.paragraphs
        for q in p.questions
    ]
    new_data = preprocess_par(orig_data,
                              corpus.evidence,
                              prepro,
                              n_processes=1)
    new_lens = [len(p.text) for q in new_data for p in q.paragraphs]
    print('%d original, mean %.2f words' %
          (len(orig_lens), np.mean(orig_lens)))
    print('%d new, mean %.2f words' % (len(new_lens), np.mean(new_lens)))
    if OPTS.out_file:
        write_output(OPTS.split, new_data, OPTS.out_file)
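This main() preprocesses SQuAD with the TF-IDF paragraph ranker and reports the mean paragraph length before and after. It relies on a module-level OPTS object; a hedged sketch of how it might be built follows, where the flag names come from the attributes used above and the defaults are assumptions.

# Hedged sketch of the OPTS namespace this example assumes; flag names match
# the attributes referenced above, defaults are guesses.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--split", choices=["train", "dev"], default="dev")
parser.add_argument("--num_per_orig", type=int, default=4)
parser.add_argument("--out_file", default=None)
OPTS = parser.parse_args()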
Example #3
def main():
  corpus = SquadCorpus()
  if OPTS.normalize_before_ranking:
      normalizer = WordNormalizer()
  else:
      normalizer = None
  if OPTS.use_vec_dist:
    word_vecs = corpus.get_pruned_word_vecs('glove.840B.300d')
    prepro = SquadVectorTfIdfRanker(NltkPlusStopWords(True), OPTS.num_per_orig, True, word_vecs, word_normalizer=normalizer)
  else:
    prepro = SquadTfIdfRanker(NltkPlusStopWords(True), OPTS.num_per_orig, True, word_normalizer=normalizer)
  orig_data = corpus.get_train() if OPTS.split == 'train' else corpus.get_dev()
  orig_lens = [len(p.text[0]) for doc in orig_data for p in doc.paragraphs
               for q in p.questions] 
  new_data = preprocess_par(orig_data, corpus.evidence, prepro, n_processes=1)
  new_lens = [len(p.text) for q in new_data for p in q.paragraphs]
  print('%d original, mean %.2f words' % (len(orig_lens), np.mean(orig_lens)))
  print('%d new, mean %.2f words' % (len(new_lens), np.mean(new_lens)))
  if OPTS.out_file:
    write_output(OPTS.split, new_data, OPTS.out_file)
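This variant extends the same preprocessing with optional word normalization and GloVe-based vector-distance ranking. On top of the OPTS sketch shown after Example #2, it would additionally need the two boolean flags it reads; their names are taken from the attributes above, and the store_true behaviour is an assumption.

# Extra flags assumed by this variant (names from the attributes above):
parser.add_argument("--normalize_before_ranking", action="store_true")
parser.add_argument("--use_vec_dist", action="store_true")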
Example #4
def main():
    parser = argparse.ArgumentParser(description='Evaluate a model on SQuAD')
    parser.add_argument('model', help='model directory to evaluate')
    parser.add_argument("-o", "--official_output", type=str,
                        help="where to output an official result file")
    parser.add_argument('-n', '--sample_questions', type=int, default=None,
                        help="(for testing) run on a subset of questions")
    parser.add_argument('--answer_bounds', nargs='+', type=int, default=[17],
                        help="Max size of answer")
    parser.add_argument('-b', '--batch_size', type=int, default=200,
                        help="Batch size, larger sizes can be faster but uses more memory")
    parser.add_argument('-s', '--step', default=None,
                        help="Weights to load, can be a checkpoint step or 'latest'")
    # Add ja_test choice to test Multilingual QA dataset.
    parser.add_argument(
        '-c', '--corpus', choices=["dev", "train", "ja_test", "pred"], default="dev")
    parser.add_argument('--no_ema', action="store_true",
                        help="Don't use EMA weights even if they exist")
    # Add ja_test choice to test Multilingual QA pipeline.
    parser.add_argument('-p', '--pred_filepath', default=None,
                        help="The csv file path if you try pred mode")
    args = parser.parse_args()

    model_dir = ModelDir(args.model)

    corpus = SquadCorpus()
    if args.corpus == "dev":
        questions = corpus.get_dev()
    # Add ja_test choice to test Multilingual QA pipeline.
    elif args.corpus == "ja_test":
        questions = corpus.get_ja_test()
    # This is for prediction mode for MLQA pipeline.
    elif args.corpus == "pred":
        questions = create_pred_dataset(args.pred_filepath)
    else:
        questions = corpus.get_train()
    questions = split_docs(questions)

    if args.sample_questions:
        # Deterministic sampling: sort for a stable order, shuffle with a fixed seed, keep a prefix.
        questions = sorted(questions, key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(questions)
        questions = questions[:args.sample_questions]

    questions.sort(key=lambda x: x.n_context_words, reverse=True)
    dataset = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    evaluators = [SpanEvaluator(args.answer_bounds, text_eval="squad")]
    if args.official_output is not None:
        evaluators.append(RecordSpanPrediction(args.answer_bounds[0]))

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    model = model_dir.get_model()

    evaluation = trainer.test(model, evaluators, {args.corpus: dataset},
                              corpus.get_resource_loader(), checkpoint, not args.no_ema)[args.corpus]

    # Print the scalar results in a two column table
    scalars = evaluation.scalars
    cols = list(sorted(scalars.keys()))
    table = [cols]
    header = ["Metric", ""]
    table.append([("%s" % scalars[x] if x in scalars else "-") for x in cols])
    print_table([header] + transpose_lists(table))

    # Save the official output
    if args.official_output is not None:
        quid_to_para = {}
        for x in questions:
            quid_to_para[x.question_id] = x.paragraph

        q_id_to_answers = {}
        q_ids = evaluation.per_sample["question_id"]
        spans = evaluation.per_sample["predicted_span"]
        for q_id, (start, end) in zip(q_ids, spans):
            text = quid_to_para[q_id].get_original_text(start, end)
            q_id_to_answers[q_id] = text

        with open(args.official_output, "w") as f:
            json.dump(q_id_to_answers, f)
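This example is the document-qa SQuAD evaluation entry point, extended with a ja_test corpus and a pred mode for the Multilingual QA (MLQA) pipeline. A hedged example invocation, where the script name and paths are assumptions and the flags are the ones defined by the parser above:

# python eval_squad.py /path/to/model_dir -c dev -b 200 --answer_bounds 17 -o predictions.json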
Example #5
def main():
    parser = argparse.ArgumentParser(description='Evaluate a model on SQuAD')
    parser.add_argument('model', help='model directory to evaluate')
    parser.add_argument("-o", "--official_output", type=str, help="where to output an official result file")
    parser.add_argument('-n', '--sample_questions', type=int, default=None,
                        help="(for testing) run on a subset of questions")
    parser.add_argument('--answer_bounds', nargs='+', type=int, default=[17],
                        help="Max size of answer")
    parser.add_argument('-b', '--batch_size', type=int, default=200,
                        help="Batch size, larger sizes can be faster but uses more memory")
    parser.add_argument('-s', '--step', default=None,
                        help="Weights to load, can be a checkpoint step or 'latest'")
    parser.add_argument('-c', '--corpus', choices=["dev", "train"], default="dev")
    parser.add_argument('--no_ema', action="store_true", help="Don't use EMA weights even if they exist")
    parser.add_argument('--none_prob', action="store_true", help="Output none probability for samples")
    parser.add_argument('--elmo', action="store_true", help="Use elmo model")
    parser.add_argument('--per_question_loss_file', type=str, default=None,
            help="Run question by question and output a question_id -> loss output to this file")
    args = parser.parse_known_args()[0]

    model_dir = ModelDir(args.model)

    corpus = SquadCorpus()
    if args.corpus == "dev":
        questions = corpus.get_dev()
    else:
        questions = corpus.get_train()
    questions = split_docs(questions)

    if args.sample_questions:
        # Deterministic sampling: sort for a stable order, shuffle with a fixed seed, keep a prefix.
        questions = sorted(questions, key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(questions)
        questions = questions[:args.sample_questions]

    questions.sort(key=lambda x:x.n_context_words, reverse=True)
    dataset = ParagraphAndQuestionDataset(questions, FixedOrderBatcher(args.batch_size, True))

    evaluators = [SpanEvaluator(args.answer_bounds, text_eval="squad")]
    if args.official_output is not None:
        evaluators.append(RecordSpanPrediction(args.answer_bounds[0]))
    if args.per_question_loss_file is not None:
        evaluators.append(RecordSpanPredictionScore(args.answer_bounds[0], args.batch_size, args.none_prob))

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    model = model_dir.get_model()
    if args.elmo:
        model.lm_model.lm_vocab_file = './elmo-params/squad_train_dev_all_unique_tokens.txt'
        model.lm_model.options_file = './elmo-params/options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json'
        model.lm_model.weight_file = './elmo-params/squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5'
        model.lm_model.embed_weights_file = None


    evaluation = trainer.test(model, evaluators, {args.corpus: dataset},
                              corpus.get_resource_loader(), checkpoint, not args.no_ema)[args.corpus]

    # Print the scalar results in a two column table
    scalars = evaluation.scalars
    cols = list(sorted(scalars.keys()))
    table = [cols]
    header = ["Metric", ""]
    table.append([("%s" % scalars[x] if x in scalars else "-") for x in cols])
    print_table([header] + transpose_lists(table))

    # Save the official output
    if args.official_output is not None:
        quid_to_para = {}
        for x in questions:
            quid_to_para[x.question_id] = x.paragraph

        q_id_to_answers = {}
        q_ids = evaluation.per_sample["question_id"]
        spans = evaluation.per_sample["predicted_span"]
        for q_id, (start, end) in zip(q_ids, spans):
            text = quid_to_para[q_id].get_original_text(start, end)
            q_id_to_answers[q_id] = text

        with open(args.official_output, "w") as f:
            json.dump(q_id_to_answers, f)

    if args.per_question_loss_file is not None:
        print("Saving result")
        output_file = args.per_question_loss_file
        ids = evaluation.per_sample["question_ids"]
        f1s = evaluation.per_sample["text_f1"]
        ems = evaluation.per_sample["text_em"]
        losses = evaluation.per_sample["loss"]

        if args.none_prob:
            none_probs = evaluation.per_sample["none_probs"]
            """
            results = {question_id: {'f1': float(f1), 'em': float(em), 'loss': float(loss), 'none_prob': float(none_prob)} for question_id, f1, em, loss, none_prob in zip(ids, f1s, ems, losses, none_probs)}
            """
            results = {question_id: float(none_prob) for question_id, none_prob in zip(ids, none_probs)}
        else:
            results = {question_id: {'f1': float(f1), 'em': float(em), 'loss': float(loss)} for question_id, f1, em, loss in zip(ids, f1s, ems, losses)}


        with open(output_file, 'w') as f:
            json.dump(results, f)
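This last example adds ELMo weights, a none-probability output, and a per-question metrics dump. A hedged sketch of reading that dump back follows; the key names mirror the dict built above, and the file name is an assumption.

import json

# Load the per-question results written via --per_question_loss_file.
with open("per_question_loss.json") as f:
    results = json.load(f)

# Each value is {'f1': ..., 'em': ..., 'loss': ...}, or a bare none-probability
# float when --none_prob was passed.
for question_id, value in list(results.items())[:5]:
    print(question_id, value)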