Esempio n. 1
0
def main():
    """Build the SQuAD relevance-prediction corpus from the command line.

    The train and dev files are parsed with ``build_squad_data_async`` when
    ``--num-workers`` is greater than 1 and with ``build_squad_data_sync``
    otherwise; the resulting documents and questions are handed to
    ``SquadRelevanceCorpus.make_corpus``.
    """
    from os import makedirs

    parser = argparse.ArgumentParser("Create a Squad dataset for relevance prediction")
    parser.add_argument("--train-file", default=config.SQUAD_TRAIN_FILE)
    parser.add_argument("--dev-file", default=config.SQUAD_DEV_FILE)
    parser.add_argument("--doc-db", default=config.DOC_DB)
    parser.add_argument("--full-doc-db", default=config.FULL_DOC_DB)
    parser.add_argument("--ranker", default=config.TFIDF_FILE)
    parser.add_argument('--num-workers', type=int, default=1, help='Number of CPU processes')
    args = parser.parse_args()

    # Create the directory only after argument parsing, so that ``--help`` or a
    # usage error leaves the filesystem untouched. ``makedirs`` also creates a
    # missing CORPUS_DIR and does not race with a concurrent creator.
    # NOTE(review): presumably this is where make_corpus writes - confirm.
    makedirs(join(config.CORPUS_DIR, 'squad'), exist_ok=True)

    if args.num_workers > 1:
        print(f"Multiprocessing with {args.num_workers} threads...")

        def build(data_file):
            return build_squad_data_async(data_file, args.doc_db, args.full_doc_db, args.ranker,
                                          args.num_workers)
    else:
        def build(data_file):
            return build_squad_data_sync(data_file, args.doc_db, args.full_doc_db, args.ranker)

    print("Parsing train...")
    train_docs, train_qs = build(args.train_file)
    print("Parsing dev...")
    dev_docs, dev_qs = build(args.dev_file)

    print("Saving...")
    SquadRelevanceCorpus.make_corpus(train_docs, train_qs, dev_docs, dev_qs)
    print("Done")
Esempio n. 2
0
def main():
    """Evaluate tf-idf scoring on the SQuAD relevance dev questions.

    Each dev question (after ``SquadTextLengthPreprocessor(600)``) is scored
    with ``get_rank_in_distractors``; the mean of the returned ranks and the
    fraction of ranks equal to 1 are printed. With ``--per-doc`` the work is
    delegated to ``main_for_document`` instead.
    """
    parser = argparse.ArgumentParser(description='Evaluate tf-idf scoring on full squad.')
    parser.add_argument('--ranker', action='store_true', help='Whether to use bi-gram hashing or not')
    parser.add_argument('--per-doc', action='store_true')
    parser.add_argument('--num-workers', type=int, default=1)
    args = parser.parse_args()

    ranker = None
    if args.ranker:
        print("Loading ranker...")
        ranker = TfidfDocRanker()

    if args.per_doc:
        # main_for_document only uses the truthiness of its first argument.
        return main_for_document(ranker, args.num_workers)

    print("Loading data...")
    corpus = SquadRelevanceCorpus()
    questions = corpus.get_dev()

    question_preprocessor = SquadTextLengthPreprocessor(600)
    # Preprocess every question exactly once and drop the ones the
    # preprocessor rejects (returns None for).
    preprocessed = (question_preprocessor.preprocess(x) for x in questions)
    questions = [q for q in preprocessed if q is not None]

    if args.num_workers <= 1:
        if args.ranker:
            # NOTE(review): init() presumably sets up the per-process ranker - confirm.
            init()
        gold_ranks = [get_rank_in_distractors(q) for q in tqdm(questions)]
    else:
        # Setup worker pool
        workers = ProcessPool(
            args.num_workers,
            initializer=init if args.ranker else None,
            initargs=[]
        )

        gold_ranks = []
        # A single progress bar with a known total; results arrive unordered,
        # which is fine because only aggregate statistics are reported.
        with tqdm(total=len(questions)) as pbar:
            for rank in workers.imap_unordered(get_rank_in_distractors, questions):
                gold_ranks.append(rank)
                pbar.update()

    mean_rank = np.mean(gold_ranks)
    precision_at_1 = Counter(gold_ranks)[1]/len(gold_ranks)

    print(f"Mean Rank: {mean_rank}")
    print(f"Precision @ 1: {precision_at_1}")
Esempio n. 3
0
def build_squad_elmo(vocab_file, embd_file):
    """Run ``build_vocab_from_preprocessed`` over the SQuAD relevance data.

    The corpus is wrapped in ``SquadBinaryRelevanceTrainingData`` without any
    filter, preprocessor or sampling, using one shared batcher for train and
    dev. ``vocab_file`` and ``embd_file`` are passed through unchanged.
    """
    squad_corpus = SquadRelevanceCorpus()
    shared_batcher = ClusteredBatcher(64, multiple_contexts_len, truncate_batches=True)
    training_data = SquadBinaryRelevanceTrainingData(
        corpus=squad_corpus,
        train_batcher=shared_batcher,
        dev_batcher=shared_batcher,
        sample_filter=None,
        preprocessor=None,
        sample_train=None,
        sample_dev=None,
        sample_seed=18,
    )
    build_vocab_from_preprocessed(training_data, vocab_file, embd_file)
Esempio n. 4
0
def main_for_document(use_ranker, num_workers):
    """Rank each dev question's gold paragraph within its own document.

    For every dev question the candidate set is the prefix of its document's
    paragraphs up to the highest ``par_id`` referenced by any question of that
    document. Prints the mean rank and the fraction of ranks equal to 1.

    Args:
        use_ranker: Only its truthiness is used; when truthy ``init`` is run
            (in this process, or as the initializer of every worker).
        num_workers: Number of worker processes; values <= 1 run in-process.
    """
    print("Loading data...")
    corpus = SquadRelevanceCorpus()
    questions = corpus.get_dev()

    question_preprocessor = SquadTextLengthPreprocessor(600)
    # Preprocess every question exactly once and drop the rejected (None) ones.
    preprocessed = (question_preprocessor.preprocess(x) for x in questions)
    questions = [q for q in preprocessed if q is not None]

    def doc_title(q):
        return q.paragraph.doc_title

    # groupby only merges adjacent items, hence the sort by the same key.
    title2max = {title: max(q.paragraph.par_id for q in group)
                 for title, group in itertools.groupby(sorted(questions, key=doc_title), key=doc_title)}

    def candidate_paragraphs(q):
        """Return the document paragraphs up to the last one any question refers to."""
        title = doc_title(q)
        return corpus.dev_title_to_document[title].paragraphs[:title2max[title] + 1]

    if num_workers <= 1:
        if use_ranker:
            init()
        gold_ranks = [get_rank_in_document(q, candidate_paragraphs(q)) for q in tqdm(questions)]
    else:
        # Setup worker pool
        workers = ProcessPool(
            num_workers,
            initializer=init if use_ranker else None,
            initargs=[]
        )

        data = [(q, candidate_paragraphs(q)) for q in questions]

        gold_ranks = []
        # A single progress bar with a known total; results arrive unordered,
        # which is fine because only aggregate statistics are reported.
        with tqdm(total=len(questions)) as pbar:
            for rank in workers.imap_unordered(get_rank_in_document_async, data):
                gold_ranks.append(rank)
                pbar.update()

    mean_rank = np.mean(gold_ranks)
    precision_at_1 = Counter(gold_ranks)[1]/len(gold_ranks)

    print(f"Mean Rank: {mean_rank}")
    print(f"Precision @ 1: {precision_at_1}")
Esempio n. 5
0
def encode_all_squad(encoder_model: str):
    """Encode every SQuAD train and dev document and save one ``.npz`` per title.

    For each split, every document is truncated in place to the paragraphs up
    to the highest ``par_id`` referenced by a question, encoded with
    ``encode_document`` and written with ``np.savez_compressed`` to the path
    returned by ``get_filename(is_train, title)``.

    Args:
        encoder_model: Model directory passed to ``SentenceEncoderSingleContext``.
    """
    print("loading data...")
    corpus = SquadRelevanceCorpus()
    train = corpus.get_train()
    dev = corpus.get_dev()

    spec = QuestionAndParagraphsSpec(batch_size=None,
                                     max_num_contexts=1,
                                     max_num_question_words=None,
                                     max_num_context_words=None)
    voc = corpus.get_vocab()

    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model,
                                           vocabulary=voc,
                                           spec=spec,
                                           loader=corpus.get_resource_loader())

    # Carry an explicit flag per split instead of re-deriving it by comparing
    # question lists, which is an element-wise comparison and is ambiguous
    # when both lists compare equal (e.g. both empty).
    splits = [(True, train, corpus.train_title_to_document),
              (False, dev, corpus.dev_title_to_document)]

    for is_train, questions, title2doc in splits:
        print(
            f"Starting encoding of {'train' if is_train else 'dev'}")
        # eliminating distractors not from original squad
        # groupby only merges adjacent items, hence the sort by the same key.
        title2max = {
            key: max(x.paragraph.par_id for x in group)
            for key, group in itertools.groupby(
                sorted(questions, key=lambda x: x.paragraph.doc_title),
                key=lambda x: x.paragraph.doc_title)
        }
        # NOTE: this truncates the corpus' document objects in place.
        for title, max_par_id in title2max.items():
            document = title2doc[title]
            document.paragraphs = document.paragraphs[:max_par_id + 1]

        for title in tqdm(title2max):
            encoded = encode_document(encoder, title2doc[title])
            # savez_compressed takes array names as keyword arguments, so the
            # keys have to be strings.
            np.savez_compressed(
                get_filename(is_train, title),
                **{str(k): v for k, v in encoded.items()})
Esempio n. 6
0
def squad_build_drqa_doc_encodings(out_dir,
                                   encoder_model,
                                   num_workers,
                                   all_squad=False):
    """Encode the DrQA-database text of every document referenced by SQuAD.

    Looks up the text of each referenced title in the sqlite database at
    ``DRQA_DOC_DB``, splits/tokenizes it with ``get_document_paragraphs`` in a
    worker pool, encodes every paragraph and writes ``docs.json`` and
    ``encodings.npz`` (keys ``"<title>_<paragraph index>"``) into ``out_dir``.

    Args:
        out_dir: Existing directory that receives the two output files.
        encoder_model: Model directory passed to ``SentenceEncoderSingleContext``.
        num_workers: Size of the tokenization worker pool.
        all_squad: When true, train questions are included in addition to dev.
    """
    print("loading data...")
    corpus = SquadRelevanceCorpus()
    # Copy before extending so a list owned by the corpus is never mutated.
    questions = list(corpus.get_dev())
    if all_squad:
        questions.extend(corpus.get_train())

    squad_titles = {q.paragraph.doc_title for q in questions}

    # Had to manually resolve this (due to changes in Wikipedia?)
    db_title_overrides = {"Sky (United Kingdom)": "Sky UK"}
    # Database id -> SQuAD title, built in one pass so the pairing does not
    # depend on two separate sets happening to iterate in the same order.
    db_title_to_squad_title = {db_title_overrides.get(t, t): t for t in squad_titles}

    conn = sqlite3.connect(DRQA_DOC_DB)
    try:
        c = conn.cursor()
        c.execute("CREATE TEMPORARY TABLE squad_docs(id)")
        c.executemany("INSERT INTO squad_docs VALUES (?)",
                      [(x, ) for x in db_title_to_squad_title])
        c.execute("SELECT id, text FROM documents WHERE id IN squad_docs")
        out = c.fetchall()
    finally:
        # Release the connection even if one of the statements fails.
        conn.close()

    # Titles that are absent from the database are simply not returned.
    out = [(db_title_to_squad_title[title], text) for title, text in out]

    spec = QuestionAndParagraphsSpec(batch_size=None,
                                     max_num_contexts=1,
                                     max_num_question_words=None,
                                     max_num_context_words=None)
    voc = corpus.get_vocab()
    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model,
                                           vocabulary=voc,
                                           spec=spec,
                                           loader=ResourceLoader())

    # Setup worker pool
    workers = ProcessPool(num_workers, initializer=init, initargs=[])

    documents = {}
    tokenized_documents = {}

    print("Tokenizing...")
    # A single progress bar with a known total. Each result is a pair that is
    # merged into the raw and the tokenized document mapping respectively.
    with tqdm(total=len(out)) as pbar:
        for doc, tok_doc in workers.imap_unordered(get_document_paragraphs, out):
            documents.update(doc)
            tokenized_documents.update(tok_doc)
            pbar.update()

    encodings = {}
    print("Encoding...")
    # The encoder input type requires a question; only the paragraph side is
    # encoded here, so a fixed placeholder is used.
    dummy_question = "Hello Hello".split()
    for title, paragraphs in tqdm(tokenized_documents.items()):
        model_paragraphs = [
            BinaryQuestionAndParagraphs(question=dummy_question,
                                        paragraphs=[x],
                                        label=1,
                                        num_distractors=0,
                                        question_id='dummy')
            for x in paragraphs
        ]
        encoded = encoder.encode_paragraphs(model_paragraphs)
        for i, rep in enumerate(encoded):
            encodings[f"{title}_{i}"] = rep

    with open(join(out_dir, 'docs.json'), 'w') as f:
        json.dump(documents, f)
    np.savez_compressed(join(out_dir, 'encodings.npz'), **encodings)
Esempio n. 7
0
def _split_paragraph_name(p_name):
    """Split an encoding key at its last underscore into ``(title, index)`` strings.

    A key without any underscore yields an empty title and the whole key as
    index.
    """
    title, _, index = p_name.rpartition('_')
    return title, index


def _rank_paragraphs(encoder, questions, named_encodings, documents, k, verbose=False):
    """Return one eval-format record per question with its top paragraphs.

    Args:
        encoder: Encoder handed to ``encode_squad.encode_questions``.
        questions: Questions to rank paragraphs for.
        named_encodings: Mapping of ``"<title>_<index>"`` to that paragraph's
            encoding array; ``len(rep)`` rows per paragraph.
        documents: Mapping of title to its list of paragraphs.
        k: Maximum number of distinct paragraphs per question.
        verbose: Print ``"scoring"`` before the nearest-neighbour search.
    """
    q_encodings = encode_squad.encode_questions(encoder, questions)

    # Stack all paragraph encodings and remember which paragraph every row of
    # the stacked matrix came from.
    id2par = {}
    reps = []
    total_rows = 0
    for p_name, rep in named_encodings.items():
        for row in range(total_rows, total_rows + len(rep)):
            id2par[row] = p_name
        reps.append(rep)
        total_rows += len(rep)
    reps = np.concatenate(reps, axis=0)

    if verbose:
        print("scoring")
    # Ask for 2*k rows because several rows can belong to the same paragraph;
    # after de-duplication there may still be fewer than k paragraphs.
    top_k = simple_numpy_knn(q_encodings, reps, k * 2)

    records = []
    for idx, question in enumerate(questions):
        # dict.fromkeys removes repeated paragraphs while keeping rank order.
        p_names = list(dict.fromkeys(id2par[x] for x in top_k[idx]))[:k]
        paragraphs = []
        for p_name in p_names:
            title, index = _split_paragraph_name(p_name)
            paragraphs.append(documents[title][int(index)])
        records.append({
            'qid': question.question_id,
            'question': ' '.join(question.question),
            'answers': list(question.answers),
            'paragraphs': paragraphs,
        })
    return records


def build_doc_eval_file(out_file,
                        encodings_dir,
                        encoder_model,
                        k,
                        per_doc=True):
    """Write a JSON file pairing each dev question with its top-k paragraphs.

    Paragraph encodings and texts are read from ``encodings.npz`` and
    ``docs.json`` in ``encodings_dir``.

    Args:
        out_file: Path of the JSON file to write.
        encodings_dir: Directory holding ``encodings.npz`` and ``docs.json``.
        encoder_model: Model directory passed to ``SentenceEncoderSingleContext``.
        k: Maximum number of paragraphs kept per question.
        per_doc: When true, each question is only ranked against paragraphs of
            its own document; otherwise against all paragraphs.

    Raises:
        KeyError: In ``per_doc`` mode, if a question's document title has no
            encodings.
    """
    print("loading data...")
    corpus = SquadRelevanceCorpus()
    questions = corpus.get_dev()
    spec = QuestionAndParagraphsSpec(batch_size=None,
                                     max_num_contexts=1,
                                     max_num_question_words=None,
                                     max_num_context_words=None)
    voc = corpus.get_vocab()
    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model,
                                           vocabulary=voc,
                                           spec=spec,
                                           loader=corpus.get_resource_loader())

    # Read every array eagerly so the archive's file handle can be closed;
    # both modes below keep all encodings in memory anyway.
    with np.load(join(encodings_dir, 'encodings.npz')) as npz:
        par_encs = {name: npz[name] for name in npz.files}
    with open(join(encodings_dir, 'docs.json'), 'r') as f:
        documents = json.load(f)

    questions_eval_format = []
    # Sorted by title so that groupby below sees each document exactly once.
    questions = sorted(questions, key=lambda x: x.paragraph.doc_title)
    if per_doc:
        title2par_encs = {}
        for p_name, rep in par_encs.items():
            title = _split_paragraph_name(p_name)[0]
            title2par_encs.setdefault(title, {})[p_name] = rep
        for title, doc_qs in tqdm(
                itertools.groupby(questions,
                                  key=lambda x: x.paragraph.doc_title)):
            questions_eval_format.extend(
                _rank_paragraphs(encoder, list(doc_qs), title2par_encs[title],
                                 documents, k))
    else:
        print("encoding questions")
        questions_eval_format.extend(
            _rank_paragraphs(encoder, questions, par_encs, documents, k,
                             verbose=True))

    with open(out_file, 'w') as f:
        json.dump(questions_eval_format, f)
Esempio n. 8
0
def main():
    """Train (or resume training of) a model on the SQuAD relevance dataset.

    A new run is stored under ``<name>-<MMDD-HHMMSS>``; with
    ``-c/--continue_model`` the existing run directory ``<name>`` is resumed.
    The source of this script is passed to the trainer as the run's notes.
    """
    parser = argparse.ArgumentParser(description='Train a model on the Squad relevance dataset')
    parser.add_argument("name", help="Where to store the model")
    parser.add_argument("--elmo", action='store_true', help="Whether to use elmo or not")
    parser.add_argument("-c", "--continue_model", action='store_true', help="Whether to start a new run or "
                                                                            "continue an existing one")
    args = parser.parse_args()

    with open(__file__, "r") as f:
        notes = f.read()

    continue_existing_run = args.continue_model
    if continue_existing_run:
        print("We will continue an existing run!")
        out = args.name
    else:
        print("We will start a new run!")
        # Timestamp suffix keeps separate runs with the same name apart.
        out = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S")

    # model = get_basic_model(500, post_merger_params=None, use_elmo=args.elmo, keep_rate=0.8)
    # model = get_context_to_question_model(rnn_dim=150, q2c=False, res_rnn=True, res_self_att=False)
    # model = get_context_with_bottleneck_to_question_model(rnn_dim=500, q2c=False, res_rnn=True, res_self_att=False)
    # model = get_ablate_model()
    # model = get_fixed_context_to_question(150)
    # model = get_bottleneck_to_seq_model(500, q2c=False, res_rnn=True, res_self_att=False, seq_len=50)
    # model = get_multi_encode_model(0, 200, num_encodings=5, map_embed=False)
    # model = get_multi_encode_softmax_weighting_model(0, 400, num_encodings=5, map_embed=False)
    model = get_sentences_model(512, use_elmo=args.elmo, keep_rate=0.8)

    corpus = SquadRelevanceCorpus()
    train_batcher = ClusteredBatcher(45, multiple_contexts_len, truncate_batches=True)
    dev_batcher = ClusteredBatcher(128, multiple_contexts_len, truncate_batches=True)
    data = SquadBinaryRelevanceTrainingData(corpus=corpus, train_batcher=train_batcher, dev_batcher=dev_batcher,
                                            sample_filter=None, preprocessor=SquadTextLengthPreprocessor(600),
                                            sample_train=None, sample_dev=None, sample_seed=18)

    # Named ``evaluators`` rather than ``eval`` to avoid shadowing the builtin.
    evaluators = [LossEvaluator(), BinaryClassificationEvaluator()]

    n_epochs = 80

    # Only Adadelta is used, and no learning-rate schedule is passed to the
    # trainer (``reduce_lr_on_plateau=None`` below).
    adadelta = SerializableOptimizer("Adadelta", dict(learning_rate=1.0))

    params = TrainParams(
        adadelta,
        num_epochs=n_epochs, ema=0.999, max_checkpoints_to_keep=2,
        async_encoding=8, log_period=30, eval_period=1800, save_period=1800,
        eval_samples=dict(dev=None, train=3000), best_weights=('dev', 'binary-relevance/average_precision'),
        monitor_gradients=True, clip_norm=None, regularization_lambda=None, reduce_lr_on_plateau=None
    )

    if not continue_existing_run:
        trainer.start_training(data, model, params, evaluators, model_dir.ModelDir(out), notes, save_graph=False)
    else:
        resume_training_with(data=data, out=model_dir.ModelDir(out),
                             train_params=params, evaluators=evaluators, notes=notes, start_eval=True)
Esempio n. 9
0
def main():
    """Evaluate a trained relevance model on the SQuAD dev questions.

    Loads the requested checkpoint (an explicit step, ``latest``, or by
    default the best weights with a fallback to the latest checkpoint), runs
    ``trainer.test`` and prints the scalar metrics as a two-column table.
    With ``--save-errors`` the per-question ranking errors are written to the
    given file.
    """
    parser = argparse.ArgumentParser(description='Evaluate a model on SQuAD')
    parser.add_argument('model', help='model directory to evaluate')
    parser.add_argument('-n', '--sample_questions', type=int, default=None,
                        help="(for testing) run on a subset of questions")
    parser.add_argument('-b', '--batch_size', type=int, default=64,
                        help="Batch size, larger sizes can be faster but uses more memory")
    parser.add_argument('-s', '--step', default=None,
                        help="Weights to load, can be a checkpoint step or 'latest'")
    parser.add_argument('--no-ema', action="store_true", help="Don't use EMA weights even if they exist")
    parser.add_argument('--per-doc', action='store_true', help="Whether to test only against full doc, or against "
                                                               "distractors.")
    parser.add_argument('--save-errors', default=None, type=str)
    args = parser.parse_args()

    model_dir = ModelDir(args.model)

    corpus = SquadRelevanceCorpus()
    questions = corpus.get_dev()

    question_preprocessor = SquadTextLengthPreprocessor(600)
    # Preprocess every question exactly once and drop the rejected (None) ones.
    preprocessed = (question_preprocessor.preprocess(x) for x in questions)
    questions = [q for q in preprocessed if q is not None]

    if args.sample_questions:
        # Sort first so the fixed-seed shuffle picks the same subset on every run.
        questions = sorted(questions, key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(questions)
        questions = questions[:args.sample_questions]

    batcher = ClusteredBatcher(args.batch_size, multiple_contexts_len, truncate_batches=True)
    if args.per_doc:
        data = SquadFullDocumentDataset(questions, batcher, corpus.dev_title_to_document)
    else:
        data = SquadFullQuestionParagraphPairsDataset(questions, batcher)

    evaluators = [BinaryClassificationEvaluator(), RecordFineGrainedBinaryPrediction(), RecordFullRankings()]

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    model = model_dir.get_model()

    evaluation = trainer.test(model, evaluators, {'dev_full': data},
                              corpus.get_resource_loader(), checkpoint, not args.no_ema, 10)['dev_full']

    if args.save_errors is not None:
        errors_dict = evaluation.per_sample['per_question_errors']

        def format_error(wrong: Tuple[BinaryQuestionAndParagraphs, float],
                         correct: Tuple[BinaryQuestionAndParagraphs, float]):
            """Render one (top-ranked wrong passage, correct passage) pair as text."""
            question = ' '.join(wrong[0].question)
            qid = wrong[0].question_id
            wrong_text = ' '.join(wrong[0].paragraphs[0])
            wrong_score = wrong[1]
            correct_text = ' '.join(correct[0].paragraphs[0])
            correct_score = correct[1]
            return f"Question: {question}, ID: {qid}\n" \
                   f"Incorrect First Place: (score: {wrong_score})\n{wrong_text}\n" \
                   f"Correct Passage: (score: {correct_score})\n{correct_text}\n"

        with open(args.save_errors, 'wt') as f:
            for false_par, true_par in errors_dict.values():
                f.write(format_error(false_par, true_par))

    # Print the scalar results in a two column table
    scalars = evaluation.scalars
    cols = sorted(scalars.keys())
    values = ["%s" % scalars[x] for x in cols]
    header = ["Metric", ""]
    print_table([header] + transpose_lists([cols, values]))