import argparse
from os.path import exists

# TriviaQaOpenDataset and get_evidence_voc are assumed to come from the
# document-qa (docqa) codebase; they are not standard-library names.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("output")
    parser.add_argument("-m", "--min_count", type=int, default=1)
    parser.add_argument("-n", "--n_processes", type=int, default=1)
    args = parser.parse_args()

    # Refuse to overwrite an existing vocabulary file
    if exists(args.output):
        raise ValueError("Output file %s already exists" % args.output)

    # Load the open-domain TriviaQA dataset and count word occurrences over its
    # evidence documents, parallelised across n_processes workers
    data = TriviaQaOpenDataset()
    corpus_voc = get_evidence_voc(data.evidence, args.n_processes)

    print("Adding question voc...")
    train = data.get_train()
    for q in train:
        corpus_voc.update(q.question)

    print("Saving...")
    with open(args.output, "w") as f:
        for word, c in corpus_voc.items():
            if c >= args.min_count:
                f.write(word)
                f.write("\n")
Example #2
from os.path import join

import pandas as pd

# Fragment of a multi-dataset training script: `args` is assumed to come from an
# argparse parser defined earlier, and CORPUS_DIR / TriviaQaOpenDataset from the
# document-qa (docqa) codebase.
all_train_questions = []
all_dev_questions = []
all_filemaps = {}
for ind, dataset_name in enumerate(args.datasets.split(',')):
    print('loading ' + dataset_name)
    source_dir = join(CORPUS_DIR, "triviaqa", "web-open", dataset_name)

    # Just loads the pkl that was saved in build_span_corpus
    dataset = TriviaQaOpenDataset(source_dir)

    # Take dev questions from every dataset when sample_first == 1.0,
    # otherwise only from the first (target) dataset
    if args.sample_first == 1.0 or ind == 0:
        all_dev_questions += dataset.get_dev()

    # Count how many variants of this dataset (ignoring the -G / -O suffixes) appear in args.datasets
    num_of_contexts = (pd.Series(args.datasets.replace('-G', '').replace('-O', '').split(',')) ==
                       dataset_name.replace('-G', '').replace('-O', '')).sum()

    train = dataset.get_train()

    # Filtering cases with no answer:
    train_with_ans = []
    for question in train:
        if pd.Series([len(doc.answer_spans)
                      for doc in question.all_docs]).sum() > 0:
            train_with_ans.append(question)

    print("number of question with answer is %d" % (len(train_with_ans)))

    # sample_first assumes the first dataset in the list is our target dataset; for ablation we may
    # wish to take only a sample of it for training. sample_first is in (0, 1].
    if args.sample_first <= 1.0 and ind == 0:
        all_train_questions += list(
            pd.Series(train_with_ans).sample(frac=args.sample_first))
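
The filter-and-sample pattern above can be illustrated with a minimal, self-contained sketch; the Doc and Question classes below are hypothetical stand-ins for the docqa question objects:

import pandas as pd


class Doc:
    def __init__(self, answer_spans):
        self.answer_spans = answer_spans


class Question:
    def __init__(self, all_docs):
        self.all_docs = all_docs


questions = [Question([Doc([(0, 1)])]),
             Question([Doc([(2, 3)])]),
             Question([Doc([])])]

# Keep only questions where at least one document contains an answer span
with_ans = [q for q in questions
            if sum(len(d.answer_spans) for d in q.all_docs) > 0]

# Randomly sample half of the remaining questions, mirroring args.sample_first
sampled = list(pd.Series(with_ans).sample(frac=0.5))
print(len(with_ans), len(sampled))  # 2, 1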