Example #1
0
def build_web_corpus(n_processes, sets_to_build, source_dir, target_dir):
    """Build the TriviaQA web corpus for the requested dataset splits.

    Args:
        n_processes: number of worker processes passed to ``build_dataset``.
        sets_to_build: collection of split names to build; any of
            'verified', 'dev', 'train', 'test'. Unknown names are ignored.
        source_dir: directory containing the TriviaQA question JSON files.
        target_dir: output directory handed to ``build_dataset``.
    """
    # Source JSON file for each supported split.
    split_files = {
        'verified': "verified-web-dev.json",
        'dev': "web-dev.json",
        'train': "web-train.json",
        'test': "web-test-without-answers.json",
    }
    # Keep only the splits the caller asked for (replaces the old
    # repetitive if-chain and the stale commented-out dict).
    sets_to_build_dict = {
        name: join(source_dir, filename)
        for name, filename in split_files.items()
        if name in sets_to_build
    }

    build_dataset("web",
                  NltkAndPunctTokenizer(),
                  sets_to_build_dict,
                  FastNormalizedAnswerDetector(),
                  n_processes,
                  out_dir=target_dir)
Example #2
0
def build_web_corpus(n_processes):
    """Build the full TriviaQA web corpus (verified/dev/train/test splits)."""
    qa_dir = join(TRIVIA_QA, "qa")
    splits = {
        "verified": join(qa_dir, "verified-web-dev.json"),
        "dev": join(qa_dir, "web-dev.json"),
        "train": join(qa_dir, "web-train.json"),
        "test": join(qa_dir, "web-test-without-answers.json"),
    }
    build_dataset("web", NltkAndPunctTokenizer(), splits,
                  FastNormalizedAnswerDetector(), n_processes)
Example #3
0
def build_wiki_corpus(n_processes):
    """Build the TriviaQA wikipedia corpus (verified, dev, and train splits)."""
    question_files = {
        "verified": "verified-wikipedia-dev.json",
        "dev": "wikipedia-dev.json",
        "train": "wikipedia-train.json",
    }
    splits = {name: join(TRIVIA_QA, "qa", fname)
              for name, fname in question_files.items()}
    build_dataset("wiki", NltkAndPunctTokenizer(), splits,
                  FastNormalizedAnswerDetector(), n_processes)
Example #4
0
def build_sample_corpus(n_processes):
    """Build a sampled web corpus from the dev and train splits.

    Passes sample=1000 to ``build_dataset``; presumably this caps the
    number of questions taken per split — confirm against build_dataset.
    """
    splits = {
        "dev": join(TRIVIA_QA, "qa", "web-dev.json"),
        "train": join(TRIVIA_QA, "qa", "web-train.json"),
    }
    build_dataset("web-sample", NltkAndPunctTokenizer(), splits,
                  FastNormalizedAnswerDetector(), n_processes,
                  sample=1000)
Example #5
0
def build_unfiltered_corpus(n_processes):
    """Build the unfiltered ("web-open") TriviaQA corpus for dev/train/test."""
    file_names = {
        "dev": "unfiltered-web-dev.json",
        "train": "unfiltered-web-train.json",
        "test": "unfiltered-web-test-without-answers.json",
    }
    splits = {split: join(TRIVIA_QA_UNFILTERED, fname)
              for split, fname in file_names.items()}
    build_dataset("web-open",
                  NltkAndPunctTokenizer(),
                  splits,
                  answer_detector=FastNormalizedAnswerDetector(),
                  n_process=n_processes)
Example #6
0
def build_unfiltered_corpus(n_processes, sets_to_build, source_dir,
                            target_dir):
    """Build the unfiltered ("web-open") corpus for the requested splits.

    Args:
        n_processes: number of worker processes passed to ``build_dataset``.
        sets_to_build: collection of split names; any of 'dev', 'train',
            'test'. Unknown names are ignored.
        source_dir: directory containing the unfiltered question JSON files.
        target_dir: output directory handed to ``build_dataset``.
    """
    # Source JSON file for each supported split.
    split_files = {
        'dev': "unfiltered-web-dev.json",
        'train': "unfiltered-web-train.json",
        'test': "unfiltered-web-test-without-answers.json",
    }
    sets_to_build_dict = {
        split: join(source_dir, fname)
        for split, fname in split_files.items()
        if split in sets_to_build
    }

    build_dataset("web-open",
                  NltkAndPunctTokenizer(),
                  sets_to_build_dict,
                  answer_detector=FastNormalizedAnswerDetector(),
                  n_process=n_processes,
                  out_dir=target_dir)