Example no. 1
0
def build(opt):
    """Assemble the WMT en-de translation data into ParlAI text files."""
    dpath = os.path.join(opt['datapath'], 'wmt')
    version = 'None'

    if build_data.built(dpath, version_string=version):
        return  # already built at this version; nothing to do
    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # A stale build exists: wipe it before rebuilding.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    def dump(pairs, out_name):
        # Write one "1 <en>\t<de>" line per sentence pair.
        with open(os.path.join(dpath, out_name), 'w') as out:
            for de_sent, en_sent in pairs:
                out.write('1 ' + en_sent + '\t' + de_sent + '\n')

    train_pairs = readFiles(dpath, ('train.en', 'train.de'))
    numpy.random.shuffle(train_pairs)
    # The first 30k shuffled pairs become the validation split.
    dump(train_pairs[:30000], 'en_de_valid.txt')
    dump(train_pairs[30000:], 'en_de_train.txt')

    test_pairs = readFiles(dpath, ('newstest2014.en', 'newstest2014.de'))
    dump(test_pairs, 'en_de_test.txt')

    # Record the build so future calls are no-ops.
    build_data.mark_done(dpath, version_string=version)
Example no. 2
0
def build(opt):
    """Download and unpack the COCO 2014 caption annotations."""
    dpath = os.path.join(opt['datapath'], 'COCO_2014_Caption')
    version = '1.0'

    if build_data.built(dpath, version_string=version):
        return  # already built at this version
    print(f'[building data: {dpath}]')

    if build_data.built(dpath):
        # a stale version is present: clear it out first
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    archive = 'dataset_coco.tgz'
    base_url = 'http://parl.ai/downloads/coco_caption/'
    build_data.download(base_url + archive, dpath, archive)
    build_data.untar(dpath, archive)

    build_data.mark_done(dpath, version_string=version)
Example no. 3
0
def build(opt):
    """Fetch the two-part Twitter chat corpus and convert it to FB format."""
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'Twitter')

    if build_data.built(dpath, version):
        return
    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # remove an outdated build before starting over
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    base = 'https://github.com/Marsan-Ma/chat_corpus/raw/master/'
    parts = ["twitter_en_big.txt.gz.partaa", "twitter_en_big.txt.gz.partab"]
    for part in parts:
        build_data.download(base + part, dpath, part)

    # Re-join the split gzip archive into a single file.
    outzipfile = os.path.join(dpath, "twitter_en_big.txt.gz")
    build_data.cat(
        os.path.join(dpath, parts[0]),
        os.path.join(dpath, parts[1]),
        outzipfile,
    )

    import gzip
    with gzip.open(outzipfile, 'r') as f:
        raw = bytes.decode(f.read())
    # The first two lines are skipped, matching the original conversion.
    create_fb_format(raw.split('\n')[2:], dpath)
    os.remove(outzipfile)

    build_data.mark_done(dpath, version)
Esempio n. 4
0
def build(opt):
    """Download NaturalQuestionsOpen files, inflating any gzip archives."""
    dpath = os.path.join(opt['datapath'], 'NaturalQuestionsOpen')
    version = str(VERSION)

    if build_data.built(dpath, version_string=version):
        return
    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # an older build is present; rebuild from scratch
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    for resource in RESOURCES:
        resource.download_file(dpath)
        if ".gz" not in resource.file_name:
            continue
        # Decompress foo.gz next to the archive as foo (drop ".gz").
        src = os.path.join(dpath, resource.file_name)
        dst = os.path.join(dpath, resource.file_name[:-3])
        with gzip.open(src, 'rb') as fin:
            with open(dst, 'wb') as fout:
                shutil.copyfileobj(fin, fout)

    build_data.mark_done(dpath, version_string=version)
Esempio n. 5
0
def build(opt):
    """Prepares datasets and other dependencies for NERTeacher"""
    version = '1.1'
    dpath = os.path.join(opt['datapath'], 'ner')

    raw_path = os.path.abspath(opt['raw_dataset_path'] or ".")
    # Only download when no .iob files are already available locally.
    has_iob = any(name.endswith(".iob") for name in os.listdir(raw_path))
    if not has_iob:
        if not build_data.built(dpath, version_string=version):
            print(f'[target data path: {dpath}]')
            if build_data.built(dpath):
                # an outdated build exists: remove it before re-downloading
                build_data.remove_dir(dpath)
            build_data.make_dir(dpath)

            ds_path = os.environ.get('DATASETS_URL')
            file_name = 'gareev.tar.gz'
            if not ds_path:
                raise RuntimeError(
                    "Looks like the `DATASETS_URL` variable is set incorrectly"
                )
            print('Trying to download a dataset %s from the repository' %
                  file_name)
            full_url = urllib.parse.urljoin(ds_path, file_name)
            build_data.download(full_url, dpath, file_name)
            build_data.untar(dpath, file_name)
            print('Downloaded a %s dataset' % file_name)
            # record the completed build
            build_data.mark_done(dpath, version_string=version)
        opt['raw_dataset_path'] = dpath
    print("Use dataset from path: %s" % repr(opt['raw_dataset_path']))
    create_heap_file(opt['raw_dataset_path'])
def build(datapath, use_history):
    """Build the OpenSubtitles2018 (en) data; returns the build directory."""
    dpath = os.path.join(datapath, 'OpenSubtitles2018')
    if not use_history:
        # no-history variant lives in its own directory
        dpath += '_no_history'
    version = '1'

    if not build_data.built(dpath, version_string=version):
        print(f'[building data: {dpath}]')
        if build_data.built(dpath):
            # a stale version exists: remove it first
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        xml_root = os.path.join(dpath, 'OpenSubtitles', 'xml', 'en')

        # Skip the (large) download when the xml tree is already complete.
        if len(glob.glob(xml_root + '/*/*/*.xml')) != NUM_SUBTITLES_FILES:
            url = 'https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/xml/en.zip'
            build_data.download(url, dpath, 'OpenSubtitles2018.zip')
            build_data.untar(dpath, 'OpenSubtitles2018.zip')

        create_fb_format(xml_root, dpath, use_history)

        build_data.mark_done(dpath, version_string=version)
    return dpath
Esempio n. 7
0
def build(opt, subtask=None):
    """Build the Reddit subtask data from a local pickle (no download).

    Reads $HOME/data/<subtask>.pickle (or anime.pickle when no subtask is
    given) and converts it with create_fb_format_by_link.
    """
    # get path to data directory
    dpath = os.path.join(opt['datapath'], 'Reddit', subtask)

    # check if data had been previously built (subtask doubles as version tag)
    if not build_data.built(dpath, version_string=subtask):
        print('[building data: ' + dpath + ']')

        # make a clean directory if needed
        if build_data.built(dpath):
            # an older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # don't download the data: it is read from a local pickle.
        pickle_name = (subtask + '.pickle') if subtask else 'anime.pickle'
        fname = os.environ['HOME'] + '/data/' + pickle_name
        # Fixed: use a context manager so the file handle is closed
        # (previously pickle.load(open(...)) leaked the handle).
        with open(fname, 'rb') as f:
            data = pickle.load(f)

        create_fb_format_by_link(data, dpath, subtask)

        # mark the data as built
        build_data.mark_done(dpath, version_string=subtask)
Esempio n. 8
0
def build(opt):
    """Download the two-part Twitter corpus and convert it to FB format."""
    version = 'v1.1'
    dpath = os.path.join(opt['datapath'], 'Twitter')

    if build_data.built(dpath, version):
        return
    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # clear an outdated build first
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    for resource in RESOURCES:
        resource.download_file(dpath)

    part_paths = [
        os.path.join(dpath, RESOURCES[0].file_name),
        os.path.join(dpath, RESOURCES[1].file_name),
    ]

    # Concatenate both downloaded parts into one in-memory gzip stream.
    joined = io.BytesIO()
    for path in part_paths:
        with PathManager.open(path, 'rb') as part:
            joined.write(part.read())

    with gzip.GzipFile(fileobj=io.BytesIO(joined.getvalue())) as gz:
        text = bytes.decode(gz.read())
        lines = text.split('\n')[2:]  # skip the first two lines

    create_fb_format(lines, dpath)

    # the raw parts are no longer needed once converted
    PathManager.rm(part_paths[0])
    PathManager.rm(part_paths[1])

    build_data.mark_done(dpath, version)
def build(opt):
    """Download ConvAI2 wild-evaluation data and convert it to ParlAI format."""
    version = '0.2'
    dpath = os.path.join(opt['datapath'], 'ConvAI2_wild_evaluation')

    if build_data.built(dpath, version):
        return
    print(f'[building data: {dpath}]')

    if build_data.built(dpath):
        # an older build exists; remove it first
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    for resource in RESOURCES:
        resource.download_file(dpath)

    output_path = os.path.join(dpath, 'convai2_wild_evaluation.json')
    with open(output_path, 'r') as data_f:
        payload = json.load(data_f)

    make_parlai_format(payload, dpath)
    # the raw JSON is no longer needed once converted
    os.remove(output_path)

    build_data.mark_done(dpath, version)
Esempio n. 10
0
def build(opt):
    """Download MCTest and emit FB-format splits for mc160 and mc500."""
    dpath = os.path.join(opt['datapath'], 'MCTest')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # clear out an outdated build
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    archive = 'mctest.tar.gz'
    build_data.download('http://parl.ai/downloads/mctest/' + archive,
                        dpath, archive)
    build_data.untar(dpath, archive)

    dpext = os.path.join(dpath, 'mctest')
    # (split name, question file, answer file or None); only the test
    # splits come with a separate answers file.
    specs = [
        ('train160', 'mc160.train', None),
        ('valid160', 'mc160.dev', None),
        ('test160', 'mc160.test', 'mc160.test.ans'),
        ('train500', 'mc500.train', None),
        ('valid500', 'mc500.dev', None),
        ('test500', 'mc500.test', 'mc500.test.ans'),
    ]
    for split, qfile, afile in specs:
        answers = (os.path.join(dpext, 'MCTestAnswers', afile)
                   if afile else None)
        create_fb_format(dpath, split,
                         os.path.join(dpext, 'MCTest', qfile), answers)

    build_data.mark_done(dpath, version_string=version)
Esempio n. 11
0
def build(opt):
    """Download and extract the VQA v1 question/annotation zips for COCO 2014.

    This build is unversioned: any existing directory is removed and
    rebuilt from scratch.
    """
    dpath = opt['datapath'] + "/VQA-COCO2014/"

    if not build_data.built(dpath):
        print("[building data: " + dpath + "]")
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data. Fixed for consistency with the rest of this
        # file: build_data.download takes (url, path, fname) — previously
        # the arguments were reversed and the destination name was missing.
        fnames = [
            "Questions_Train_mscoco.zip",
            "Questions_Val_mscoco.zip",
            "Questions_Test_mscoco.zip",
            "Annotations_Val_mscoco.zip",
            "Annotations_Train_mscoco.zip",
        ]
        url = "http://visualqa.org/data/mscoco/vqa/"
        for fname in fnames:
            build_data.download(url + fname, dpath, fname)

        for fname in fnames:
            build_data.untar(dpath, fname)

        # NOTE(review): other examples call buildImage(opt), not
        # buildImage(dpath) — confirm which signature this helper expects.
        buildImage(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath)
Esempio n. 12
0
def build(opt):
    """Build the LIGHT dialogue data variant selected by the light_use_* flags."""
    dpath, version = download(opt)
    if 'light_use_speech_prefix' not in opt:
        opt['light_use_speech_prefix'] = True
    # Encode every relevant flag value into the directory name so each
    # flag combination gets its own cached build.
    fields = [
        'taskname',
        'setting',
        'objects',
        'person_names',
        'persona',
        'emote',
        'speech',
        'action',
        'affordances',
        'repeat',
        'cands',
        'current_self_output',
        'clip_cands',
        'speech_prefix',
    ]
    tag = "_".join(f + str(opt['light_use_' + f]) for f in fields)
    dpath2 = os.path.join(opt['datapath'], 'light_dialogue', tag)
    if not build_data.built(dpath2, version):
        if build_data.built(dpath2):
            # stale build for this flag combination: remove it
            build_data.remove_dir(dpath2)
        build_data.make_dir(dpath2)
        build_from_db(opt, dpath, dpath2, 'light_data.pkl',
                      'light_unseen_data.pkl')
        # Mark the data as built.
        build_data.mark_done(dpath2, version)
Esempio n. 13
0
def build(opt):
    """Download and extract the VQA v2 question and annotation archives."""
    dpath = os.path.join(opt['datapath'], 'VQA-v2')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print(f'[building data: {dpath}]')
    # remove leftovers from an older build, if any
    if build_data.built(dpath):
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    archives = [
        'v2_Questions_Train_mscoco.zip',
        'v2_Questions_Val_mscoco.zip',
        'v2_Questions_Test_mscoco.zip',
        'v2_Annotations_Val_mscoco.zip',
        'v2_Annotations_Train_mscoco.zip',
    ]
    url = 'http://visualqa.org/data/mscoco/vqa/'
    for archive in archives:
        build_data.download(url + archive, dpath, archive)
    for archive in archives:
        build_data.untar(dpath, archive)

    build_data.mark_done(dpath, version_string=version)
Esempio n. 14
0
def build(opt):
    """Download HotpotQA and write its splits in ParlAI format."""
    dpath = os.path.join(opt['datapath'], 'HotpotQA')

    if build_data.built(dpath, version_string=VERSION):
        return
    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # discard an out-of-date build
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # One converted output split per downloaded source file.
    splits = [
        (TRAIN_FILENAME, 'train'),
        (DEV_DISTRACTOR_FILENAME, 'valid_distractor'),
        (DEV_FULLWIKI_FILENAME, 'valid_fullwiki'),
    ]
    for fname, _ in splits:
        build_data.download(URL + fname, dpath, fname)

    for fname, split in splits:
        with open(os.path.join(dpath, fname)) as f:
            make_parlai_format(dpath, split, json.load(f))

    build_data.mark_done(dpath, version_string=VERSION)
Esempio n. 15
0
def build(opt):
    """Download MS MARCO v1.1 and convert each split to FB format."""
    dpath = os.path.join(opt['datapath'], 'MS_MARCO')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # out-of-date build; rebuild from scratch
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    base_url = "https://msmarco.blob.core.windows.net/msmarco/"
    # (remote archive, local split name) pairs
    splits = [
        ("train_v1.1.json.gz", "train"),
        ("dev_v1.1.json.gz", "valid"),
        ("test_public_v1.1.json.gz", "test"),
    ]
    for remote, split in splits:
        build_data.download(base_url + remote, dpath, split + '.gz')

    for _, split in splits:
        create_fb_format(dpath, split, os.path.join(dpath, split + '.gz'))

    build_data.mark_done(dpath, version_string=version)
Esempio n. 16
0
def build(opt):
    """Download the SCAN data and convert it to FB format."""
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'SCAN')

    if build_data.built(dpath, version):
        return
    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # an older build is present; remove before rebuilding
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    archive = 'scan.tgz'
    build_data.download('http://parl.ai/downloads/scan/' + archive,
                        dpath, archive)
    build_data.untar(dpath, archive)

    # NOTE(review): valid is sourced from the same file as train in the
    # original — presumably deliberate (no separate dev split); confirm.
    for split, src in (('train', 'tasks_train_simple.txt'),
                       ('valid', 'tasks_train_simple.txt'),
                       ('test', 'tasks_test_simple.txt')):
        create_fb_format(dpath, split, os.path.join(dpath, src))

    build_data.mark_done(dpath, version)
Esempio n. 17
0
def build(opt):
    """Download the Cornell Movie-Dialogs corpus and convert it to FB format."""
    dpath = os.path.join(opt['datapath'], 'CornellMovie')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # stale build: remove it first
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    archive = 'cornell_movie_dialogs_corpus.tgz'
    build_data.download('http://parl.ai/downloads/cornell_movie/' + archive,
                        dpath, archive)
    build_data.untar(dpath, archive)

    # The archive extracts into a directory with spaces in its name.
    corpus_dir = os.path.join(dpath, 'cornell movie-dialogs corpus')
    create_fb_format(
        os.path.join(corpus_dir, 'movie_lines.txt'),
        os.path.join(corpus_dir, 'movie_conversations.txt'),
        dpath,
    )

    build_data.mark_done(dpath, version_string=version)
Esempio n. 18
0
def build(opt):
    """Download the Google Schema-Guided Dialogue (SGD) dataset.

    For each split, fetches the schema file plus every numbered dialogue
    file counted in DATA_LEN.
    """
    # get path to data directory
    dpath = os.path.join(opt['datapath'], 'google_sgd')
    # define version if any
    version = "1.0"

    # check if data had been previously built
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')

        # make a clean directory if needed
        if build_data.built(dpath):
            # an older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data. Fixed: each URL must end with the file's own
        # name — previously the URL ended with a literal placeholder, so
        # every request hit the same invalid address.
        for split_type in ['train', 'dev', 'test']:
            outpath = os.path.join(dpath, split_type)
            build_data.make_dir(outpath)

            filename = 'schema.json'
            build_data.download(f'{ROOT_URL}/{split_type}/{filename}',
                                outpath, filename)
            for file_id in range(1, DATA_LEN[split_type] + 1):
                filename = f'dialogues_{file_id:03d}.json'
                build_data.download(f'{ROOT_URL}/{split_type}/{filename}',
                                    outpath, filename)

        # mark the data as built
        build_data.mark_done(dpath, version_string=version)
Esempio n. 19
0
def build(opt):
    """Download QuAC and convert its train/valid JSON to ParlAI format."""
    dpath = os.path.join(opt['datapath'], 'QuAC')
    version = VERSION

    if build_data.built(dpath, version_string=version):
        return
    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # remove an outdated build
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    for resource in RESOURCES:
        resource.download_file(dpath)

    # RESOURCES[0] feeds the train split, RESOURCES[1] the valid split.
    for resource, split in ((RESOURCES[0], 'train'), (RESOURCES[1], 'valid')):
        with PathManager.open(os.path.join(dpath, resource.file_name)) as f:
            make_parlai_format(dpath, split, json.load(f)['data'])

    build_data.mark_done(dpath, version_string=version)
Esempio n. 20
0
def build(opt):
    """Download the Taskmaster-1 (2019) dialogue JSON files."""
    # data directory and version tag
    dpath = os.path.join(opt['datapath'], 'taskmaster-1')
    version = "1.0"

    if build_data.built(dpath, version_string=version):
        return  # already built at this version
    print(f'[building data: {dpath}]')

    if build_data.built(dpath):
        # an older build exists; clean it out
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    base_url = 'https://storage.googleapis.com/dialog-data-corpus/TASKMASTER-1-2019/'
    for fname in ('self-dialogs.json', 'woz-dialogs.json'):
        build_data.download(base_url + fname, dpath, fname)

    build_data.mark_done(dpath, version_string=version)
Esempio n. 21
0
def buildImage(opt):
    """Download and unzip the COCO 2014 image archives (unversioned build)."""
    dpath = os.path.join(opt['datapath'], 'COCO-IMG')

    if build_data.built(dpath):
        return
    print(f'[building image data: {dpath}]')
    # unversioned: always start from an empty directory
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    url = 'http://msvocds.blob.core.windows.net/coco2014/'
    archives = ('train2014.zip', 'val2014.zip', 'test2014.zip')
    for archive in archives:
        build_data.download(url + archive, dpath, archive)
    for archive in archives:
        build_data.untar(dpath, archive)

    build_data.mark_done(dpath)
Esempio n. 22
0
def build(opt):
    """Fetch MS MARCO v1.1 and produce FB-format train/valid/test files."""
    dpath = os.path.join(opt['datapath'], 'MS_MARCO')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # discard the previous build
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    url = "https://msmarco.blob.core.windows.net/msmarco/"
    # local split name -> remote archive name
    remote_for = {
        'train': "train_v1.1.json.gz",
        'valid': "dev_v1.1.json.gz",
        'test': "test_public_v1.1.json.gz",
    }
    for split in ('train', 'valid', 'test'):
        build_data.download(url + remote_for[split], dpath, split + '.gz')

    for split in ('train', 'valid', 'test'):
        create_fb_format(dpath, split, os.path.join(dpath, split + '.gz'))

    build_data.mark_done(dpath, version_string=version)
Esempio n. 23
0
def build(datapath, use_history):
    """Build OpenSubtitles2018 (en, OPUS tarball); returns the build dir."""
    dpath = os.path.join(datapath, 'OpenSubtitles2018')
    if not use_history:
        # separate directory for the no-history variant
        dpath += '_no_history'
    version = '1'

    if not build_data.built(dpath, version_string=version):
        print(f'[building data: {dpath}]')
        if build_data.built(dpath):
            # outdated files present: remove and rebuild
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        xml_root = os.path.join(dpath, 'OpenSubtitles2018', 'xml', 'en')

        # Only download when the extracted tree is incomplete.
        if len(glob.glob(xml_root + '/*/*/*.xml.gz')) != NUM_SUBTITLES_FILES:
            url = (
                'http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2018/en.tar.gz'
            )
            build_data.download(url, dpath, 'OpenSubtitles2018.tar.gz')
            build_data.untar(dpath, 'OpenSubtitles2018.tar.gz')

        create_fb_format(xml_root, dpath, use_history)

        build_data.mark_done(dpath, version_string=version)
    return dpath
Esempio n. 24
0
def build(opt):
    """Download the COCO 2015 test image-info annotations."""
    dpath = os.path.join(opt['datapath'], 'COCO_2015_Caption')
    version = None

    if build_data.built(dpath, version_string=version):
        return  # already built
    print(f'[building data: {dpath}]')

    if build_data.built(dpath):
        # clean out an older build
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    archive = 'image_info_test2015.zip'
    build_data.download(
        'http://images.cocodataset.org/annotations/' + archive, dpath, archive
    )
    build_data.untar(dpath, archive)

    build_data.mark_done(dpath, version_string=version)
Esempio n. 25
0
def build(opt):
    """Download WikiQA and emit FB-format files plus '-filtered' variants."""
    dpath = os.path.join(opt['datapath'], 'WikiQA')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # an outdated build exists; remove it
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    archive = 'wikiqa.tar.gz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/wikiqa/' + archive,
        dpath, archive)
    build_data.untar(dpath, archive)

    dpext = os.path.join(dpath, 'WikiQACorpus')
    # Each plain split and its '-filtered' twin read the same source tsv
    # (filtering is presumably done inside create_fb_format — verify).
    source_for = {
        'train': 'WikiQA-train.tsv',
        'valid': 'WikiQA-dev.tsv',
        'test': 'WikiQA-test.tsv',
    }
    for split in ('train', 'valid', 'test',
                  'train-filtered', 'valid-filtered', 'test-filtered'):
        base = split.replace('-filtered', '')
        create_fb_format(dpath, split, os.path.join(dpext, source_for[base]))

    build_data.mark_done(dpath, version_string=version)
Esempio n. 26
0
def build(opt):
    """Build WMT en-de ParlAI files from the pre-downloaded raw text."""
    dpath = os.path.join(opt['datapath'], 'wmt')
    version = 'None'

    if build_data.built(dpath, version_string=version):
        return
    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # An older version exists; remove the outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    def write_pairs(out_name, sentence_pairs):
        # one "1 <en>\t<de>" line per sentence pair
        with open(os.path.join(dpath, out_name), 'w') as out:
            for de_sent, en_sent in sentence_pairs:
                out.write("1 " + en_sent + "\t" + de_sent + "\n")

    holdout = 30000  # validation pairs carved off the shuffled train set

    training = readFiles(dpath, ('train.en', 'train.de'))
    numpy.random.shuffle(training)
    write_pairs('en_de_valid.txt', training[:holdout])
    write_pairs('en_de_train.txt', training[holdout:])

    testing = readFiles(dpath, ('newstest2014.en', 'newstest2014.de'))
    write_pairs('en_de_test.txt', testing)

    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
Esempio n. 27
0
def build(opt):
    """Download WMT14 en-de files and merge each pair into one tab file."""
    dpath = os.path.join(opt['datapath'], 'wmt')
    version = 'None'

    if build_data.built(dpath, version_string=version):
        return
    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # a stale build exists: start over
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    url_base = 'https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/'
    pairs = [('train.en', 'train.de', 'en_de_train.txt'),
             ('newstest2014.en', 'newstest2014.de', 'en_de_test.txt')]
    for en_fname, de_fname, w_fname in pairs:
        build_data.download(url_base + en_fname, dpath, en_fname)
        build_data.download(url_base + de_fname, dpath, de_fname)

        # Strip the trailing newline from every source line.
        with open(os.path.join(dpath, en_fname), 'r') as f:
            en_lines = [line[:-1] for line in f]

        with open(os.path.join(dpath, de_fname), 'r') as f:
            de_lines = [line[:-1] for line in f]

        with open(os.path.join(dpath, w_fname), 'w') as out:
            for de_sent, en_sent in zip(de_lines, en_lines):
                out.write("1 " + en_sent + "\t" + de_sent + "\n")

    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
Esempio n. 28
0
def buildImage(opt):
    """Download and extract the COCO 2014 image zips from the parl.ai mirror."""
    dpath = os.path.join(opt['datapath'], 'COCO-IMG-2014')
    version = '1'

    if build_data.built(dpath, version_string=version):
        return
    print(f'[building image data: {dpath}]')
    if build_data.built(dpath):
        # remove files left by an older version
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    mirror = 'http://parl.ai/downloads/COCO-IMG/'
    zips = ('train2014.zip', 'val2014.zip', 'test2014.zip')
    for zname in zips:
        build_data.download(mirror + zname, dpath, zname)
    for zname in zips:
        build_data.untar(dpath, zname)

    build_data.mark_done(dpath, version_string=version)
Esempio n. 29
0
def buildImage(opt):
    """Fetch the COCO image archives from S3 and unzip them."""
    dpath = os.path.join(opt['datapath'], 'COCO-IMG')
    version = '1'

    if build_data.built(dpath, version_string=version):
        return
    print('[building image data: ' + dpath + ']')
    if build_data.built(dpath):
        # A stale, older build is present; wipe it before re-downloading.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Download every archive first, then extract each one.
    url = 'https://s3.amazonaws.com/fair-data/parlai/COCO-IMG/'
    zips = ['train2014.zip', 'val2014.zip', 'test2015.zip']
    for zipname in zips:
        build_data.download(url + zipname, dpath, zipname)
    for zipname in zips:
        build_data.untar(dpath, zipname)

    # Record the finished build so future calls are no-ops.
    build_data.mark_done(dpath, version_string=version)
Esempio n. 30
0
def build(datapath, use_history):
    """Build the OpenSubtitles2018 dataset and return its directory path."""
    dirname = 'OpenSubtitles2018' if use_history else 'OpenSubtitles2018_no_history'
    dpath = os.path.join(datapath, dirname)
    version = '1'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An outdated build exists; clear it out first.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        untar_path = os.path.join(dpath, 'OpenSubtitles', 'xml', 'it')

        # Only re-download when the expected XML file count is not present.
        if len(glob.glob(untar_path + '/*/*/*.xml')) != NUM_SUBTITLES_FILES:
            for downloadable_file in RESOURCES:
                downloadable_file.download_file(dpath)

        create_fb_format(untar_path, dpath, use_history)

        # Record a successful build.
        build_data.mark_done(dpath, version_string=version)
    return dpath
Esempio n. 31
0
def build(opt):
    """Download WebQuestions and convert its splits to FB dialog format."""
    dpath = os.path.join(opt['datapath'], 'WebQuestions')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An outdated build exists; clear it out first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Each split is a CodaLab bundle identified by its hash.
    bundles = [
        ('0x4a763f8cde224c2da592b75f29e2f5c2', 'train.json'),
        ('0xe7bac352fce7448c9ef238fb0a297ec2', 'test.json'),
    ]
    for bundle_id, out_name in bundles:
        url = ('https://worksheets.codalab.org/rest/bundles/' +
               bundle_id + '/contents/blob/')
        build_data.download(url, dpath, out_name)

    # The valid split reuses the train file, matching the original setup.
    create_fb_format(dpath, 'train', os.path.join(dpath, 'train.json'))
    create_fb_format(dpath, 'valid', os.path.join(dpath, 'train.json'))
    create_fb_format(dpath, 'test', os.path.join(dpath, 'test.json'))

    # Record a successful build.
    build_data.mark_done(dpath, version_string=version)
Esempio n. 32
0
def build(opt):
    """Download and extract the VQA v1 question/annotation archives.

    Fixes two defects in the previous version:
    - ``build_data.download`` was called with swapped arguments
      (``(dest_path, url)``); the signature used everywhere else in this
      file is ``(url, path, fname)``.
    - ``os.listdir(dpath)`` was called before the directory was guaranteed
      to exist, which raises ``FileNotFoundError`` on a fresh datapath.
    """
    dpath = os.path.join(opt['datapath'], 'VQA-v1')

    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        # Clear out any stale subdirectories from a previous partial build,
        # but only if the directory actually exists.
        if os.path.isdir(dpath):
            for item in os.listdir(dpath):
                item = os.path.join(dpath, item)
                if os.path.isdir(item):
                    build_data.remove_dir(item)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = 'Questions_Train_mscoco.zip'
        fname2 = 'Questions_Val_mscoco.zip'
        fname3 = 'Questions_Test_mscoco.zip'

        fname4 = 'Annotations_Val_mscoco.zip'
        fname5 = 'Annotations_Train_mscoco.zip'

        url = 'http://visualqa.org/data/mscoco/vqa/'
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)
        build_data.download(url + fname3, dpath, fname3)

        build_data.download(url + fname4, dpath, fname4)
        build_data.download(url + fname5, dpath, fname5)

        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)
        build_data.untar(dpath, fname3)
        build_data.untar(dpath, fname4)
        build_data.untar(dpath, fname5)

        # Mark the data as built.
        build_data.mark_done(dpath)
Esempio n. 33
0
def build(opt):
    """Download the MovieDialog corpus plus the task-4 reddit data."""
    dpath = os.path.join(opt['datapath'], 'MovieDialog')
    version = '1'

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    dpath2 = os.path.join(dpath, 'movie_dialog_dataset', 'task4_reddit')
    if build_data.built(dpath):
        # An outdated build exists; clear it out first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    build_data.make_dir(dpath2)

    # Main corpus archive.
    fname = 'moviedialog.tar.gz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/moviedialog/' + fname,
        dpath, fname)

    # Supplemental task-4 reddit data, hosted behind a tinyurl.
    build_data.download('http://tinyurl.com/' + 'p6tyohj',
                        dpath2, 'p6tyohj.tgz')

    build_data.untar(dpath, fname)
    build_data.untar(dpath2, 'p6tyohj.tgz')

    # Record a successful build.
    build_data.mark_done(dpath, version_string=version)
Esempio n. 34
0
def build(opt):
    """Download the CNN QA data from Google Drive and emit FB-format splits."""
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'QACNN')

    if build_data.built(dpath, version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An outdated build exists; clear it out first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # The archive lives on Google Drive rather than behind a plain URL.
    fname = 'cnn.tgz'
    build_data.download_from_google_drive('0BwmD_VLjROrfTTljRDVZMFJnVWM',
                                          os.path.join(dpath, fname))
    build_data.untar(dpath, fname)

    # Convert each questions subdirectory into FB dialog format.
    for split, subdir in [('train', 'training'),
                          ('valid', 'validation'),
                          ('test', 'test')]:
        create_fb_format(dpath, split,
                         os.path.join(dpath, 'cnn', 'questions', subdir))

    # Record a successful build.
    build_data.mark_done(dpath, version)
Esempio n. 35
0
def build(opt):
    """Download and unzip the VQA v2 question and annotation archives."""
    dpath = os.path.join(opt['datapath'], 'VQA-v2')

    if build_data.built(dpath):
        return
    print('[building data: ' + dpath + ']')
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Download every archive first, then extract each one.
    url = 'http://visualqa.org/data/mscoco/vqa/'
    zips = [
        'v2_Questions_Train_mscoco.zip',
        'v2_Questions_Val_mscoco.zip',
        'v2_Questions_Test_mscoco.zip',
        'v2_Annotations_Val_mscoco.zip',
        'v2_Annotations_Train_mscoco.zip',
    ]
    for zipname in zips:
        build_data.download(url + zipname, dpath, zipname)
    for zipname in zips:
        build_data.untar(dpath, zipname)

    # Record a successful build.
    build_data.mark_done(dpath)
Esempio n. 36
0
def build(opt):
    """Download WikiQA and write FB-format splits (plain and '-filtered')."""
    dpath = os.path.join(opt['datapath'], 'WikiQA')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An outdated build exists; clear it out first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    fname = 'wikiqa.tar.gz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/wikiqa/' + fname,
        dpath, fname)
    build_data.untar(dpath, fname)

    # Both the plain and '-filtered' outputs read the same source TSVs.
    dpext = os.path.join(dpath, 'WikiQACorpus')
    splits = [('train', 'WikiQA-train.tsv'),
              ('valid', 'WikiQA-dev.tsv'),
              ('test', 'WikiQA-test.tsv')]
    for split, tsv in splits:
        create_fb_format(dpath, split, os.path.join(dpext, tsv))
    for split, tsv in splits:
        create_fb_format(dpath, split + '-filtered', os.path.join(dpext, tsv))

    # Record a successful build.
    build_data.mark_done(dpath, version_string=version)
Esempio n. 37
0
def build(opt):
    """Download MCTest and convert the mc160/mc500 splits to FB format."""
    dpath = os.path.join(opt['datapath'], 'MCTest')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An outdated build exists; clear it out first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    fname = 'mctest.tar.gz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/mctest/' + fname,
        dpath, fname)
    build_data.untar(dpath, fname)

    # Only the test splits ship with a separate answer file.
    dpext = os.path.join(dpath, 'mctest')
    for size in ('160', '500'):
        base = 'mc' + size
        create_fb_format(dpath, 'train' + size,
                         os.path.join(dpext, 'MCTest', base + '.train'), None)
        create_fb_format(dpath, 'valid' + size,
                         os.path.join(dpext, 'MCTest', base + '.dev'), None)
        create_fb_format(dpath, 'test' + size,
                         os.path.join(dpext, 'MCTest', base + '.test'),
                         os.path.join(dpext, 'MCTestAnswers',
                                      base + '.test.ans'))

    # Record a successful build.
    build_data.mark_done(dpath, version_string=version)
Esempio n. 38
0
def build(opt):
    """Build the DailyDialog dataset by parsing a pre-downloaded archive.

    NOTE(review): the download/untar calls below are commented out, so this
    function assumes `ijcnlp_dailydialog` has already been fetched and
    unzipped into the data path manually (see the wget/unzip notes below).
    Confirm whether the download is intentionally manual.
    """
    data_path = os.path.join(opt['datapath'], 'DailyDialog')
    version = None

    if not build_data.built(data_path, version_string=version):
        print('[building data: ' + data_path + ']')

        if build_data.built(data_path):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(data_path)
        build_data.make_dir(data_path)

        fname = 'ijcnlp_dailydialog.zip'
        url = 'http://yanran.li/files/'

        # Download the data.
        # wget http://yanran.li/files/ijcnlp_dailydialog.zip
        # unzip ijcnlp_dailydialog.zip
        # unzip ijcnlp_dailydialog/*.zip

        #build_data.download(url, data_path, fname)
        #build_data.untar(data_path, fname)

        # Convert each raw split into the project's parsed format.
        parse_data(os.path.join(data_path, 'ijcnlp_dailydialog/train'),
                   data_path,
                   dataset='train')
        parse_data(os.path.join(data_path, 'ijcnlp_dailydialog/validation'),
                   data_path,
                   dataset='validation')
        parse_data(os.path.join(data_path, 'ijcnlp_dailydialog/test'),
                   data_path,
                   dataset='test')

        # Mark the data as built.
        build_data.mark_done(data_path, version_string=version)
Esempio n. 39
0
def build(opt):
    """Download the CNN QA corpus via Google Drive and build FB-format files."""
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'QACNN')

    if build_data.built(dpath, version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Remove leftovers of an older build.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Archive is distributed through Google Drive.
    archive = 'cnn.tgz'
    drive_id = '0BwmD_VLjROrfTTljRDVZMFJnVWM'
    build_data.download_from_google_drive(drive_id,
                                          os.path.join(dpath, archive))
    build_data.untar(dpath, archive)

    # Map output split names onto the extracted question directories.
    questions_root = os.path.join(dpath, 'cnn', 'questions')
    for out_split, src_dir in (('train', 'training'),
                               ('valid', 'validation'),
                               ('test', 'test')):
        create_fb_format(dpath, out_split,
                         os.path.join(questions_root, src_dir))

    # Record a successful build.
    build_data.mark_done(dpath, version)
Esempio n. 40
0
def build(opt):
    """Download the split gzipped Twitter corpus, join it, and parse it."""
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'Twitter')

    if build_data.built(dpath, version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Remove leftovers of an older build.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # The corpus is published as two gzip parts that must be concatenated.
    url = 'https://github.com/Marsan-Ma/chat_corpus/raw/master/'
    part_a = "twitter_en_big.txt.gz.partaa"
    part_b = "twitter_en_big.txt.gz.partab"
    build_data.download(url + part_a, dpath, part_a)
    build_data.download(url + part_b, dpath, part_b)

    outzipfile = os.path.join(dpath, "twitter_en_big.txt.gz")
    build_data.cat(os.path.join(dpath, part_a),
                   os.path.join(dpath, part_b),
                   outzipfile)

    import gzip
    with gzip.open(outzipfile, 'r') as f:
        file_content = bytes.decode(f.read())
    # Drop the first two header lines, then split into per-line records.
    data = file_content.split('\n')[2:]
    create_fb_format(data, dpath)
    os.remove(outzipfile)

    # Record a successful build.
    build_data.mark_done(dpath, version)
Esempio n. 41
0
def build(opt):
    """Download NarrativeQA, fetch its stories, and reorganize the files.

    The github archive only contains metadata; the stories themselves are
    fetched separately by `download_stories`, after which the CSVs are split
    into train/valid/test sets and files are moved into per-set folders.
    """
    dpath = os.path.join(opt['datapath'], 'NarrativeQA')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')

        if build_data.built(dpath):
            # an older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # download the data.
        fname = 'narrative_qa.zip'
        # dataset URL
        url = NARRATIVE_QA_DOWNLOAD_URL
        build_data.download(url, dpath, fname)

        # uncompress it
        build_data.untar(dpath, fname)

        print('downloading stories now')
        base_path = os.path.join(dpath, 'narrativeqa-master')

        # fetch the individual story texts (written into a 'tmp' folder).
        download_stories(base_path)

        # move from tmp to stories
        tmp_stories_path = os.path.join(base_path,
                                        'tmp')
        new_stories_path = os.path.join(base_path,
                                        'stories')
        shutil.move(tmp_stories_path, new_stories_path)

        # divide into train, valid and test for summaries
        summaries_csv_path = os.path.join(base_path, 'third_party',
                                          'wikipedia', 'summaries.csv')
        new_path = os.path.join(base_path, 'summaries.csv')
        shutil.move(summaries_csv_path, new_path)

        divide_csv_into_sets(new_path)

        # divide into sets for questions
        questions_path = os.path.join(base_path, 'qaps.csv')
        divide_csv_into_sets(questions_path)

        # divide into sets for documents
        documents_path = os.path.join(base_path, 'documents.csv')
        divide_csv_into_sets(documents_path)

        # move specific set's files into their set's folder
        make_folders(base_path)
        move_files(base_path)

        # move narrativeqa-master to narrative_qa
        new_path = os.path.join(dpath, 'narrative_qa')
        shutil.move(base_path, new_path)

        # mark the data as built
        build_data.mark_done(dpath, version_string=version)
Esempio n. 42
0
def build(opt):
    """Download VisDial v0.9 and repartition it into train/valid/test JSON.

    The official release only has train and val splits, so this carves 1000
    dialogs out of train (picked at a constant stride) to serve as the valid
    set, and repurposes the official val split as test.
    """
    version = 'v0.9'
    dpath = os.path.join(opt['datapath'], 'VisDial-v0.9')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')

        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = 'visdial_0.9_train.zip'
        fname2 = 'visdial_0.9_val.zip'

        url = 'https://computing.ece.vt.edu/~abhshkdz/data/visdial/'
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)

        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)

        print('processing unpacked files')
        # Use 1000 examples from training set as validation.
        json1 = os.path.join(dpath, fname1.rsplit('.', 1)[0] + '.json')
        with open(json1) as t_json:
            train_data = json.load(t_json)

        # Shallow-copy the metadata, but start with an empty dialog list.
        valid_data = train_data.copy()
        valid_data['data'] = train_data['data'].copy()
        valid_data['data']['dialogs'] = []

        # Use constant stride to pick examples.
        # Iterating indices in descending order keeps earlier indices valid
        # while dialogs are deleted from train_data in place.
        num_valid = 1000
        total = len(train_data['data']['dialogs'])
        step = total // (num_valid - 1)
        for i in range(total-1, 0, -step)[:num_valid]:
            valid_data['data']['dialogs'].append(train_data['data']['dialogs'][i])
            del train_data['data']['dialogs'][i]

        # Write the reduced train set and the carved-out valid set, then
        # drop the original combined file.
        train_json = json1.rsplit('.', 1)[0] + '_train.json'
        valid_json = json1.rsplit('.', 1)[0] + '_valid.json'
        with open(train_json, 'w') as t_out, open(valid_json, 'w') as v_out:
            json.dump(train_data, t_out)
            json.dump(valid_data, v_out)
        os.remove(json1)

        # Use validation data as test.
        json2 = os.path.join(dpath, fname2.rsplit('.', 1)[0] + '.json')
        test_json = json2.rsplit('.', 1)[0] + '_test.json'
        build_data.move(json2, test_json)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
Esempio n. 43
0
def build(opt):
    """Download and extract the TriviaQA reading-comprehension archive."""
    dpath = os.path.join(opt['datapath'], 'TriviaQA')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An outdated build exists; clear it out first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and extract the archive.
    fname = 'triviaqa-rc.tar.gz'
    build_data.download('http://nlp.cs.washington.edu/triviaqa/data/' + fname,
                        dpath, fname)
    build_data.untar(dpath, fname)

    # Record a successful build.
    build_data.mark_done(dpath, version_string=version)
Esempio n. 44
0
def build(opt):
    """Download and extract the dialog-bAbI tasks archive."""
    dpath = os.path.join(opt['datapath'], 'dialog-bAbI')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An outdated build exists; clear it out first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and extract the archive.
    fname = 'dialog_babi.tar.gz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/dialog_babi/' + fname,
        dpath, fname)
    build_data.untar(dpath, fname)

    # Record a successful build.
    build_data.mark_done(dpath, version_string=version)
Esempio n. 45
0
def build(opt):
    """Download and extract the Persona-Chat archive."""
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'Persona-Chat')

    if build_data.built(dpath, version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An outdated build exists; clear it out first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and extract the archive.
    fname = 'personachat.tgz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/personachat/' + fname,
        dpath, fname)
    build_data.untar(dpath, fname)

    # Record a successful build.
    build_data.mark_done(dpath, version)
Esempio n. 46
0
def build(opt):
    """Download and extract the personalized-dialog dataset from Dropbox."""
    dpath = os.path.join(opt['datapath'], 'personalized-dialog')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An outdated build exists; clear it out first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Dropbox direct-download link (?dl=1 forces the raw file).
    fname = 'personalized-dialog-dataset.tar.gz'
    build_data.download(
        'https://www.dropbox.com/s/4i9u4y24pt3paba/' + fname + '?dl=1',
        dpath, fname)
    build_data.untar(dpath, fname)

    # Record a successful build.
    build_data.mark_done(dpath, version_string=version)
Esempio n. 47
0
def build(opt):
    """Download the SQuAD v1.1 train and dev JSON files."""
    dpath = os.path.join(opt['datapath'], 'SQuAD')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An outdated build exists; clear it out first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # The JSON files are used as-is; there is no extraction step.
    url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'
    for fname in ('train-v1.1.json', 'dev-v1.1.json'):
        build_data.download(url + fname, dpath, fname)

    # Record a successful build.
    build_data.mark_done(dpath, version_string=version)
Esempio n. 48
0
def build(opt):
    """Download the OpenSubtitles English dump and convert it to FB format."""
    dpath = os.path.join(opt['datapath'], 'OpenSubtitles')
    version = '1'

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An outdated build exists; clear it out first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch the English tarball from the OPUS mirror and extract it.
    url = ('http://opus.lingfil.uu.se/download.php?f=OpenSubtitles/en.tar.gz')
    archive = 'OpenSubtitles.tar.gz'
    build_data.download(url, dpath, archive)
    build_data.untar(dpath, archive)

    create_fb_format(os.path.join(dpath, 'OpenSubtitles', 'en'), dpath)

    # Record a successful build.
    build_data.mark_done(dpath, version_string=version)
Esempio n. 49
0
def build(opt):
    """Download the DialogueQE release and rename its JSON files."""
    data_path = os.path.join(opt['datapath'], 'DialogueQE')
    version = '1501534800'

    if build_data.built(data_path, version_string=version):
        return
    print('[building data: ' + data_path + ']')
    if build_data.built(data_path):
        # Remove leftovers of an older build.
        build_data.remove_dir(data_path)
    build_data.make_dir(data_path)

    fname = 'data_' + version + '.tar.gz'
    url = ('https://raw.githubusercontent.com/deepmipt/turing-data/master/'
           + fname)
    build_data.download(url, data_path, fname)
    build_data.untar(data_path, fname)

    # Strip the version stamp from the extracted file names.
    for split in ('train', 'test'):
        os.rename(
            os.path.join(data_path, 'data_' + split + '_' + version + '.json'),
            os.path.join(data_path, split + '.json'))

    build_data.mark_done(data_path, version_string=version)
Esempio n. 50
0
def build(opt):
    """Download and extract the CLEVR v1.0 archive."""
    dpath = os.path.join(opt['datapath'], 'CLEVR')
    version = 'v1.0'

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An outdated build exists; clear it out first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and extract the archive.
    fname = 'CLEVR_v1.0.zip'
    build_data.download('https://s3-us-west-1.amazonaws.com/clevr/' + fname,
                        dpath, fname)
    build_data.untar(dpath, fname)

    # Record a successful build.
    build_data.mark_done(dpath, version_string=version)
Esempio n. 51
0
def build(opt):
    """Download DBLL; depends on WikiMovies, which is built first."""
    # DBLL builds on top of the wikimovies dataset.
    wikimovies_build.build(opt)

    dpath = os.path.join(opt['datapath'], 'DBLL')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An outdated build exists; clear it out first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and extract the archive.
    fname = 'dbll.tgz'
    build_data.download('https://s3.amazonaws.com/fair-data/parlai/dbll/' + fname,
                        dpath, fname)
    build_data.untar(dpath, fname)

    # Record a successful build.
    build_data.mark_done(dpath, version_string=version)
Esempio n. 52
0
def build(opt):
    """Download and extract the COPA resources archive."""
    dpath = os.path.join(opt['datapath'], 'COPA')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An outdated build exists; clear it out first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and extract the archive.
    fname = 'COPA-resources.tgz'
    build_data.download('http://people.ict.usc.edu/~gordon/downloads/' + fname,
                        dpath, fname)
    build_data.untar(dpath, fname)

    # Record a successful build.
    build_data.mark_done(dpath, version_string=version)
Esempio n. 53
0
def build(opt):
    """Download the end-to-end-negotiator repo archive from GitHub."""
    dpath = os.path.join(opt['datapath'], 'negotiation')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An outdated build exists; clear it out first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # The data ships inside the project's GitHub master archive.
    fname = 'negotiation.zip'
    url = ('https://github.com/facebookresearch/end-to-end-negotiator/'
           'archive/master.zip')
    print('[downloading data from: ' + url + ']')
    build_data.download(url, dpath, fname)
    build_data.untar(dpath, fname)

    # Record a successful build.
    build_data.mark_done(dpath, version_string=version)
Esempio n. 54
0
def build(opt):
    """Download InsuranceQA from GitHub and parse both corpus versions."""
    dpath = os.path.join(opt['datapath'], 'InsuranceQA')
    version = '1'

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An outdated build exists; clear it out first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # The data ships inside the project's GitHub master archive.
    fname = 'insuranceqa.zip'
    url = 'https://github.com/shuzi/insuranceQA/archive/master.zip'
    print('[downloading data from: ' + url + ']')
    build_data.download(url, dpath, fname)
    build_data.untar(dpath, fname)

    # Parse both the V1 and V2 releases of the corpus.
    ParseInsuranceQAV1.build(dpath)
    ParseInsuranceQAV2.build(dpath)

    # Record a successful build.
    build_data.mark_done(dpath, version_string=version)
Esempio n. 55
0
def build(opt):
    """Download the SCAN tasks and emit FB-format train/valid/test files.

    Fix: removed a dead local (``ext = os.path.join('dailymail',
    'questions')``) that was never used — a leftover from the dailymail
    build script this was copied from.
    """
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'SCAN')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'scan.tgz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/scan/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # NOTE(review): 'valid' reuses the training file — confirm intended.
        create_fb_format(dpath, 'train', os.path.join(dpath, 'tasks_train_simple.txt'))
        create_fb_format(dpath, 'valid', os.path.join(dpath, 'tasks_train_simple.txt'))
        create_fb_format(dpath, 'test', os.path.join(dpath, 'tasks_test_simple.txt'))

        # Mark the data as built.
        build_data.mark_done(dpath, version)
Esempio n. 56
0
def build(opt):
    """Download the Cornell Movie Dialogs corpus and convert it to FB format."""
    dpath = os.path.join(opt['datapath'], 'CornellMovie')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An outdated build exists; clear it out first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and extract the archive.
    fname = 'cornell_movie_dialogs_corpus.zip'
    build_data.download('http://www.mpi-sws.org/~cristian/data/' + fname,
                        dpath, fname)
    build_data.untar(dpath, fname)

    # The extracted directory name contains spaces, as shipped upstream.
    dpext = os.path.join(dpath, 'cornell movie-dialogs corpus')
    create_fb_format(os.path.join(dpext, 'movie_lines.txt'),
                     os.path.join(dpext, 'movie_conversations.txt'),
                     dpath)

    # Record a successful build.
    build_data.mark_done(dpath, version_string=version)