def build(opt):
    """Build the WMT en-de ParlAI text files from pre-fetched raw files.

    Shuffles the train pairs, holds out the first 30k for validation, and
    writes each example as "1 <en>\t<de>".
    """
    dpath = os.path.join(opt['datapath'], 'wmt')
    version = 'None'
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        def _write_pairs(out_name, pairs):
            # One FB-format example per line.
            with open(os.path.join(dpath, out_name), 'w') as f:
                for de_sent, en_sent in pairs:
                    f.write('1 ' + en_sent + '\t' + de_sent + '\n')

        train_pairs = readFiles(dpath, ('train.en', 'train.de'))
        numpy.random.shuffle(train_pairs)
        # First 30k shuffled pairs become the validation split.
        _write_pairs('en_de_valid.txt', train_pairs[:30000])
        _write_pairs('en_de_train.txt', train_pairs[30000:])
        test_pairs = readFiles(dpath, ('newstest2014.en', 'newstest2014.de'))
        _write_pairs('en_de_test.txt', test_pairs)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download and extract the COCO 2014 caption dataset if not already built."""
    dpath = os.path.join(opt['datapath'], 'COCO_2014_Caption')
    version = '1.0'
    # Nothing to do when this version is already on disk.
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An outdated build exists: start from a clean directory.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Fetch and unpack the caption archive.
    archive = 'dataset_coco.tgz'
    build_data.download('http://parl.ai/downloads/coco_caption/' + archive, dpath, archive)
    build_data.untar(dpath, archive)
    # mark the data as built
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the two-part gzipped Twitter corpus, join it, and convert to FB format."""
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'Twitter')
    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # The corpus is split into two gzip parts on GitHub.
        base_url = 'https://github.com/Marsan-Ma/chat_corpus/raw/master/'
        parts = ["twitter_en_big.txt.gz.partaa", "twitter_en_big.txt.gz.partab"]
        for part in parts:
            build_data.download(base_url + part, dpath, part)
        joined = os.path.join(dpath, "twitter_en_big.txt.gz")
        build_data.cat(os.path.join(dpath, parts[0]), os.path.join(dpath, parts[1]), joined)
        import gzip
        with gzip.open(joined, 'r') as f:
            text = bytes.decode(f.read())
        # The first two lines are headers, not dialogue.
        create_fb_format(text.split('\n')[2:], dpath)
        os.remove(joined)
        # Mark the data as built.
        build_data.mark_done(dpath, version)
def build(opt):
    """Download Natural Questions (open) resources and inflate any gzipped files."""
    dpath = os.path.join(opt['datapath'], 'NaturalQuestionsOpen')
    version = str(VERSION)
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    for resource in RESOURCES:
        resource.download_file(dpath)
        if ".gz" not in resource.file_name:
            continue
        # Inflate foo.gz -> foo next to the downloaded archive.
        src = os.path.join(dpath, resource.file_name)
        dst = os.path.join(dpath, resource.file_name[:-3])
        with gzip.open(src, 'rb') as fin, open(dst, 'wb') as fout:
            shutil.copyfileobj(fin, fout)
    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def build(opt): """Prepares datasets and other dependencies for NERTeacher""" version = '1.1' dpath = os.path.join(opt['datapath'], 'ner') # check if data had been previously built raw_path = os.path.abspath(opt['raw_dataset_path'] or ".") if len([f for f in os.listdir(raw_path) if f.endswith(".iob")]) == 0: if not build_data.built(dpath, version_string=version): print('[target data path: ' + dpath + ']') # make a clean directory if needed if build_data.built(dpath): # an older version exists, so remove these outdated files. build_data.remove_dir(dpath) build_data.make_dir(dpath) ds_path = os.environ.get('DATASETS_URL') file_name = 'gareev.tar.gz' if not ds_path: raise RuntimeError( "Looks like the `DATASETS_URL` variable is set incorrectly" ) print('Trying to download a dataset %s from the repository' % file_name) url = urllib.parse.urljoin(ds_path, file_name) build_data.download(url, dpath, file_name) build_data.untar(dpath, file_name) print('Downloaded a %s dataset' % file_name) # mark the data as built build_data.mark_done(dpath, version_string=version) opt['raw_dataset_path'] = dpath print("Use dataset from path: %s" % repr(opt['raw_dataset_path'])) create_heap_file(opt['raw_dataset_path'])
def build(datapath, use_history):
    """Build English OpenSubtitles2018, downloading the OPUS zip only when the
    extracted xml tree is incomplete. Returns the build directory."""
    dpath = os.path.join(datapath, 'OpenSubtitles2018')
    if not use_history:
        dpath += '_no_history'
    version = '1'
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        untar_path = os.path.join(dpath, 'OpenSubtitles', 'xml', 'en')
        extracted = len(glob.glob(untar_path + '/*/*/*.xml'))
        if extracted != NUM_SUBTITLES_FILES:
            # Extraction is missing or partial: fetch and unpack the archive.
            zip_name = 'OpenSubtitles2018.zip'
            build_data.download(
                'https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/xml/en.zip',
                dpath,
                zip_name,
            )
            build_data.untar(dpath, zip_name)
        create_fb_format(untar_path, dpath, use_history)
        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
    return dpath
def build(opt, subtask=None):
    """Prepare the local Reddit pickle for ``subtask`` in ParlAI format.

    Nothing is downloaded: the pickle is read from ``$HOME/data/``. When
    ``subtask`` is None, ``anime.pickle`` is used.
    """
    # BUG FIX: os.path.join(..., None) raises TypeError, so only append the
    # subtask path component when a subtask was actually supplied.
    if subtask:
        dpath = os.path.join(opt['datapath'], 'Reddit', subtask)
    else:
        dpath = os.path.join(opt['datapath'], 'Reddit')
    # check if data had been previously built
    if not build_data.built(dpath, version_string=subtask):
        print('[building data: ' + dpath + ']')
        # make a clean directory if needed
        if build_data.built(dpath):
            # an older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # don't download the data: it lives in the user's home directory.
        fname = os.environ['HOME'] + '/data/anime.pickle'
        if subtask:
            fname = os.environ['HOME'] + '/data/' + subtask + '.pickle'
        with open(fname, 'rb') as f:
            data = pickle.load(f)
        create_fb_format_by_link(data, dpath, subtask)
        # mark the data as built
        build_data.mark_done(dpath, version_string=subtask)
def build(opt):
    """Download the Twitter corpus resources, join the gzip parts in memory,
    and convert the dialogue lines to FB format."""
    version = 'v1.1'
    dpath = os.path.join(opt['datapath'], 'Twitter')
    if build_data.built(dpath, version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    for resource in RESOURCES:
        resource.download_file(dpath)
    part_paths = [os.path.join(dpath, RESOURCES[i].file_name) for i in (0, 1)]
    # Concatenate both gzip parts into one in-memory buffer before inflating.
    buf = io.BytesIO()
    for path in part_paths:
        with PathManager.open(path, 'rb') as rawf:
            buf.write(rawf.read())
    with gzip.GzipFile(fileobj=io.BytesIO(buf.getvalue())) as f:
        text = bytes.decode(f.read())
    # The first two lines are headers, not data.
    create_fb_format(text.split('\n')[2:], dpath)
    for path in part_paths:
        PathManager.rm(path)
    # Mark the data as built.
    build_data.mark_done(dpath, version)
def build(opt):
    """Download the ConvAI2 wild-evaluation logs and convert them to ParlAI format."""
    version = '0.2'
    dpath = os.path.join(opt['datapath'], 'ConvAI2_wild_evaluation')
    if build_data.built(dpath, version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    for resource in RESOURCES:
        resource.download_file(dpath)
    output_path = os.path.join(dpath, 'convai2_wild_evaluation.json')
    with open(output_path, 'r') as data_f:
        payload = json.load(data_f)
    make_parlai_format(payload, dpath)
    # The raw JSON is no longer needed after conversion.
    os.remove(output_path)
    # Mark the data as built.
    build_data.mark_done(dpath, version)
def build(opt):
    """Download MCTest and emit FB-format splits for the 160- and 500-story sets."""
    dpath = os.path.join(opt['datapath'], 'MCTest')
    version = None
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Download the data.
        fname = 'mctest.tar.gz'
        build_data.download('http://parl.ai/downloads/mctest/' + fname, dpath, fname)
        build_data.untar(dpath, fname)
        dpext = os.path.join(dpath, 'mctest')
        for size in ('160', '500'):
            base = os.path.join(dpext, 'MCTest', 'mc' + size)
            # Only the test split ships with a separate answers file.
            ans = os.path.join(dpext, 'MCTestAnswers', 'mc' + size + '.test.ans')
            create_fb_format(dpath, 'train' + size, base + '.train', None)
            create_fb_format(dpath, 'valid' + size, base + '.dev', None)
            create_fb_format(dpath, 'test' + size, base + '.test', ans)
        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download and unpack the VQA v1 questions/annotations for COCO 2014,
    then build the associated images."""
    dpath = opt['datapath'] + "/VQA-COCO2014/"
    if not build_data.built(dpath):
        print("[building data: " + dpath + "]")
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Download the data.
        fname1 = "Questions_Train_mscoco.zip"
        fname2 = "Questions_Val_mscoco.zip"
        fname3 = "Questions_Test_mscoco.zip"
        fname4 = "Annotations_Val_mscoco.zip"
        fname5 = "Annotations_Train_mscoco.zip"
        url = "http://visualqa.org/data/mscoco/vqa/"
        # NOTE(review): these calls pass (dpath, url) — the opposite argument
        # order from the build_data.download(url, dpath, fname) convention used
        # elsewhere in this file; presumably an older build_data API — confirm
        # against the build_data version in use before changing.
        build_data.download(dpath, url + fname1)
        build_data.download(dpath, url + fname2)
        build_data.download(dpath, url + fname3)
        build_data.download(dpath, url + fname4)
        build_data.download(dpath, url + fname5)
        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)
        build_data.untar(dpath, fname3)
        build_data.untar(dpath, fname4)
        build_data.untar(dpath, fname5)
        # Fetch the COCO image data as well.
        buildImage(dpath)
        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Build a LIGHT dialogue dataset directory specific to the current flag settings."""
    dpath, version = download(opt)
    if 'light_use_speech_prefix' not in opt:
        opt['light_use_speech_prefix'] = True
    # create particular instance of dataset depending on flags..
    fields = [
        'taskname',
        'setting',
        'objects',
        'person_names',
        'persona',
        'emote',
        'speech',
        'action',
        'affordances',
        'repeat',
        'cands',
        'current_self_output',
        'clip_cands',
        'speech_prefix',
    ]
    # Encode every 'light_use_*' flag value into the directory name so each
    # flag combination gets its own cached build.
    fpath = ''
    for f in fields:
        fpath += f + str(opt['light_use_' + f]) + "_"
    # [:-1] strips the trailing underscore.
    dpath2 = os.path.join(opt['datapath'], 'light_dialogue', fpath[:-1])
    if not build_data.built(dpath2, version):
        if build_data.built(dpath2):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath2)
        build_data.make_dir(dpath2)
        fname = 'light_data.pkl'
        fname2 = 'light_unseen_data.pkl'
        build_from_db(opt, dpath, dpath2, fname, fname2)
        # Mark the data as built.
        build_data.mark_done(dpath2, version)
def build(opt):
    """Download and extract the VQA v2 questions and annotations."""
    dpath = os.path.join(opt['datapath'], 'VQA-v2')
    version = None
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    url = 'http://visualqa.org/data/mscoco/vqa/'
    archives = [
        'v2_Questions_Train_mscoco.zip',
        'v2_Questions_Val_mscoco.zip',
        'v2_Questions_Test_mscoco.zip',
        'v2_Annotations_Val_mscoco.zip',
        'v2_Annotations_Train_mscoco.zip',
    ]
    # Fetch all archives first, then unpack them in the same order.
    for name in archives:
        build_data.download(url + name, dpath, name)
    for name in archives:
        build_data.untar(dpath, name)
    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the HotpotQA train/dev JSON files and convert each to ParlAI format."""
    dpath = os.path.join(opt['datapath'], 'HotpotQA')
    if build_data.built(dpath, version_string=VERSION):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # (filename, parlai split name) for every file we need.
    splits = [
        (TRAIN_FILENAME, 'train'),
        (DEV_DISTRACTOR_FILENAME, 'valid_distractor'),
        (DEV_FULLWIKI_FILENAME, 'valid_fullwiki'),
    ]
    for fname, _ in splits:
        build_data.download(URL + fname, dpath, fname)
    for fname, split in splits:
        with open(os.path.join(dpath, fname)) as f:
            make_parlai_format(dpath, split, json.load(f))
    # Mark the data as built.
    build_data.mark_done(dpath, version_string=VERSION)
def build(opt):
    """Download MS MARCO v1.1 and write FB-format train/valid/test files."""
    dpath = os.path.join(opt['datapath'], 'MS_MARCO')
    version = None
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        url = "https://msmarco.blob.core.windows.net/msmarco/"
        # (remote archive name, local split name)
        remote_to_split = [
            ("train_v1.1.json.gz", 'train'),
            ("dev_v1.1.json.gz", 'valid'),
            ("test_public_v1.1.json.gz", 'test'),
        ]
        for remote, split in remote_to_split:
            build_data.download(url + remote, dpath, split + '.gz')
        for _, split in remote_to_split:
            create_fb_format(dpath, split, os.path.join(dpath, split + '.gz'))
        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the SCAN compositional-generalization dataset and emit
    FB-format train/valid/test files."""
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'SCAN')
    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Download the data.
        fname = 'scan.tgz'
        url = 'http://parl.ai/downloads/scan/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)
        # NOTE(review): 'valid' is built from the *train* file, so valid is a
        # copy of train — presumably because SCAN ships no separate dev split;
        # confirm this is intentional before changing.
        create_fb_format(dpath, 'train', os.path.join(dpath, 'tasks_train_simple.txt'))
        create_fb_format(dpath, 'valid', os.path.join(dpath, 'tasks_train_simple.txt'))
        create_fb_format(dpath, 'test', os.path.join(dpath, 'tasks_test_simple.txt'))
        # Mark the data as built.
        build_data.mark_done(dpath, version)
def build(opt):
    """Download the Cornell Movie-Dialogs corpus and convert it to FB format."""
    dpath = os.path.join(opt['datapath'], 'CornellMovie')
    version = None
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    archive = 'cornell_movie_dialogs_corpus.tgz'
    build_data.download('http://parl.ai/downloads/cornell_movie/' + archive, dpath, archive)
    build_data.untar(dpath, archive)
    # The archive extracts to a directory with spaces in its name.
    corpus_dir = os.path.join(dpath, 'cornell movie-dialogs corpus')
    create_fb_format(
        os.path.join(corpus_dir, 'movie_lines.txt'),
        os.path.join(corpus_dir, 'movie_conversations.txt'),
        dpath,
    )
    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the Google Schema-Guided Dialogue (SGD) dataset.

    Fetches schema.json plus every numbered dialogues file for each of the
    train/dev/test splits.
    """
    # get path to data directory
    dpath = os.path.join(opt['datapath'], 'google_sgd')
    # define version if any
    version = "1.0"
    # check if data had been previously built
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        # make a clean directory if needed
        if build_data.built(dpath):
            # an older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Download the data.
        for split_type in ['train', 'dev', 'test']:
            outpath = os.path.join(dpath, split_type)
            build_data.make_dir(outpath)
            # BUG FIX: the download URL previously did not interpolate the
            # filename, so every request hit the same bogus URL. Each URL
            # must end with the file being fetched.
            filename = 'schema.json'
            build_data.download(f'{ROOT_URL}/{split_type}/{filename}', outpath, filename)
            for file_id in range(1, DATA_LEN[split_type] + 1):
                filename = f'dialogues_{file_id:03d}.json'
                build_data.download(f'{ROOT_URL}/{split_type}/{filename}', outpath, filename)
        # mark the data as built
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download QuAC and convert the train/dev JSON files into ParlAI format."""
    dpath = os.path.join(opt['datapath'], 'QuAC')
    version = VERSION
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    for resource in RESOURCES:
        resource.download_file(dpath)
    # First resource is the train file, second is the dev/valid file.
    for resource, split in zip(RESOURCES, ('train', 'valid')):
        with PathManager.open(os.path.join(dpath, resource.file_name)) as f:
            make_parlai_format(dpath, split, json.load(f)['data'])
    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the Taskmaster-1 self-dialog and WOz-dialog JSON files."""
    # get path to data directory
    dpath = os.path.join(opt['datapath'], 'taskmaster-1')
    version = "1.0"
    # check if data had been previously built
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    # make a clean directory if needed
    if build_data.built(dpath):
        # an older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Both corpus files live in the same GCS bucket.
    base_url = 'https://storage.googleapis.com/dialog-data-corpus/TASKMASTER-1-2019/'
    for fname in ('self-dialogs.json', 'woz-dialogs.json'):
        build_data.download(base_url + fname, dpath, fname)
    # mark the data as built
    build_data.mark_done(dpath, version_string=version)
def buildImage(opt):
    """Download and extract the COCO 2014 image archives (train/val/test)."""
    dpath = os.path.join(opt['datapath'], 'COCO-IMG')
    if not build_data.built(dpath):
        print('[building image data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # download the image data.
        url = 'http://msvocds.blob.core.windows.net/coco2014/'
        archives = ('train2014.zip', 'val2014.zip', 'test2014.zip')
        for archive in archives:
            build_data.download(url + archive, dpath, archive)
        for archive in archives:
            build_data.untar(dpath, archive)
        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Fetch MS MARCO v1.1 and produce FB-format train/valid/test files."""
    dpath = os.path.join(opt['datapath'], 'MS_MARCO')
    version = None
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Remove files left over from an outdated build.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Download the data
    base = "https://msmarco.blob.core.windows.net/msmarco/"
    build_data.download(base + "train_v1.1.json.gz", dpath, 'train.gz')
    build_data.download(base + "dev_v1.1.json.gz", dpath, 'valid.gz')
    build_data.download(base + "test_public_v1.1.json.gz", dpath, 'test.gz')
    for split in ("train", "valid", "test"):
        create_fb_format(dpath, split, os.path.join(dpath, split + '.gz'))
    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def build(datapath, use_history):
    """Build English OpenSubtitles2018 from the OPUS tarball, downloading only
    when the gzipped xml tree is incomplete. Returns the build directory."""
    dpath = os.path.join(datapath, 'OpenSubtitles2018')
    if not use_history:
        dpath += '_no_history'
    version = '1'
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        untar_path = os.path.join(dpath, 'OpenSubtitles2018', 'xml', 'en')
        have = len(glob.glob(untar_path + '/*/*/*.xml.gz'))
        if have != NUM_SUBTITLES_FILES:
            # Extraction incomplete: fetch and unpack the tarball.
            tar_name = 'OpenSubtitles2018.tar.gz'
            build_data.download(
                'http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2018/en.tar.gz',
                dpath,
                tar_name,
            )
            build_data.untar(dpath, tar_name)
        create_fb_format(untar_path, dpath, use_history)
        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
    return dpath
def build(opt):
    """Download and extract the COCO 2015 test image-info archive."""
    dpath = os.path.join(opt['datapath'], 'COCO_2015_Caption')
    version = None
    # check if data had been previously built
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    # make a clean directory if needed
    if build_data.built(dpath):
        # an older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Fetch and unpack the annotations archive.
    archive = 'image_info_test2015.zip'
    build_data.download('http://images.cocodataset.org/annotations/' + archive, dpath, archive)
    build_data.untar(dpath, archive)
    # mark the data as built
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download WikiQA and emit both plain and '-filtered' FB-format splits."""
    dpath = os.path.join(opt['datapath'], 'WikiQA')
    version = None
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Download the data.
        fname = 'wikiqa.tar.gz'
        build_data.download(
            'https://s3.amazonaws.com/fair-data/parlai/wikiqa/' + fname, dpath, fname
        )
        build_data.untar(dpath, fname)
        dpext = os.path.join(dpath, 'WikiQACorpus')
        splits = [
            ('train', 'WikiQA-train.tsv'),
            ('valid', 'WikiQA-dev.tsv'),
            ('test', 'WikiQA-test.tsv'),
        ]
        # Plain splits first, then the filtered variants of the same files.
        for split, tsv in splits:
            create_fb_format(dpath, split, os.path.join(dpext, tsv))
        for split, tsv in splits:
            create_fb_format(dpath, split + '-filtered', os.path.join(dpext, tsv))
        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Convert pre-fetched WMT en-de raw files into ParlAI train/valid/test text."""
    dpath = os.path.join(opt['datapath'], 'wmt')
    version = 'None'
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    pairs = readFiles(dpath, ('train.en', 'train.de'))
    numpy.random.shuffle(pairs)
    # First 30k shuffled pairs -> valid, remainder -> train.
    outputs = [
        ('en_de_valid.txt', pairs[:30000]),
        ('en_de_train.txt', pairs[30000:]),
    ]
    for out_name, rows in outputs:
        with open(os.path.join(dpath, out_name), 'w') as f:
            for de_sent, en_sent in rows:
                f.write("1 " + en_sent + "\t" + de_sent + "\n")
    test_pairs = readFiles(dpath, ('newstest2014.en', 'newstest2014.de'))
    with open(os.path.join(dpath, 'en_de_test.txt'), 'w') as f:
        for de_sent, en_sent in test_pairs:
            f.write("1 " + en_sent + "\t" + de_sent + "\n")
    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download WMT14 en-de from the Stanford NMT mirror and write paired text files."""
    dpath = os.path.join(opt['datapath'], 'wmt')
    version = 'None'
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    url_base = 'https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/'
    file_sets = [
        ('train.en', 'train.de', 'en_de_train.txt'),
        ('newstest2014.en', 'newstest2014.de', 'en_de_test.txt'),
    ]
    for en_fname, de_fname, w_fname in file_sets:
        build_data.download(url_base + en_fname, dpath, en_fname)
        build_data.download(url_base + de_fname, dpath, de_fname)
        # Strip the trailing newline from every sentence.
        with open(os.path.join(dpath, en_fname), 'r') as f:
            en_lines = [line[:-1] for line in f]
        with open(os.path.join(dpath, de_fname), 'r') as f:
            de_lines = [line[:-1] for line in f]
        with open(os.path.join(dpath, w_fname), 'w') as f:
            for de_sent, en_sent in zip(de_lines, en_lines):
                f.write("1 " + en_sent + "\t" + de_sent + "\n")
    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def buildImage(opt):
    """Download and extract the COCO 2014 image archives from parl.ai."""
    dpath = os.path.join(opt['datapath'], 'COCO-IMG-2014')
    version = '1'
    if build_data.built(dpath, version_string=version):
        return
    print('[building image data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Download the image data, then unpack in the same order.
    url = 'http://parl.ai/downloads/COCO-IMG/'
    archives = ('train2014.zip', 'val2014.zip', 'test2014.zip')
    for archive in archives:
        build_data.download(url + archive, dpath, archive)
    for archive in archives:
        build_data.untar(dpath, archive)
    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def buildImage(opt):
    """Download and extract the COCO image archives (train/val 2014, test 2015)."""
    dpath = os.path.join(opt['datapath'], 'COCO-IMG')
    version = '1'
    if build_data.built(dpath, version_string=version):
        return
    print('[building image data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Download the image data.
    url = 'https://s3.amazonaws.com/fair-data/parlai/COCO-IMG/'
    archives = ('train2014.zip', 'val2014.zip', 'test2015.zip')
    for archive in archives:
        build_data.download(url + archive, dpath, archive)
    for archive in archives:
        build_data.untar(dpath, archive)
    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def build(datapath, use_history):
    """Build Italian OpenSubtitles2018, downloading resources only when the
    extracted xml tree is incomplete. Returns the build directory."""
    dpath = os.path.join(datapath, 'OpenSubtitles2018')
    if not use_history:
        dpath += '_no_history'
    version = '1'
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        untar_path = os.path.join(dpath, 'OpenSubtitles', 'xml', 'it')
        if len(glob.glob(untar_path + '/*/*/*.xml')) != NUM_SUBTITLES_FILES:
            # Extraction incomplete: fetch every declared resource.
            for resource in RESOURCES:
                resource.download_file(dpath)
        create_fb_format(untar_path, dpath, use_history)
        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
    return dpath
def build(opt):
    """Download WebQuestions from CodaLab and emit FB-format splits."""
    dpath = os.path.join(opt['datapath'], 'WebQuestions')
    version = None
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Download the two CodaLab bundle blobs.
    bundle_base = 'https://worksheets.codalab.org/rest/bundles/'
    build_data.download(
        bundle_base + '0x4a763f8cde224c2da592b75f29e2f5c2/contents/blob/',
        dpath,
        'train.json',
    )
    build_data.download(
        bundle_base + '0xe7bac352fce7448c9ef238fb0a297ec2/contents/blob/',
        dpath,
        'test.json',
    )
    # Note: 'valid' is built from the same file as 'train' (no dev split ships).
    create_fb_format(dpath, 'train', os.path.join(dpath, 'train.json'))
    create_fb_format(dpath, 'valid', os.path.join(dpath, 'train.json'))
    create_fb_format(dpath, 'test', os.path.join(dpath, 'test.json'))
    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download and unpack the VQA v1 questions and annotations."""
    dpath = os.path.join(opt['datapath'], 'VQA-v1')
    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        # Only subdirectories are wiped here; top-level files (e.g. previously
        # downloaded zips) are left in place.
        for item in os.listdir(dpath):
            item = os.path.join(dpath, item)
            if os.path.isdir(item):
                build_data.remove_dir(item)
        build_data.make_dir(dpath)
        # Download the data.
        fname1 = 'Questions_Train_mscoco.zip'
        fname2 = 'Questions_Val_mscoco.zip'
        fname3 = 'Questions_Test_mscoco.zip'
        fname4 = 'Annotations_Val_mscoco.zip'
        fname5 = 'Annotations_Train_mscoco.zip'
        url = 'http://visualqa.org/data/mscoco/vqa/'
        # NOTE(review): arguments are (destination path, url) — reversed from
        # the build_data.download(url, dpath, fname) convention used elsewhere
        # in this file; presumably an older build_data API — verify before
        # modifying.
        build_data.download(os.path.join(dpath, fname1), url + fname1)
        build_data.download(os.path.join(dpath, fname2), url + fname2)
        build_data.download(os.path.join(dpath, fname3), url + fname3)
        build_data.download(os.path.join(dpath, fname4), url + fname4)
        build_data.download(os.path.join(dpath, fname5), url + fname5)
        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)
        build_data.untar(dpath, fname3)
        build_data.untar(dpath, fname4)
        build_data.untar(dpath, fname5)
        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Download the MovieDialog corpus plus the task-4 Reddit supplement."""
    dpath = os.path.join(opt['datapath'], 'MovieDialog')
    version = '1'
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        # The reddit supplement is unpacked into a nested task4 directory.
        reddit_path = os.path.join(dpath, 'movie_dialog_dataset', 'task4_reddit')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        build_data.make_dir(reddit_path)
        # Download the data.
        main_archive = 'moviedialog.tar.gz'
        build_data.download(
            'https://s3.amazonaws.com/fair-data/parlai/moviedialog/' + main_archive,
            dpath,
            main_archive,
        )
        build_data.download('http://tinyurl.com/' + 'p6tyohj', reddit_path, 'p6tyohj.tgz')
        build_data.untar(dpath, main_archive)
        build_data.untar(reddit_path, 'p6tyohj.tgz')
        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the CNN QA dataset from Google Drive and build FB-format splits."""
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'QACNN')
    if build_data.built(dpath, version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Download the data.
    archive = 'cnn.tgz'
    build_data.download_from_google_drive(
        '0BwmD_VLjROrfTTljRDVZMFJnVWM', os.path.join(dpath, archive)
    )
    build_data.untar(dpath, archive)
    questions_dir = os.path.join(dpath, 'cnn', 'questions')
    for split, subdir in (('train', 'training'), ('valid', 'validation'), ('test', 'test')):
        create_fb_format(dpath, split, os.path.join(questions_dir, subdir))
    # Mark the data as built.
    build_data.mark_done(dpath, version)
def build(opt):
    """Download and unpack the VQA v2 questions and annotations (unversioned build)."""
    dpath = os.path.join(opt['datapath'], 'VQA-v2')
    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Download the data, then unpack in the same order.
        base_url = 'http://visualqa.org/data/mscoco/vqa/'
        zips = [
            'v2_Questions_Train_mscoco.zip',
            'v2_Questions_Val_mscoco.zip',
            'v2_Questions_Test_mscoco.zip',
            'v2_Annotations_Val_mscoco.zip',
            'v2_Annotations_Train_mscoco.zip',
        ]
        for zip_name in zips:
            build_data.download(base_url + zip_name, dpath, zip_name)
        for zip_name in zips:
            build_data.untar(dpath, zip_name)
        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Fetch WikiQA and build plain plus '-filtered' FB-format splits."""
    dpath = os.path.join(opt['datapath'], 'WikiQA')
    version = None
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Remove the remains of an outdated build.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Download the data.
    archive = 'wikiqa.tar.gz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/wikiqa/' + archive, dpath, archive
    )
    build_data.untar(dpath, archive)
    corpus = os.path.join(dpath, 'WikiQACorpus')
    train_tsv = os.path.join(corpus, 'WikiQA-train.tsv')
    dev_tsv = os.path.join(corpus, 'WikiQA-dev.tsv')
    test_tsv = os.path.join(corpus, 'WikiQA-test.tsv')
    create_fb_format(dpath, 'train', train_tsv)
    create_fb_format(dpath, 'valid', dev_tsv)
    create_fb_format(dpath, 'test', test_tsv)
    # Filtered variants are generated from the same source files.
    create_fb_format(dpath, 'train-filtered', train_tsv)
    create_fb_format(dpath, 'valid-filtered', dev_tsv)
    create_fb_format(dpath, 'test-filtered', test_tsv)
    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download MCTest and convert the mc160/mc500 splits to FB format.

    :param opt: options dict; must contain 'datapath'.
    """
    dpath = os.path.join(opt['datapath'], 'MCTest')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists; rebuild from scratch.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack the archive.
    fname = 'mctest.tar.gz'
    url = 'https://s3.amazonaws.com/fair-data/parlai/mctest/' + fname
    build_data.download(url, dpath, fname)
    build_data.untar(dpath, fname)

    base = os.path.join(dpath, 'mctest')
    # Each entry: (output name, source file, separate answer file or None).
    # Only the test splits carry a separate MCTestAnswers file.
    for dtype, src, ans in (
        ('train160', 'mc160.train', None),
        ('valid160', 'mc160.dev', None),
        ('test160', 'mc160.test', 'mc160.test.ans'),
        ('train500', 'mc500.train', None),
        ('valid500', 'mc500.dev', None),
        ('test500', 'mc500.test', 'mc500.test.ans'),
    ):
        src_path = os.path.join(base, 'MCTest', src)
        ans_path = None if ans is None else os.path.join(base, 'MCTestAnswers', ans)
        create_fb_format(dpath, dtype, src_path, ans_path)

    # Record the successful build.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Set up DailyDialog by parsing a manually downloaded archive.

    :param opt: options dict; must contain 'datapath'.
    """
    data_path = os.path.join(opt['datapath'], 'DailyDialog')
    version = None

    if build_data.built(data_path, version_string=version):
        return
    print('[building data: ' + data_path + ']')
    if build_data.built(data_path):
        build_data.remove_dir(data_path)
    build_data.make_dir(data_path)

    fname = 'ijcnlp_dailydialog.zip'
    url = 'http://yanran.li/files/'
    # The automatic fetch is disabled; the archive is expected to be
    # downloaded and unpacked by hand, e.g.:
    #   wget http://yanran.li/files/ijcnlp_dailydialog.zip
    #   unzip ijcnlp_dailydialog.zip
    #   unzip ijcnlp_dailydialog/*.zip
    # build_data.download(url, data_path, fname)
    # build_data.untar(data_path, fname)

    # Parse each subset out of the unpacked directory.
    for subset in ('train', 'validation', 'test'):
        parse_data(os.path.join(data_path, 'ijcnlp_dailydialog/' + subset),
                   data_path, dataset=subset)

    # Record the successful build.
    build_data.mark_done(data_path, version_string=version)
def build(opt):
    """Download the QACNN data from Google Drive and build FB-format splits.

    :param opt: options dict; must contain 'datapath'.
    """
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'QACNN')

    if build_data.built(dpath, version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Wipe any stale build first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # The archive is hosted on Google Drive rather than a plain URL.
    fname = 'cnn.tgz'
    gd_id = '0BwmD_VLjROrfTTljRDVZMFJnVWM'
    build_data.download_from_google_drive(gd_id, os.path.join(dpath, fname))
    build_data.untar(dpath, fname)

    # Source folders are 'training'/'validation'/'test'; outputs use
    # ParlAI's 'train'/'valid'/'test' naming.
    for dtype, folder in (('train', 'training'),
                          ('valid', 'validation'),
                          ('test', 'test')):
        create_fb_format(dpath, dtype,
                         os.path.join(dpath, 'cnn', 'questions', folder))

    # Record the successful build.
    build_data.mark_done(dpath, version)
def build(opt):
    """Download the split Twitter corpus, reassemble it, and parse it.

    :param opt: options dict; must contain 'datapath'.
    """
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'Twitter')

    if build_data.built(dpath, version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists; rebuild from scratch.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # The corpus is published as two gzip parts; fetch both.
    url = 'https://github.com/Marsan-Ma/chat_corpus/raw/master/'
    fname1 = "twitter_en_big.txt.gz.partaa"
    fname2 = "twitter_en_big.txt.gz.partab"
    build_data.download(url + fname1, dpath, fname1)
    build_data.download(url + fname2, dpath, fname2)

    # Concatenate the two parts back into a single gzip archive.
    outzipfile = os.path.join(dpath, "twitter_en_big.txt.gz")
    build_data.cat(os.path.join(dpath, fname1),
                   os.path.join(dpath, fname2),
                   outzipfile)

    import gzip
    with gzip.open(outzipfile, 'r') as f:
        text = bytes.decode(f.read())

    # Drop the first two lines before parsing (mirrors the original prep).
    create_fb_format(text.split('\n')[2:], dpath)
    os.remove(outzipfile)

    # Record the successful build.
    build_data.mark_done(dpath, version)
def build(opt):
    """Download NarrativeQA, fetch its story texts, and split the CSVs
    into train/valid/test sets under <datapath>/NarrativeQA/narrative_qa.

    :param opt: options dict; must contain 'datapath'.
    """
    dpath = os.path.join(opt['datapath'], 'NarrativeQA')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')

        if build_data.built(dpath):
            # an older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # download the data.
        fname = 'narrative_qa.zip'
        # dataset URL
        url = NARRATIVE_QA_DOWNLOAD_URL
        build_data.download(url, dpath, fname)

        # uncompress it
        build_data.untar(dpath, fname)

        # The repo archive holds metadata only; story texts are fetched
        # separately (presumably into 'tmp' — see the move below).
        print('downloading stories now')
        base_path = os.path.join(dpath, 'narrativeqa-master')
        download_stories(base_path)

        # move from tmp to stories
        tmp_stories_path = os.path.join(base_path, 'tmp')
        new_stories_path = os.path.join(base_path, 'stories')
        shutil.move(tmp_stories_path, new_stories_path)

        # divide into train, valid and test for summaries;
        # summaries.csv is first hoisted up to the dataset root.
        summaries_csv_path = os.path.join(base_path, 'third_party',
                                          'wikipedia', 'summaries.csv')
        new_path = os.path.join(base_path, 'summaries.csv')
        shutil.move(summaries_csv_path, new_path)
        divide_csv_into_sets(new_path)

        # divide into sets for questions
        questions_path = os.path.join(base_path, 'qaps.csv')
        divide_csv_into_sets(questions_path)

        # divide into sets for documents
        documents_path = os.path.join(base_path, 'documents.csv')
        divide_csv_into_sets(documents_path)

        # move specific set's files into their set's folder
        make_folders(base_path)
        move_files(base_path)

        # move narrativeqa-master to narrative_qa (note: rebinds new_path)
        new_path = os.path.join(dpath, 'narrative_qa')
        shutil.move(base_path, new_path)

        # mark the data as built
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download VisDial v0.9 and repartition it: carve ~1000 examples out
    of the official train split for validation, and repurpose the official
    val split as the test set.

    :param opt: options dict; must contain 'datapath'.
    """
    version = 'v0.9'
    dpath = os.path.join(opt['datapath'], 'VisDial-v0.9')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = 'visdial_0.9_train.zip'
        fname2 = 'visdial_0.9_val.zip'
        url = 'https://computing.ece.vt.edu/~abhshkdz/data/visdial/'
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)
        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)

        print('processing unpacked files')
        # Use 1000 examples from training set as validation.
        json1 = os.path.join(dpath, fname1.rsplit('.', 1)[0] + '.json')
        with open(json1) as t_json:
            train_data = json.load(t_json)

        # Shallow-copy the wrapper dicts, then rebuild 'dialogs' so the
        # valid set shares everything except the dialog list itself.
        valid_data = train_data.copy()
        valid_data['data'] = train_data['data'].copy()
        valid_data['data']['dialogs'] = []

        # Use constant stride to pick examples.
        num_valid = 1000
        total = len(train_data['data']['dialogs'])
        step = total // (num_valid - 1)
        # Walk the train dialogs from the end with a fixed stride, moving
        # each picked dialog into the valid set. Descending index order
        # keeps the remaining indices valid while deleting in place.
        # NOTE(review): range(...)[:num_valid] slices the range object
        # (valid in py3); index 0 is never selected since the range stops
        # before 0, so the valid set may hold slightly fewer than
        # num_valid examples — confirm this is intended.
        for i in range(total-1, 0, -step)[:num_valid]:
            valid_data['data']['dialogs'].append(train_data['data']['dialogs'][i])
            del train_data['data']['dialogs'][i]

        # Write the repartitioned train/valid files, then drop the original.
        train_json = json1.rsplit('.', 1)[0] + '_train.json'
        valid_json = json1.rsplit('.', 1)[0] + '_valid.json'
        with open(train_json, 'w') as t_out, open(valid_json, 'w') as v_out:
            json.dump(train_data, t_out)
            json.dump(valid_data, v_out)
        os.remove(json1)

        # Use validation data as test.
        json2 = os.path.join(dpath, fname2.rsplit('.', 1)[0] + '.json')
        test_json = json2.rsplit('.', 1)[0] + '_test.json'
        build_data.move(json2, test_json)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
def build(opt):
    """Download and unpack the TriviaQA reading-comprehension archive.

    :param opt: options dict; must contain 'datapath'.
    """
    dpath = os.path.join(opt['datapath'], 'TriviaQA')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Remove any outdated build before starting over.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack the archive.
    fname = 'triviaqa-rc.tar.gz'
    url = 'http://nlp.cs.washington.edu/triviaqa/data/'
    build_data.download(url + fname, dpath, fname)
    build_data.untar(dpath, fname)

    # Record the successful build.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download and unpack the dialog-bAbI tasks archive.

    :param opt: options dict; must contain 'datapath'.
    """
    dpath = os.path.join(opt['datapath'], 'dialog-bAbI')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists; rebuild from scratch.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack the archive.
    fname = 'dialog_babi.tar.gz'
    url = 'https://s3.amazonaws.com/fair-data/parlai/dialog_babi/' + fname
    build_data.download(url, dpath, fname)
    build_data.untar(dpath, fname)

    # Record the successful build.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download and unpack the Persona-Chat dataset.

    :param opt: options dict; must contain 'datapath'.
    """
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'Persona-Chat')

    if build_data.built(dpath, version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists; rebuild from scratch.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack the archive.
    fname = 'personachat.tgz'
    url = 'https://s3.amazonaws.com/fair-data/parlai/personachat/' + fname
    build_data.download(url, dpath, fname)
    build_data.untar(dpath, fname)

    # Record the successful build.
    build_data.mark_done(dpath, version)
def build(opt):
    """Download and unpack the personalized-dialog dataset from Dropbox.

    :param opt: options dict; must contain 'datapath'.
    """
    dpath = os.path.join(opt['datapath'], 'personalized-dialog')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists; rebuild from scratch.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Dropbox needs the '?dl=1' query flag to serve a direct download.
    fname = 'personalized-dialog-dataset.tar.gz'
    url = 'https://www.dropbox.com/s/4i9u4y24pt3paba/' + fname + '?dl=1'
    build_data.download(url, dpath, fname)
    build_data.untar(dpath, fname)

    # Record the successful build.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the SQuAD v1.1 train and dev JSON files.

    :param opt: options dict; must contain 'datapath'.
    """
    dpath = os.path.join(opt['datapath'], 'SQuAD')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists; rebuild from scratch.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # The JSON files are served directly; no unpacking step is needed.
    url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'
    for fname in ('train-v1.1.json', 'dev-v1.1.json'):
        build_data.download(url + fname, dpath, fname)

    # Record the successful build.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download OpenSubtitles (English) and convert it to FB dialog format.

    :param opt: options dict; must contain 'datapath'.
    """
    dpath = os.path.join(opt['datapath'], 'OpenSubtitles')
    version = '1'

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists; rebuild from scratch.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # The OPUS endpoint serves the archive via a query parameter, so pick
    # the local file name ourselves.
    url = ('http://opus.lingfil.uu.se/download.php?f=OpenSubtitles/en.tar.gz')
    build_data.download(url, dpath, 'OpenSubtitles.tar.gz')
    build_data.untar(dpath, 'OpenSubtitles.tar.gz')

    # Convert the unpacked English subtitles into FB dialog format.
    create_fb_format(os.path.join(dpath, 'OpenSubtitles', 'en'), dpath)

    # Record the successful build.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download DialogueQE and rename its files to train.json/test.json.

    :param opt: options dict; must contain 'datapath'.
    """
    data_path = os.path.join(opt['datapath'], 'DialogueQE')
    version = '1501534800'

    if build_data.built(data_path, version_string=version):
        return
    print('[building data: ' + data_path + ']')
    if build_data.built(data_path):
        build_data.remove_dir(data_path)
    build_data.make_dir(data_path)

    # The archive name carries the release timestamp used as the version.
    fname = 'data_' + version + '.tar.gz'
    url = 'https://raw.githubusercontent.com/deepmipt/turing-data/master/' + fname
    build_data.download(url, data_path, fname)
    build_data.untar(data_path, fname)

    # Strip the version stamp from the extracted file names.
    for split in ('train', 'test'):
        os.rename(
            os.path.join(data_path, 'data_' + split + '_' + version + '.json'),
            os.path.join(data_path, split + '.json'))

    # Record the successful build.
    build_data.mark_done(data_path, version_string=version)
def build(opt):
    """Download and unpack the CLEVR v1.0 dataset.

    :param opt: options dict; must contain 'datapath'.
    """
    dpath = os.path.join(opt['datapath'], 'CLEVR')
    version = 'v1.0'

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An outdated copy exists; rebuild from scratch.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack the archive.
    fname = 'CLEVR_v1.0.zip'
    url = 'https://s3-us-west-1.amazonaws.com/clevr/'
    build_data.download(url + fname, dpath, fname)
    build_data.untar(dpath, fname)

    # Record the successful build.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the DBLL data; builds its WikiMovies dependency first.

    :param opt: options dict; must contain 'datapath'.
    """
    # DBLL depends on the wikimovies dataset, so ensure that exists first.
    wikimovies_build.build(opt)

    dpath = os.path.join(opt['datapath'], 'DBLL')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists; rebuild from scratch.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack the archive.
    fname = 'dbll.tgz'
    url = 'https://s3.amazonaws.com/fair-data/parlai/dbll/' + fname
    build_data.download(url, dpath, fname)
    build_data.untar(dpath, fname)

    # Record the successful build.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download and unpack the COPA (Choice of Plausible Alternatives) data.

    :param opt: options dict; must contain 'datapath'.
    """
    dpath = os.path.join(opt['datapath'], 'COPA')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists; rebuild from scratch.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack the archive.
    fname = 'COPA-resources.tgz'
    url = 'http://people.ict.usc.edu/~gordon/downloads/' + fname
    build_data.download(url, dpath, fname)
    build_data.untar(dpath, fname)

    # Record the successful build.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Fetch the end-to-end negotiator data from its GitHub master archive.

    :param opt: options dict; must contain 'datapath'.
    """
    dpath = os.path.join(opt['datapath'], 'negotiation')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists; rebuild from scratch.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # The dataset ships inside the project repository, so grab the
    # repository's master zipball.
    fname = 'negotiation.zip'
    url = ('https://github.com/facebookresearch/end-to-end-negotiator/'
           'archive/master.zip')
    print('[downloading data from: ' + url + ']')
    build_data.download(url, dpath, fname)
    build_data.untar(dpath, fname)

    # Record the successful build.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download InsuranceQA and parse both the V1 and V2 releases.

    :param opt: options dict; must contain 'datapath'.
    """
    dpath = os.path.join(opt['datapath'], 'InsuranceQA')
    version = '1'

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists; rebuild from scratch.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Grab the repository snapshot, which contains both corpus versions.
    fname = 'insuranceqa.zip'
    url = 'https://github.com/shuzi/insuranceQA/archive/master.zip'
    print('[downloading data from: ' + url + ']')
    build_data.download(url, dpath, fname)
    build_data.untar(dpath, fname)

    # Build both releases from the same snapshot.
    ParseInsuranceQAV1.build(dpath)
    ParseInsuranceQAV2.build(dpath)

    # Record the successful build.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download SCAN and build FB-format train/valid/test splits.

    :param opt: options dict; must contain 'datapath'.
    """
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'SCAN')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'scan.tgz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/scan/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Fix: dropped the unused `ext = os.path.join('dailymail',
        # 'questions')` local, a dead leftover from the QACNN/dailymail
        # builder this code was copied from.
        # NOTE(review): 'valid' is built from the *train* file — SCAN ships
        # no dev split, so validation reuses training data; confirm intended.
        create_fb_format(dpath, 'train',
                         os.path.join(dpath, 'tasks_train_simple.txt'))
        create_fb_format(dpath, 'valid',
                         os.path.join(dpath, 'tasks_train_simple.txt'))
        create_fb_format(dpath, 'test',
                         os.path.join(dpath, 'tasks_test_simple.txt'))

        # Mark the data as built.
        build_data.mark_done(dpath, version)
def build(opt):
    """Download the Cornell Movie-Dialogs corpus and convert it to FB format.

    :param opt: options dict; must contain 'datapath'.
    """
    dpath = os.path.join(opt['datapath'], 'CornellMovie')
    version = None

    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists; rebuild from scratch.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack the archive.
    fname = 'cornell_movie_dialogs_corpus.zip'
    url = 'http://www.mpi-sws.org/~cristian/data/' + fname
    build_data.download(url, dpath, fname)
    build_data.untar(dpath, fname)

    # NOTE: the extracted folder name really does contain spaces.
    dpext = os.path.join(dpath, 'cornell movie-dialogs corpus')
    create_fb_format(os.path.join(dpext, 'movie_lines.txt'),
                     os.path.join(dpext, 'movie_conversations.txt'),
                     dpath)

    # Record the successful build.
    build_data.mark_done(dpath, version_string=version)