def build(opt):
    """Download and prepare the WebQuestions dataset.

    Reads only ``opt['datapath']``; writes everything under
    ``<datapath>/WebQuestions/``. Idempotent: does nothing if the data
    is already marked built.
    """
    # NOTE: trailing slash is deliberate — later code concatenates
    # filenames directly onto dpath, and create_fb_format may rely on it.
    dpath = opt['datapath'] + "/WebQuestions/"
    if not build_data.built(dpath):
        print("[building data: " + dpath + "]")
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        # CodaLab serves the blob under the name 'index.html', so each
        # download is renamed to its real json filename right afterwards.
        url = ("https://worksheets.codalab.org/rest/bundles/" +
               "0x4a763f8cde224c2da592b75f29e2f5c2/contents/blob/")
        build_data.download(dpath, url)
        build_data.move(dpath + 'index.html', dpath + 'train.json')

        url = ("https://worksheets.codalab.org/rest/bundles/" +
               "0xe7bac352fce7448c9ef238fb0a297ec2/contents/blob/")
        build_data.download(dpath, url)
        build_data.move(dpath + 'index.html', dpath + 'test.json')

        # Both 'train' and 'valid' are generated from train.json —
        # presumably create_fb_format splits it internally; verify there.
        create_fb_format(dpath, 'train', dpath + 'train.json')
        create_fb_format(dpath, 'valid', dpath + 'train.json')
        create_fb_format(dpath, 'test', dpath + 'test.json')

        # Mark the data as built.
        build_data.mark_done(dpath)
def train_test_split(inpath, train, test, split, random_seed):
    """Randomly split the files in ``inpath`` into train and test folders.

    RuCor does not ship a train/test split, so one is created here by a
    single random shuffle-split of the file listing; the files are then
    moved into the destination folders.

    Args:
        inpath: directory containing all source data files.
        train: destination folder for the train split.
        test: destination folder for the test split.
        split: fraction (0-1) of files assigned to the test split.
        random_seed: seed passed to ShuffleSplit for reproducibility.

    Returns:
        None
    """
    print('Start train-test splitting ...')
    filenames = os.listdir(inpath)
    # One shuffled split; `split` is the test-set fraction.
    doc_split = ShuffleSplit(1, test_size=split, random_state=random_seed)
    for train_indices, test_indices in doc_split.split(filenames):
        # Sort indices so files are moved in a deterministic order.
        train_set = [filenames[i] for i in sorted(train_indices)]
        test_set = [filenames[i] for i in sorted(test_indices)]
        for fname in train_set:
            build_data.move(os.path.join(inpath, fname),
                            os.path.join(train, fname))
        for fname in test_set:
            build_data.move(os.path.join(inpath, fname),
                            os.path.join(test, fname))
    print('End train-test splits.')
def build(opt):
    """Download VisDial v0.9 and carve a validation set out of train.

    Downloads and unpacks the train/val zips, holds out up to 1000
    training dialogs as validation, and repurposes the original
    validation file as the test set.
    """
    version = 'v0.9'
    dpath = os.path.join(opt['datapath'], 'VisDial-v0.9')
    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = 'visdial_0.9_train.zip'
        fname2 = 'visdial_0.9_val.zip'
        url = 'https://computing.ece.vt.edu/~abhshkdz/data/visdial/'
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)
        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)

        print('processing unpacked files')
        # Use 1000 examples from training set as validation.
        # The unpacked json shares the zip's basename.
        json1 = os.path.join(dpath, fname1.rsplit('.', 1)[0] + '.json')
        with open(json1) as t_json:
            train_data = json.load(t_json)

        # Shallow copies share everything except the dialogs list,
        # which is rebuilt for the validation split.
        valid_data = train_data.copy()
        valid_data['data'] = train_data['data'].copy()
        valid_data['data']['dialogs'] = []

        # Use constant stride to pick examples.
        num_valid = 1000
        total = len(train_data['data']['dialogs'])
        step = total // (num_valid - 1)
        # Iterate indices in descending order so deleting an element
        # never shifts an index we still have to visit; the range slice
        # caps the selection at num_valid dialogs.
        for i in range(total - 1, 0, -step)[:num_valid]:
            valid_data['data']['dialogs'].append(
                train_data['data']['dialogs'][i])
            del train_data['data']['dialogs'][i]

        train_json = json1.rsplit('.', 1)[0] + '_train.json'
        valid_json = json1.rsplit('.', 1)[0] + '_valid.json'
        with open(train_json, 'w') as t_out, open(valid_json, 'w') as v_out:
            json.dump(train_data, t_out)
            json.dump(valid_data, v_out)
        # The original combined json is no longer needed.
        os.remove(json1)

        # Use validation data as test.
        json2 = os.path.join(dpath, fname2.rsplit('.', 1)[0] + '.json')
        test_json = json2.rsplit('.', 1)[0] + '_test.json'
        build_data.move(json2, test_json)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
def build(opt):
    """Build the VisDial v0.9 dataset.

    Downloads the train/val archives, unpacks them, holds out 1000
    training dialogs as a validation split, and turns the original
    validation file into the test split.
    """
    version = 'v0.9'
    dpath = os.path.join(opt['datapath'], 'VisDial-v0.9')
    if build_data.built(dpath, version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # A build from an earlier version is present; clear it out first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack both archives.
    train_zip = 'visdial_0.9_train.zip'
    val_zip = 'visdial_0.9_val.zip'
    url = 'https://computing.ece.vt.edu/~abhshkdz/data/visdial/'
    build_data.download(url + train_zip, dpath, train_zip)
    build_data.download(url + val_zip, dpath, val_zip)
    build_data.untar(dpath, train_zip)
    build_data.untar(dpath, val_zip)

    print('processing unpacked files')
    # Hold out 1000 training dialogs as the validation set.
    train_path = os.path.join(dpath, train_zip.rsplit('.', 1)[0] + '.json')
    with open(train_path) as fin:
        train_data = json.load(fin)
    valid_data = train_data.copy()
    valid_data['data'] = train_data['data'].copy()
    valid_data['data']['dialogs'] = []

    # Pick dialogs at a constant stride, walking from the end so a
    # deletion never shifts an index that is still pending.
    num_valid = 1000
    total = len(train_data['data']['dialogs'])
    step = total // (num_valid - 1)
    picks = list(range(total - 1, 0, -step))[:num_valid]
    for idx in picks:
        valid_data['data']['dialogs'].append(train_data['data']['dialogs'][idx])
        del train_data['data']['dialogs'][idx]

    out_train = train_path.rsplit('.', 1)[0] + '_train.json'
    out_valid = train_path.rsplit('.', 1)[0] + '_valid.json'
    with open(out_train, 'w') as t_out, open(out_valid, 'w') as v_out:
        json.dump(train_data, t_out)
        json.dump(valid_data, v_out)
    os.remove(train_path)

    # Reuse the official validation file as the test split.
    val_path = os.path.join(dpath, val_zip.rsplit('.', 1)[0] + '.json')
    build_data.move(val_path, val_path.rsplit('.', 1)[0] + '_test.json')

    # Mark the data as built.
    build_data.mark_done(dpath, version)
def build(opt):
    """Download VisDial v0.9 via RESOURCES and create train/valid/test.

    Uses the module-level RESOURCES descriptors to fetch the archives
    (RESOURCES[0] is presumably the train archive and RESOURCES[1] the
    val archive — verify against the RESOURCES definition), then holds
    out 1000 training dialogs as validation and reuses the original
    validation file as test.
    """
    version = 'v0.9'
    dpath = os.path.join(opt['datapath'], 'VisDial-v0.9')
    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        print('processing unpacked files')
        # Use 1000 examples from training set as validation.
        # The unpacked json shares the archive's basename.
        json1 = os.path.join(
            dpath, RESOURCES[0].file_name.rsplit('.', 1)[0] + '.json')
        with PathManager.open(json1) as t_json:
            train_data = json.load(t_json)

        # Shallow copies share everything except the dialogs list,
        # which is rebuilt for the validation split.
        valid_data = train_data.copy()
        valid_data['data'] = train_data['data'].copy()
        valid_data['data']['dialogs'] = []

        # Use constant stride to pick examples.
        num_valid = 1000
        total = len(train_data['data']['dialogs'])
        step = total // (num_valid - 1)
        # Descending indices so deletions never shift a pending index;
        # the slice caps the selection at num_valid dialogs.
        for i in range(total - 1, 0, -step)[:num_valid]:
            valid_data['data']['dialogs'].append(
                train_data['data']['dialogs'][i])
            del train_data['data']['dialogs'][i]

        train_json = json1.rsplit('.', 1)[0] + '_train.json'
        valid_json = json1.rsplit('.', 1)[0] + '_valid.json'
        with PathManager.open(train_json, 'w') as t_out, PathManager.open(valid_json, 'w') as v_out:
            json.dump(train_data, t_out)
            json.dump(valid_data, v_out)
        # The original combined json is no longer needed.
        PathManager.rm(json1)

        # Use validation data as test.
        json2 = os.path.join(
            dpath, RESOURCES[1].file_name.rsplit('.', 1)[0] + '.json')
        test_json = json2.rsplit('.', 1)[0] + '_test.json'
        build_data.move(json2, test_json)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
def train_test_split(inpath, train, test, split, random_seed):
    """Randomly split the files in ``inpath`` into train and test folders.

    Performs a single random shuffle-split over the file listing and
    moves each file into its destination folder.

    Args:
        inpath: directory containing all source data files.
        train: destination folder for the train split.
        test: destination folder for the test split.
        split: fraction (0-1) of files assigned to the test split.
        random_seed: seed passed to ShuffleSplit for reproducibility.

    Returns:
        None
    """
    print('Start train-test splitting ...')
    z = os.listdir(inpath)
    # One shuffled split; `split` is the test-set fraction.
    doc_split = ShuffleSplit(1, test_size=split, random_state=random_seed)
    for train_indeses, test_indeses in doc_split.split(z):
        # Sort indices so files are moved in a deterministic order.
        train_set = [z[i] for i in sorted(list(train_indeses))]
        test_set = [z[i] for i in sorted(list(test_indeses))]
        for x in train_set:
            build_data.move(os.path.join(inpath, x), os.path.join(train, x))
        for x in test_set:
            build_data.move(os.path.join(inpath, x), os.path.join(test, x))
    print('End train-test splitts.')
    return None
def train_valid_test_split(inpath, train_path, valid_path, test_path,
                           valid_ratio, test_ratio, seed=None):
    """Partition the files in ``inpath`` into train/valid/test folders.

    The files are split randomly according to the given ratios and then
    moved out of ``inpath`` into their destination folders.

    Args:
        inpath: directory holding all source data files.
        train_path: destination folder for the train split.
        valid_path: destination folder for the valid split.
        test_path: destination folder for the test split.
        valid_ratio: len(valid) / len(all files).
        test_ratio: len(test) / len(all files).
        seed: optional random seed for reproducible splits.

    Returns:
        nothing
    """
    assert valid_ratio + test_ratio <= 1.0
    assert valid_ratio > 0 and test_ratio > 0
    all_files = sorted(os.listdir(inpath))
    # Carve off the test set first, then split the remainder; the
    # rescaled ratio keeps valid at valid_ratio of the *original* total.
    remainder, test_files = train_test_split(
        all_files, test_size=test_ratio, random_state=seed)
    train_files, valid_files = train_test_split(
        remainder, test_size=valid_ratio / (1 - test_ratio),
        random_state=seed)
    print('train_valid_test_split: {}/{}/{}'.format(
        len(train_files), len(valid_files), len(test_files)))
    destinations = (
        (train_files, train_path),
        (valid_files, valid_path),
        (test_files, test_path),
    )
    for file_group, target_dir in destinations:
        for fname in file_group:
            build_data.move(os.path.join(inpath, fname),
                            os.path.join(target_dir, fname))
    return None
def build(opt):
    """Download and build the MovieDialog dataset.

    Fetches the main archive plus the task4_reddit extra archive (served
    through a tinyurl redirect) into ``<datapath>/MovieDialog``.
    Idempotent: does nothing if already marked built.
    """
    # Use os.path.join for portable paths, consistent with the other
    # build scripts; this also avoids the double slash the old
    # concatenation produced in dpath2.
    dpath = os.path.join(opt['datapath'], 'MovieDialog')
    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'moviedialog.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/moviedialog/' + fname
        build_data.download(dpath, url)
        build_data.untar(dpath, fname)

        dpath2 = os.path.join(dpath, 'movie_dialog_dataset', 'task4_reddit')
        # The tinyurl download lands as a file named after the url slug;
        # rename it with a .tgz extension before untarring.
        fname2a = os.path.join(dpath2, 'p6tyohj')
        fname2b = os.path.join(dpath2, 'p6tyohj.tgz')
        url2 = 'http://tinyurl.com/' + 'p6tyohj'
        build_data.download(dpath2, url2)
        build_data.move(fname2a, fname2b)
        build_data.untar(dpath2, 'p6tyohj.tgz')

        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Fetch and unpack the MovieDialog data under opt['datapath'].

    Downloads the main archive and the extra task4_reddit archive
    (served through a tinyurl redirect), then marks the data built.
    """
    dpath = os.path.join(opt['datapath'], 'MovieDialog')
    if build_data.built(dpath):
        return
    print('[building data: ' + dpath + ']')
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Main archive.
    archive = 'moviedialog.tar.gz'
    s3_url = 'https://s3.amazonaws.com/fair-data/parlai/moviedialog/' + archive
    build_data.download(dpath, s3_url)
    build_data.untar(dpath, archive)

    # task4_reddit extra archive: the tinyurl download arrives named
    # after its url slug, so rename it with a .tgz extension first.
    reddit_dir = os.path.join(dpath, 'movie_dialog_dataset', 'task4_reddit')
    slug = 'p6tyohj'
    build_data.download(reddit_dir, 'http://tinyurl.com/' + slug)
    build_data.move(os.path.join(reddit_dir, slug),
                    os.path.join(reddit_dir, slug + '.tgz'))
    build_data.untar(reddit_dir, slug + '.tgz')

    # Mark the data as built.
    build_data.mark_done(dpath)
def build(opt):
    """Download and unpack the dstc2 dataset.

    The archive is fetched from the repository named by the DATASETS_URL
    environment variable; a ``file://`` URL is moved into place locally
    instead of downloaded.

    Raises:
        RuntimeError: if the DATASETS_URL environment variable is unset.
    """
    dpath = os.path.join(opt['datapath'], 'dstc2')
    version = None
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove those outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        ds_path = os.environ.get('DATASETS_URL')
        if not ds_path:
            # urljoin(None, ...) would fail with a confusing TypeError;
            # fail fast with an actionable message instead.
            raise RuntimeError(
                'DATASETS_URL environment variable must be set to '
                'download the dstc2 dataset')
        filename = 'dstc2.tar.gz'

        # Download the data.
        print('Trying to download a dataset %s from the repository' % filename)
        url = urllib.parse.urljoin(ds_path, filename)
        if url.startswith('file://'):
            # Local repository: strip the scheme and move the file.
            build_data.move(url[7:], dpath)
        else:
            build_data.download(url, dpath, filename)
        build_data.untar(dpath, filename)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)