def build(opt):
    version = '0.2'
    dpath = os.path.join(opt['datapath'], 'ConvAI2_wild_evaluation')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        output_fname = 'convai2_wild_evaluation.json'
        output_path = os.path.join(dpath, output_fname)

        with PathManager.open(output_path, 'r') as data_f:
            data = json.load(data_f)

        make_parlai_format(data, dpath)
        PathManager.rm(output_path)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
def _unzip(path, fname, delete=True):
    """
    Unpack the given zip file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool delete:
        If true, the archive will be deleted after extraction.
    """
    import zipfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    with zipfile.ZipFile(PathManager.open(fullpath, 'rb'), 'r') as zf:
        for member in zf.namelist():
            outpath = os.path.join(path, member)
            if zf.getinfo(member).is_dir():
                logging.debug(f"Making directory {outpath}")
                PathManager.mkdirs(outpath)
                continue
            logging.debug(f"Extracting to {outpath}")
            with zf.open(member, 'r') as inf, PathManager.open(outpath, 'wb') as outf:
                shutil.copyfileobj(inf, outf)
    if delete:
        try:
            PathManager.rm(fullpath)
        except PermissionError:
            logging.error(
                f"Tried to delete {fullpath} but got a permission error. This "
                "is known to happen in Windows and is probably not fatal."
            )
def _unzip(path, fname, delete=True):
    """
    Unpack the given zip file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool delete:
        If true, the archive will be deleted after extraction.
    """
    import zipfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    with zipfile.ZipFile(PathManager.open(fullpath, 'rb'), 'r') as zf:
        for member in zf.namelist():
            outpath = os.path.join(path, member)
            if zf.getinfo(member).is_dir():
                logging.debug(f"Making directory {outpath}")
                PathManager.mkdirs(outpath)
                continue
            logging.debug(f"Extracting to {outpath}")
            with zf.open(member, 'r') as inf, PathManager.open(outpath, 'wb') as outf:
                shutil.copyfileobj(inf, outf)
    if delete:
        PathManager.rm(fullpath)
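# A self-contained sketch of the member-by-member zip extraction pattern used
# by _unzip above, with builtin open() standing in for PathManager.open and a
# throwaway archive built in a temp dir. It mirrors the logic, not ParlAI's API.
import os
import shutil
import tempfile
import zipfile

with tempfile.TemporaryDirectory() as path:
    archive = os.path.join(path, 'example.zip')
    with zipfile.ZipFile(archive, 'w') as zf:
        zf.writestr('sub/greeting.txt', 'hello')

    # Extract member by member, creating directories as needed.
    with zipfile.ZipFile(open(archive, 'rb'), 'r') as zf:
        for member in zf.namelist():
            outpath = os.path.join(path, member)
            if zf.getinfo(member).is_dir():
                os.makedirs(outpath, exist_ok=True)
                continue
            os.makedirs(os.path.dirname(outpath), exist_ok=True)
            with zf.open(member, 'r') as inf, open(outpath, 'wb') as outf:
                shutil.copyfileobj(inf, outf)

    assert open(os.path.join(path, 'sub/greeting.txt')).read() == 'hello'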
def build(opt):
    version = 'v1.1'
    dpath = os.path.join(opt['datapath'], 'Twitter')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        file1 = os.path.join(dpath, RESOURCES[0].file_name)
        file2 = os.path.join(dpath, RESOURCES[1].file_name)

        concat = io.BytesIO()
        for fn in [file1, file2]:
            with PathManager.open(fn, 'rb') as rawf:
                concat.write(rawf.read())

        with gzip.GzipFile(fileobj=io.BytesIO(concat.getvalue())) as f:
            file_content = bytes.decode(f.read())
        data = file_content.split('\n')[2:]

        create_fb_format(data, dpath)
        PathManager.rm(file1)
        PathManager.rm(file2)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
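# Why the byte-level concatenation above works: the gzip format permits
# multiple compressed members in one stream, and Python's gzip module reads
# through all of them transparently. A minimal stdlib-only demonstration:
import gzip
import io

part1 = gzip.compress(b'hello ')  # first gzip member
part2 = gzip.compress(b'world')   # second, independent gzip member

# Concatenating the raw bytes yields a single valid multi-member stream.
with gzip.GzipFile(fileobj=io.BytesIO(part1 + part2)) as f:
    assert f.read() == b'hello world'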
def setUp(self):
    self.datapath = ParlaiParser().parse_args([])['datapath']
    self.datapath = os.path.join(self.datapath, 'build_data_pyt_data')
    PathManager.mkdirs(self.datapath)

    for d in self.dest_filenames:
        # Remove files if they already exist; otherwise the test
        # won't try to download them again.
        try:
            PathManager.rm(os.path.join(self.datapath, d))
        except OSError:
            pass
def build(opt):
    version = 'v0.9'
    dpath = os.path.join(opt['datapath'], 'VisDial-v0.9')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        print('processing unpacked files')
        # Use 1000 examples from training set as validation.
        json1 = os.path.join(dpath, RESOURCES[0].file_name.rsplit('.', 1)[0] + '.json')
        with PathManager.open(json1) as t_json:
            train_data = json.load(t_json)

        valid_data = train_data.copy()
        valid_data['data'] = train_data['data'].copy()
        valid_data['data']['dialogs'] = []

        # Use constant stride to pick examples.
        num_valid = 1000
        total = len(train_data['data']['dialogs'])
        step = total // (num_valid - 1)
        for i in range(total - 1, 0, -step)[:num_valid]:
            valid_data['data']['dialogs'].append(train_data['data']['dialogs'][i])
            del train_data['data']['dialogs'][i]

        train_json = json1.rsplit('.', 1)[0] + '_train.json'
        valid_json = json1.rsplit('.', 1)[0] + '_valid.json'
        with PathManager.open(train_json, 'w') as t_out, PathManager.open(valid_json, 'w') as v_out:
            json.dump(train_data, t_out)
            json.dump(valid_data, v_out)
        PathManager.rm(json1)

        # Use validation data as test.
        json2 = os.path.join(dpath, RESOURCES[1].file_name.rsplit('.', 1)[0] + '.json')
        test_json = json2.rsplit('.', 1)[0] + '_test.json'
        build_data.move(json2, test_json)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
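# A self-contained sketch of the stride-based split used above. Walking the
# indices from high to low matters: each `del` only shifts elements after the
# deleted position, so the smaller indices visited later remain valid. The
# numbers here are toy stand-ins, not VisDial values.
train = list(range(22))  # pretend these are 22 dialogs
valid = []

num_valid = 5
step = len(train) // (num_valid - 1)
for i in range(len(train) - 1, 0, -step)[:num_valid]:
    valid.append(train[i])
    del train[i]

print(valid)       # evenly spaced picks from the end: [21, 16, 11, 6, 1]
print(len(train))  # 17 dialogs left for training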
def _untar(path, fname, delete=True, flatten=False):
    """
    Unpack the given archive file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool delete:
        If true, the archive will be deleted after extraction.

    :param bool flatten:
        If true, subdirectories are flattened and all files are extracted
        directly into ``path``.
    """
    import tarfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    # very painfully manually extract files so that we can use PathManager.open
    # instead, lest we are using fb internal file services
    with tarfile.open(fileobj=PathManager.open(fullpath, 'rb')) as tf:
        for item in tf:
            item_name = item.name
            while item_name.startswith("./"):
                # internal file systems will actually create a literal "."
                # directory, so we gotta watch out for that
                item_name = item_name[2:]
            if flatten:
                # flatten the tar file if there are subdirectories
                fn = os.path.join(path, os.path.split(item_name)[-1])
            else:
                fn = os.path.join(path, item_name)
            logging.debug(f"Extracting to {fn}")
            if item.isdir():
                PathManager.mkdirs(fn)
            elif item.isfile():
                with PathManager.open(fn, 'wb') as wf, tf.extractfile(item.name) as rf:
                    tarfile.copyfileobj(rf, wf)
            else:
                raise NotImplementedError("No support for symlinks etc. right now.")
    if delete:
        try:
            PathManager.rm(fullpath)
        except PermissionError:
            logging.error(
                f"Tried to delete {fullpath} but got a permission error. This "
                "is known to happen in Windows and is probably not fatal."
            )
def _untar(path, fname, delete=True):
    """
    Unpack the given archive file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool delete:
        If true, the archive will be deleted after extraction.
    """
    import tarfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    # very painfully manually extract files so that we can use PathManager.open
    # instead, lest we are using fb internal file services
    with tarfile.open(fileobj=PathManager.open(fullpath, 'rb')) as tf:
        for item in tf:
            item_name = item.name
            while item_name.startswith("./"):
                # internal file systems will actually create a literal "."
                # directory, so we gotta watch out for that
                item_name = item_name[2:]
            fn = os.path.join(path, item_name)
            logging.debug(f"Extracting to {fn}")
            if item.isdir():
                PathManager.mkdirs(fn)
            elif item.isfile():
                with PathManager.open(fn, 'wb') as wf, tf.extractfile(item.name) as rf:
                    tarfile.copyfileobj(rf, wf)
            else:
                raise NotImplementedError("No support for symlinks etc. right now.")
    if delete:
        PathManager.rm(fullpath)
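# A self-contained sketch of the member-by-member tar extraction pattern in
# _untar above, using a throwaway archive built in a temp dir and builtin
# open() standing in for PathManager.open. It mirrors the logic, not ParlAI's API.
import io
import os
import shutil
import tarfile
import tempfile

with tempfile.TemporaryDirectory() as path:
    # Build a tiny archive with one file in a subdirectory.
    archive = os.path.join(path, 'example.tar')
    with tarfile.open(archive, 'w') as tf:
        payload = b'hello'
        info = tarfile.TarInfo('sub/greeting.txt')
        info.size = len(payload)
        tf.addfile(info, io.BytesIO(payload))

    # Extract it member by member, creating directories as needed.
    with tarfile.open(fileobj=open(archive, 'rb')) as tf:
        for item in tf:
            fn = os.path.join(path, item.name)
            if item.isdir():
                os.makedirs(fn, exist_ok=True)
            elif item.isfile():
                os.makedirs(os.path.dirname(fn), exist_ok=True)
                with open(fn, 'wb') as wf, tf.extractfile(item.name) as rf:
                    shutil.copyfileobj(rf, wf)

    assert open(os.path.join(path, 'sub/greeting.txt'), 'rb').read() == b'hello'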
def test_set_model_file_without_dict_file(self):
    """
    Check that moving a model without moving the dictfile raises an error.
    """
    # Download model, move to a new location
    with testing_utils.tempdir() as datapath:
        try:
            # remove unittest models if there before
            shutil.rmtree(os.path.join(datapath, 'models/unittest'))
        except FileNotFoundError:
            pass

        zoo_path = 'zoo:unittest/seq2seq/model'
        model_path = modelzoo_path(datapath, zoo_path)
        PathManager.rm(model_path + '.dict')
        # Test that eval model fails
        with self.assertRaises(RuntimeError):
            testing_utils.eval_model(dict(task='babi:task1k:1', model_file=model_path))

        try:
            # remove unittest models if there after
            shutil.rmtree(os.path.join(datapath, 'models/unittest'))
        except FileNotFoundError:
            pass
def build(opt):
    dpath = os.path.join(opt['datapath'], 'AmazonQA')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)
            new_filename = downloadable_file.file_name[:-3]
            print('[ unpacking data: ' + downloadable_file.file_name + ' ]')
            # Decompress each archive; the output file is opened in a context
            # manager so it is flushed and closed before the archive is removed.
            with open(os.path.join(dpath, new_filename), 'w') as f:
                for line in parse_gzip(os.path.join(dpath, downloadable_file.file_name)):
                    f.write(line + '\n')
            PathManager.rm(os.path.join(dpath, downloadable_file.file_name))

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
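# parse_gzip is defined elsewhere in this module. A plausible stdlib-only
# stand-in (an assumption for illustration, not the original implementation)
# would simply stream decoded lines out of the .gz archive:
import gzip

def parse_gzip_sketch(file_name):
    # Open in text mode and yield one line at a time without the newline.
    with gzip.open(file_name, 'rt') as f:
        for line in f:
            yield line.rstrip('\n')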
def _download_images(self, opt: Opt):
    """
    Download available IGC images.
    """
    urls = []
    ids = []
    for dt in ['test', 'val']:
        df = os.path.join(self.get_data_path(opt), f'IGC_crowd_{dt}.csv')
        with PathManager.open(df, newline='\n') as csv_file:
            reader = csv.reader(csv_file, delimiter=',')
            fields = []
            for i, row in enumerate(reader):
                if i == 0:
                    fields = row
                else:
                    data = dict(zip(fields, row))
                    urls.append(data['url'])
                    ids.append(data['id'])
    PathManager.mkdirs(self.get_image_path(opt))
    # Make one blank image
    image = Image.new('RGB', (100, 100), color=0)
    image.save(os.path.join(self.get_image_path(opt), self.blank_image_id), 'JPEG')
    # Download the rest
    download_multiprocess(urls, self.get_image_path(opt), dest_filenames=ids)
    # Remove bad images
    for fp in os.listdir(self.get_image_path(opt)):
        img_path = os.path.join(self.get_image_path(opt), fp)
        if PathManager.exists(img_path):
            try:
                Image.open(img_path).convert('RGB')
            except OSError:
                PathManager.rm(img_path)
def download_and_process(file_url, mode, subreddit_names, st_time, output_dir):
    # download and pre-process original posts
    reddit_tmp_dir = pjoin(output_dir, 'reddit_tmp')
    f_name = pjoin(reddit_tmp_dir, file_url.split('/')[-1])
    tries_left = 4
    # open monthly dumps and download lines in posts
    while tries_left:
        try:
            print("downloading %s %.2f" % (f_name, time() - st_time))
            subprocess.run(['wget', '-P', reddit_tmp_dir, file_url], stdout=subprocess.PIPE)
            print("decompressing and filtering %s %.2f" % (f_name, time() - st_time))
            if f_name.split('.')[-1] == 'xz':
                f = lzma.open(f_name, 'rt')
            elif f_name.split('.')[-1] == 'bz2':
                f = bz2.open(f_name, 'rt')
            elif f_name.split('.')[-1] == 'zst':
                fh = open(f_name, 'rb')
                dctx = zstd.ZstdDecompressor()
                stream_reader = dctx.stream_reader(fh)
                f = io.TextIOWrapper(stream_reader, encoding='utf-8')
            lines = dict([(name, []) for name in subreddit_names])
            for i, l in enumerate(f):
                if i % 1000000 == 0:
                    print(
                        "read %d lines, found %d"
                        % (i, sum([len(ls) for ls in lines.values()])),
                        time() - st_time,
                    )
                for name in subreddit_names:
                    subreddit_field = f'"subreddit":"{name}"'
                    if subreddit_field in l:
                        lines[name] += [l.strip()]
            if f_name.split('.')[-1] == 'zst':
                fh.close()
            else:
                f.close()
            PathManager.rm(f_name)
            tries_left = 0
        except EOFError:
            sleep(10)
            print("failed reading file %s, %d tries left" % (f_name, tries_left))
            PathManager.rm(f_name)
            tries_left -= 1
    print("tokenizing and selecting %s %.2f" % (f_name, time() - st_time))
    processed_items = dict([(name, []) for name in subreddit_names])
    if mode == 'submissions':
        key_list = ['id', 'score', 'url', 'title', 'selftext']
    else:
        key_list = ['id', 'link_id', 'parent_id', 'score', 'body']
    for name in subreddit_names:
        for line in lines[name]:
            reddit_dct = json.loads(line)
            if (
                reddit_dct.get('num_comments', 1) > 0
                and reddit_dct.get('score', 0)
                and reddit_dct.get('score', 0) >= 2
                and (mode == 'submissions' or valid_comment(reddit_dct))
            ):
                reddit_res = {}
                for k in key_list:
                    if k in ['title', 'selftext', 'body']:
                        if reddit_dct[k].lower() in ['[removed]', '[deleted]']:
                            reddit_dct[k] = ''
                        txt, url_list = word_url_tokenize(reddit_dct[k])
                        reddit_res[k] = (' '.join(txt.split()), url_list)
                    else:
                        reddit_res[k] = reddit_dct[k]
                processed_items[name] += [reddit_res]
    print(
        "Total found %d" % (sum(len(items) for items in processed_items.values())),
        time() - st_time,
    )
    return processed_items
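# A hypothetical refactor sketch of the extension dispatch above: one helper
# that yields text lines from an .xz, .bz2, or .zst dump and owns the cleanup
# of its underlying handles. `open_dump` is an illustrative name, not part of
# the original code; the third-party zstandard package provides ZstdDecompressor.
import bz2
import contextlib
import io
import lzma

import zstandard as zstd


@contextlib.contextmanager
def open_dump(f_name):
    ext = f_name.split('.')[-1]
    if ext == 'xz':
        with lzma.open(f_name, 'rt') as f:
            yield f
    elif ext == 'bz2':
        with bz2.open(f_name, 'rt') as f:
            yield f
    elif ext == 'zst':
        # The raw handle is closed by the enclosing with-block on exit.
        with open(f_name, 'rb') as fh:
            reader = zstd.ZstdDecompressor().stream_reader(fh)
            yield io.TextIOWrapper(reader, encoding='utf-8')
    else:
        raise ValueError(f'unsupported dump format: {ext}')

# Usage inside download_and_process would then collapse the if/elif chain:
#     with open_dump(f_name) as f:
#         for line in f:
#             ...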