def collect_experiment(key, root_uri, output_dir, get_pred_package=False):
    print('\nCollecting experiment {}...\n'.format(key))

    # Find the prediction package and eval output for this experiment,
    # using list_paths for S3 URIs and glob for the local file system.
    if root_uri.startswith('s3://'):
        predict_package_uris = list_paths(
            join(root_uri, key, 'bundle'), ext='predict_package.zip')
        eval_json_uris = list_paths(
            join(root_uri, key, 'eval'), ext='eval.json')
    else:
        predict_package_uris = glob.glob(
            join(root_uri, key, 'bundle', '*', 'predict_package.zip'))
        eval_json_uris = glob.glob(
            join(root_uri, key, 'eval', '*', 'eval.json'))

    if len(predict_package_uris) > 1 or len(eval_json_uris) > 1:
        print('Cannot collect from key with multiple experiments!!!')
        return

    if len(predict_package_uris) == 0 or len(eval_json_uris) == 0:
        print('Missing output!!!')
        return

    predict_package_uri = predict_package_uris[0]
    eval_json_uri = eval_json_uris[0]
    make_dir(join(output_dir, key))

    if get_pred_package:
        download_or_copy(predict_package_uri, join(output_dir, key))

    download_or_copy(eval_json_uri, join(output_dir, key))
    eval_json = file_to_json(join(output_dir, key, 'eval.json'))
    pprint.pprint(eval_json['overall'], indent=4)
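For context, a minimal sketch of how collect_experiment might be driven; the experiment key, bucket, and output directory below are hypothetical placeholders, not values from the original code.

# Hypothetical invocation; 'my-experiment' and both URIs are placeholders.
collect_experiment(
    'my-experiment',
    's3://my-bucket/experiments',
    '/tmp/collected',
    get_pred_package=True)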
def test_list_paths_s3(self):
    path = os.path.join(self.tmp_dir.name, 'lorem', 'ipsum.txt')
    s3_path = 's3://{}/xxx/lorem.txt'.format(self.bucket_name)
    s3_directory = 's3://{}/xxx/'.format(self.bucket_name)
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)

    # Upload a single file, then check that listing the S3 prefix finds it.
    str_to_file(self.lorem, path)
    upload_or_copy(path, s3_path)
    self.assertEqual(len(list_paths(s3_directory)), 1)
def unzip_data(self) -> List[str]:
    """Unzip dataset zip files.

    Returns:
        paths to directories that each contain contents of one zip file
    """
    cfg = self.cfg
    if cfg.data.uri.startswith('s3://') or cfg.data.uri.startswith('/'):
        data_uri = cfg.data.uri
    else:
        data_uri = join(cfg.base_uri, cfg.data.uri)

    data_dirs = []
    zip_uris = [data_uri] if data_uri.endswith('.zip') else list_paths(
        data_uri, 'zip')
    for zip_ind, zip_uri in enumerate(zip_uris):
        zip_path = get_local_path(zip_uri, self.data_cache_dir)
        if not isfile(zip_path):
            zip_path = download_if_needed(zip_uri, self.data_cache_dir)
        with zipfile.ZipFile(zip_path, 'r') as zipf:
            data_dir = join(self.tmp_dir, 'data', str(zip_ind))
            data_dirs.append(data_dir)
            zipf.extractall(data_dir)
    return data_dirs
def collect_eval_dir(root_uri):
    eval_json_uris = list_paths(join(root_uri, 'eval'), ext='eval.json')
    for eval_json_uri in eval_json_uris:
        eval_json = file_to_json(eval_json_uri)
        print(basename(dirname(eval_json_uri)))
        print(eval_json['overall'][-1]['f1'])
        print()
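Both collect_experiment and collect_eval_dir index eval_json['overall'][-1]['f1'], which implies that 'overall' is a list of per-class metric dicts whose last entry carries the aggregate f1. A sketch of that assumed layout; the key names and numbers are illustrative, not taken from a real eval.json.

# Assumed (illustrative) shape of an eval.json file; real class names
# and metric values will differ.
eval_json = {
    'overall': [
        {'class_name': 'building', 'f1': 0.71},
        {'class_name': 'average', 'f1': 0.68},  # last entry: aggregate
    ]
}
print(eval_json['overall'][-1]['f1'])  # 0.68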
def unzip_data(self, uri: Union[str, List[str]]) -> List[str]:
    """Unzip dataset zip files.

    Args:
        uri: a list of URIs of zip files or the URI of a directory
            containing zip files

    Returns:
        paths to directories that each contain contents of one zip file
    """
    cfg = self.cfg
    data_dirs = []

    if isinstance(uri, list):
        zip_uris = uri
    else:
        # TODO generalize this to work with any file system
        if uri.startswith('s3://') or uri.startswith('/'):
            data_uri = uri
        else:
            data_uri = join(cfg.base_uri, uri)
        zip_uris = ([data_uri] if data_uri.endswith('.zip') else
                    list_paths(data_uri, 'zip'))

    for zip_ind, zip_uri in enumerate(zip_uris):
        zip_path = get_local_path(zip_uri, self.data_cache_dir)
        if not isfile(zip_path):
            zip_path = download_if_needed(zip_uri, self.data_cache_dir)
        with zipfile.ZipFile(zip_path, 'r') as zipf:
            data_dir = join(self.tmp_dir, 'data', str(uuid.uuid4()),
                            str(zip_ind))
            data_dirs.append(data_dir)
            zipf.extractall(data_dir)

    return data_dirs
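A hypothetical usage sketch showing the two accepted forms of uri; 'learner' stands in for an instance of the class this method belongs to, and the URIs are placeholders.

# Hypothetical usage; 'learner' and the URIs are placeholders.
data_dirs = learner.unzip_data('s3://my-bucket/datasets/')  # dir of zips
data_dirs = learner.unzip_data([
    's3://my-bucket/datasets/a.zip',
    's3://my-bucket/datasets/b.zip',
])  # explicit list of zip URIs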
def get_scene_ids(self):
    label_dir = os.path.join(self.raw_uri, self.base_dir, self.label_dir)
    label_paths = list_paths(label_dir, ext='.geojson')
    label_re = re.compile(r'.*{}(\d+)\.geojson'.format(self.label_fn_prefix))
    scene_ids = [
        label_re.match(label_path).group(1) for label_path in label_paths
    ]
    return scene_ids
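To make the filename pattern concrete, here is a self-contained sketch of the same regex logic; the prefix and paths are made-up examples, not values from any actual dataset.

import re

# Hypothetical prefix and label paths, for illustration only.
label_fn_prefix = 'buildings_AOI_2_Vegas_img'
label_re = re.compile(r'.*{}(\d+)\.geojson'.format(label_fn_prefix))
label_paths = [
    '/data/labels/buildings_AOI_2_Vegas_img1.geojson',
    '/data/labels/buildings_AOI_2_Vegas_img205.geojson',
]
# The captured group is the numeric scene ID embedded in each filename.
print([label_re.match(p).group(1) for p in label_paths])  # ['1', '205']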
def test_sync_from_dir_noop_local(self):
    path = os.path.join(self.tmp_dir.name, 'lorem', 'ipsum.txt')
    src = os.path.join(self.tmp_dir.name, 'lorem')
    make_dir(src, check_empty=False)
    fs = FileSystem.get_file_system(src, 'r')
    fs.write_bytes(path, bytes([0x00, 0x01]))
    sync_from_dir(src, src, delete=True)

    self.assertEqual(len(list_paths(src)), 1)
def test_copy_to_local(self):
    path1 = os.path.join(self.tmp_dir.name, 'lorem', 'ipsum.txt')
    path2 = os.path.join(self.tmp_dir.name, 'yyy', 'ipsum.txt')
    dir1 = os.path.dirname(path1)
    dir2 = os.path.dirname(path2)
    make_dir(dir1, check_empty=False)
    make_dir(dir2, check_empty=False)
    str_to_file(self.lorem, path1)
    upload_or_copy(path1, path2)

    self.assertEqual(len(list_paths(dir2)), 1)
def test_sync_to_dir_local(self):
    path = os.path.join(self.tmp_dir.name, 'lorem', 'ipsum.txt')
    src = os.path.dirname(path)
    dst = os.path.join(self.tmp_dir.name, 'xxx')
    make_dir(src, check_empty=False)
    make_dir(dst, check_empty=False)
    fs = FileSystem.get_file_system(path, 'r')
    fs.write_bytes(path, bytes([0x00, 0x01]))
    sync_to_dir(src, dst, delete=True)

    self.assertEqual(len(list_paths(dst)), 1)
def setup_data(self):
    cfg = self.cfg
    batch_sz = cfg.solver.batch_sz
    num_workers = cfg.data.num_workers

    # download and unzip data
    if cfg.data.uri.startswith('s3://') or cfg.data.uri.startswith('/'):
        data_uri = cfg.data.uri
    else:
        data_uri = join(cfg.base_uri, cfg.data.uri)

    data_dirs = []
    zip_uris = [data_uri] if data_uri.endswith('.zip') else list_paths(
        data_uri, 'zip')
    for zip_ind, zip_uri in enumerate(zip_uris):
        zip_path = get_local_path(zip_uri, self.data_cache_dir)
        if not isfile(zip_path):
            zip_path = download_if_needed(zip_uri, self.data_cache_dir)
        with zipfile.ZipFile(zip_path, 'r') as zipf:
            data_dir = join(self.tmp_dir, 'data', str(zip_ind))
            data_dirs.append(data_dir)
            zipf.extractall(data_dir)

    # build datasets -- one per zip file, then merge them into a single
    # dataset per split
    train_ds = []
    valid_ds = []
    test_ds = []
    for data_dir in data_dirs:
        train_dir = join(data_dir, 'train')
        valid_dir = join(data_dir, 'valid')

        transform = Compose(
            [Resize((cfg.data.img_sz, cfg.data.img_sz)), ToTensor()])
        aug_transform = Compose([
            RandomHorizontalFlip(),
            RandomVerticalFlip(),
            ColorJitter(0.1, 0.1, 0.1, 0.1),
            Resize((cfg.data.img_sz, cfg.data.img_sz)),
            ToTensor()
        ])

        if isdir(train_dir):
            if cfg.overfit_mode:
                train_ds.append(
                    ImageRegressionDataset(
                        train_dir, cfg.data.class_names, transform=transform))
            else:
                train_ds.append(
                    ImageRegressionDataset(
                        train_dir,
                        cfg.data.class_names,
                        transform=aug_transform))

        if isdir(valid_dir):
            valid_ds.append(
                ImageRegressionDataset(
                    valid_dir, cfg.data.class_names, transform=transform))
            test_ds.append(
                ImageRegressionDataset(
                    valid_dir, cfg.data.class_names, transform=transform))

    train_ds, valid_ds, test_ds = (ConcatDataset(train_ds),
                                   ConcatDataset(valid_ds),
                                   ConcatDataset(test_ds))

    if cfg.overfit_mode:
        train_ds = Subset(train_ds, range(batch_sz))
        valid_ds = train_ds
        test_ds = train_ds
    elif cfg.test_mode:
        train_ds = Subset(train_ds, range(batch_sz))
        valid_ds = Subset(valid_ds, range(batch_sz))
        test_ds = Subset(test_ds, range(batch_sz))

    train_dl = DataLoader(
        train_ds,
        shuffle=True,
        batch_size=batch_sz,
        num_workers=num_workers,
        pin_memory=True)
    valid_dl = DataLoader(
        valid_ds,
        shuffle=True,
        batch_size=batch_sz,
        num_workers=num_workers,
        pin_memory=True)
    test_dl = DataLoader(
        test_ds,
        shuffle=True,
        batch_size=batch_sz,
        num_workers=num_workers,
        pin_memory=True)

    self.train_ds, self.valid_ds, self.test_ds = (train_ds, valid_ds,
                                                  test_ds)
    self.train_dl, self.valid_dl, self.test_dl = (train_dl, valid_dl,
                                                  test_dl)
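As a sanity check on the overfit_mode branch above: train, validation, and test all become the same Subset of the first batch_sz samples, so every loader yields exactly one identical batch. A minimal standalone sketch with a fake dataset standing in for ImageRegressionDataset:

import torch
from torch.utils.data import DataLoader, Subset, TensorDataset

# Illustrative only: a fake 100-sample regression dataset.
full_ds = TensorDataset(torch.randn(100, 3), torch.randn(100, 1))
batch_sz = 4
train_ds = Subset(full_ds, range(batch_sz))
train_dl = DataLoader(train_ds, shuffle=True, batch_size=batch_sz)
print(len(train_dl))  # 1 -- a single repeated batch, as in overfit_mode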