Exemple #1
0
def collect_experiment(key, root_uri, output_dir, get_pred_package=False):
    """Copy the eval output of one experiment locally and print its summary.

    Searches ``<root_uri>/<key>/bundle`` for ``predict_package.zip`` and
    ``<root_uri>/<key>/eval`` for ``eval.json``. When exactly one of each is
    found, the eval JSON (and, on request, the predict package) is copied
    into ``<output_dir>/<key>`` and the ``'overall'`` entry of the eval JSON
    is pretty-printed. Otherwise a message is printed and nothing is copied.

    Args:
        key: experiment key; used as a sub-directory of both root_uri and
            output_dir
        root_uri: root of the experiment output. URIs starting with 's3://'
            are searched with list_paths; anything else is globbed on the
            local file system, one directory level below bundle/ and eval/
        output_dir: directory that receives the collected files
        get_pred_package: if True, the predict package is copied as well

    Returns:
        None
    """
    print('\nCollecting experiment {}...\n'.format(key))

    bundle_root = join(root_uri, key, 'bundle')
    eval_root = join(root_uri, key, 'eval')
    if root_uri.startswith('s3://'):
        package_matches = list_paths(bundle_root, ext='predict_package.zip')
        eval_matches = list_paths(eval_root, ext='eval.json')
    else:
        package_matches = glob.glob(
            join(bundle_root, '*', 'predict_package.zip'))
        eval_matches = glob.glob(join(eval_root, '*', 'eval.json'))

    # Both kinds of output must be present exactly once, even when the
    # predict package itself is not going to be copied.
    match_counts = (len(package_matches), len(eval_matches))
    if max(match_counts) > 1:
        print('Cannot collect from key with multiple experiments!!!')
        return
    if min(match_counts) == 0:
        print('Missing output!!!')
        return

    experiment_dir = join(output_dir, key)
    make_dir(experiment_dir)
    if get_pred_package:
        download_or_copy(package_matches[0], experiment_dir)
    download_or_copy(eval_matches[0], experiment_dir)

    eval_json = file_to_json(join(experiment_dir, 'eval.json'))
    pprint.pprint(eval_json['overall'], indent=4)
    def test_list_paths_s3(self):
        """Upload one file to S3 and check that listing its prefix finds it."""
        bucket = self.bucket_name
        s3_directory = 's3://{}/xxx/'.format(bucket)
        s3_path = 's3://{}/xxx/lorem.txt'.format(bucket)

        # Write the file locally first so there is something to upload.
        path = os.path.join(self.tmp_dir.name, 'lorem', 'ipsum.txt')
        make_dir(os.path.dirname(path), check_empty=False)
        str_to_file(self.lorem, path)
        upload_or_copy(path, s3_path)

        # The result of this first listing is discarded; only the second
        # listing is asserted on.
        list_paths(s3_directory)
        listed = list_paths(s3_directory)
        self.assertEqual(len(listed), 1)
    def unzip_data(self) -> List[str]:
        """Fetch the dataset zip file(s) and extract each into its own directory.

        The dataset location is cfg.data.uri. Unless it starts with 's3://'
        or '/', it is joined onto cfg.base_uri. A location ending in '.zip'
        is used as the only zip file; any other location is handed to
        list_paths to find the zip files.

        Returns:
            paths to directories that each contain contents of one zip file
        """
        cfg = self.cfg
        data_uri = cfg.data.uri
        if not data_uri.startswith(('s3://', '/')):
            data_uri = join(cfg.base_uri, data_uri)

        if data_uri.endswith('.zip'):
            zip_uris = [data_uri]
        else:
            zip_uris = list_paths(data_uri, 'zip')

        data_dirs = []
        for idx, zip_uri in enumerate(zip_uris):
            # Only download when the expected local path is not a file yet.
            zip_path = get_local_path(zip_uri, self.data_cache_dir)
            if not isfile(zip_path):
                zip_path = download_if_needed(zip_uri, self.data_cache_dir)

            # One numbered directory per zip file, under <tmp_dir>/data.
            extract_dir = join(self.tmp_dir, 'data', str(idx))
            with zipfile.ZipFile(zip_path, 'r') as zipf:
                data_dirs.append(extract_dir)
                zipf.extractall(extract_dir)

        return data_dirs
Exemple #4
0
def collect_eval_dir(root_uri):
    """Print the last overall F1 value of every eval.json under root_uri.

    For each 'eval.json' that list_paths finds under ``<root_uri>/eval``,
    prints the name of the directory containing the file, then the 'f1'
    value of the last entry of the file's 'overall' list, then a blank line.

    Args:
        root_uri: URI whose 'eval' sub-directory is searched
    """
    eval_dir = join(root_uri, 'eval')
    for uri in list_paths(eval_dir, ext='eval.json'):
        contents = file_to_json(uri)
        parent_dir_name = basename(dirname(uri))
        print(parent_dir_name)
        last_overall = contents['overall'][-1]
        print(last_overall['f1'])
        print()
Exemple #5
0
    def unzip_data(self, uri: Union[str, List[str]]) -> List[str]:
        """Fetch zip files and extract each one into a fresh directory.

        Args:
            uri: a list of URIs of zip files or the URI of a directory containing
                zip files. A single URI that does not start with 's3://' or
                '/' is joined onto cfg.base_uri; a single URI ending in
                '.zip' is used as the only zip file.

        Returns:
            paths to directories that each contain contents of one zip file
        """
        cfg = self.cfg

        if isinstance(uri, list):
            zip_uris = uri
        else:
            # TODO generalize this to work with any file system
            is_absolute = uri.startswith(('s3://', '/'))
            data_uri = uri if is_absolute else join(cfg.base_uri, uri)
            if data_uri.endswith('.zip'):
                zip_uris = [data_uri]
            else:
                zip_uris = list_paths(data_uri, 'zip')

        data_dirs = []
        for idx, zip_uri in enumerate(zip_uris):
            # Only download when the expected local path is not a file yet.
            zip_path = get_local_path(zip_uri, self.data_cache_dir)
            if not isfile(zip_path):
                zip_path = download_if_needed(zip_uri, self.data_cache_dir)

            with zipfile.ZipFile(zip_path, 'r') as zipf:
                # A random UUID component gives every extraction its own
                # directory under <tmp_dir>/data.
                extract_dir = join(self.tmp_dir, 'data', str(uuid.uuid4()),
                                   str(idx))
                data_dirs.append(extract_dir)
                zipf.extractall(extract_dir)

        return data_dirs
 def get_scene_ids(self):
     """Return the scene ids encoded in the label file names.

     Lists the '.geojson' files under <raw_uri>/<base_dir>/<label_dir> and,
     for each path, extracts the run of digits between label_fn_prefix and
     the '.geojson' extension.

     Returns:
         list of scene ids (as strings) in the order the paths were listed.
         Paths that do not fit the file name pattern are skipped.
     """
     label_dir = os.path.join(self.raw_uri, self.base_dir, self.label_dir)
     label_paths = list_paths(label_dir, ext='.geojson')
     label_re = re.compile(r'.*{}(\d+)\.geojson'.format(
         self.label_fn_prefix))
     # match() returns None for a path that does not fit the pattern (e.g.
     # a .geojson file without the prefix), so filter those out instead of
     # failing with AttributeError on None.group().
     matches = (label_re.match(label_path) for label_path in label_paths)
     return [match.group(1) for match in matches if match is not None]
    def test_sync_from_dir_noop_local(self):
        """Sync a local directory onto itself; its one file must stay listed."""
        src = os.path.join(self.tmp_dir.name, 'lorem')
        path = os.path.join(src, 'ipsum.txt')
        make_dir(src, check_empty=False)

        # Put a single two-byte file into the directory.
        file_system = FileSystem.get_file_system(src, 'r')
        file_system.write_bytes(path, bytes([0x00, 0x01]))

        # Source and destination are the same directory.
        sync_from_dir(src, src, delete=True)

        remaining = list_paths(src)
        self.assertEqual(len(remaining), 1)
    def test_copy_to_local(self):
        """Copy a local file to another local directory and list it there."""
        tmp_root = self.tmp_dir.name
        dir1 = os.path.join(tmp_root, 'lorem')
        dir2 = os.path.join(tmp_root, 'yyy')
        for directory in (dir1, dir2):
            make_dir(directory, check_empty=False)

        path1 = os.path.join(dir1, 'ipsum.txt')
        path2 = os.path.join(dir2, 'ipsum.txt')
        str_to_file(self.lorem, path1)
        upload_or_copy(path1, path2)

        copied = list_paths(dir2)
        self.assertEqual(len(copied), 1)
    def test_sync_to_dir_local(self):
        """Sync a local directory holding one file into another local directory."""
        tmp_root = self.tmp_dir.name
        src = os.path.join(tmp_root, 'lorem')
        dst = os.path.join(tmp_root, 'xxx')
        path = os.path.join(src, 'ipsum.txt')
        for directory in (src, dst):
            make_dir(directory, check_empty=False)

        # Put a single two-byte file into the source directory.
        file_system = FileSystem.get_file_system(path, 'r')
        file_system.write_bytes(path, bytes([0x00, 0x01]))

        sync_to_dir(src, dst, delete=True)

        synced = list_paths(dst)
        self.assertEqual(len(synced), 1)
    def setup_data(self):
        """Download and unzip the data, then build datasets and data loaders.

        Steps:
            1. Resolve cfg.data.uri (joined onto cfg.base_uri unless it
               starts with 's3://' or '/') and extract every zip file into
               <tmp_dir>/data/<index>.
            2. Build one ImageRegressionDataset per existing 'train' and
               'valid' sub-directory and concatenate them. The test dataset
               is built from the 'valid' directories as well.
            3. In overfit mode, all three datasets become the first batch_sz
               training samples; in test mode each dataset is cut down to
               its first batch_sz samples.
            4. Wrap each dataset in a DataLoader.

        Sets self.train_ds, self.valid_ds, self.test_ds, self.train_dl,
        self.valid_dl and self.test_dl.
        """
        cfg = self.cfg
        batch_sz = cfg.solver.batch_sz
        num_workers = cfg.data.num_workers

        # download and unzip data
        data_uri = cfg.data.uri
        if not data_uri.startswith(('s3://', '/')):
            data_uri = join(cfg.base_uri, data_uri)

        if data_uri.endswith('.zip'):
            zip_uris = [data_uri]
        else:
            zip_uris = list_paths(data_uri, 'zip')

        data_dirs = []
        for idx, zip_uri in enumerate(zip_uris):
            # Only download when the expected local path is not a file yet.
            zip_path = get_local_path(zip_uri, self.data_cache_dir)
            if not isfile(zip_path):
                zip_path = download_if_needed(zip_uri, self.data_cache_dir)
            extract_dir = join(self.tmp_dir, 'data', str(idx))
            with zipfile.ZipFile(zip_path, 'r') as zipf:
                data_dirs.append(extract_dir)
                zipf.extractall(extract_dir)

        def make_dataset(directory, tfm):
            return ImageRegressionDataset(directory,
                                          cfg.data.class_names,
                                          transform=tfm)

        # build datasets -- one per zip file and then merge them into a single dataset
        train_parts, valid_parts, test_parts = [], [], []
        for data_dir in data_dirs:
            img_size = (cfg.data.img_sz, cfg.data.img_sz)
            plain_tf = Compose([Resize(img_size), ToTensor()])
            aug_tf = Compose([
                RandomHorizontalFlip(),
                RandomVerticalFlip(),
                ColorJitter(0.1, 0.1, 0.1, 0.1),
                Resize(img_size),
                ToTensor(),
            ])

            train_dir = join(data_dir, 'train')
            if isdir(train_dir):
                # Augmentation is switched off when overfitting.
                train_tf = plain_tf if cfg.overfit_mode else aug_tf
                train_parts.append(make_dataset(train_dir, train_tf))

            valid_dir = join(data_dir, 'valid')
            if isdir(valid_dir):
                # The test set is a second dataset over the validation data.
                valid_parts.append(make_dataset(valid_dir, plain_tf))
                test_parts.append(make_dataset(valid_dir, plain_tf))

        train_ds = ConcatDataset(train_parts)
        valid_ds = ConcatDataset(valid_parts)
        test_ds = ConcatDataset(test_parts)

        if cfg.overfit_mode:
            train_ds = Subset(train_ds, range(batch_sz))
            valid_ds = test_ds = train_ds
        elif cfg.test_mode:
            train_ds, valid_ds, test_ds = [
                Subset(ds, range(batch_sz))
                for ds in (train_ds, valid_ds, test_ds)
            ]

        def make_loader(dataset):
            # NOTE(review): the validation and test loaders are shuffled
            # too -- confirm this is intended.
            return DataLoader(dataset,
                              shuffle=True,
                              batch_size=batch_sz,
                              num_workers=num_workers,
                              pin_memory=True)

        # Build all loaders before touching self, so a failure here leaves
        # the instance attributes unchanged.
        train_dl, valid_dl, test_dl = [
            make_loader(ds) for ds in (train_ds, valid_ds, test_ds)
        ]

        self.train_ds, self.valid_ds, self.test_ds = (train_ds, valid_ds,
                                                      test_ds)
        self.train_dl, self.valid_dl, self.test_dl = (train_dl, valid_dl,
                                                      test_dl)