Ejemplo n.º 1
0
def cached_download(uris, data_dir):
    paths = []
    for uri in uris:
        path = get_local_path(uri, data_dir)
        paths.append(path)
        if not isfile(path):
            download_if_needed(uri, data_dir)
    return paths
Ejemplo n.º 2
0
    def test_download_if_needed_local(self):
        with self.assertRaises(NotReadableError):
            download_if_needed(self.local_path, self.temp_dir.name)

        str_to_file(self.content_str, self.local_path)
        upload_or_copy(self.local_path, self.local_path)
        local_path = download_if_needed(self.local_path, self.temp_dir.name)
        self.assertEqual(local_path, self.local_path)
Ejemplo n.º 3
0
    def test_copy_from_http(self):
        http_path = ('https://raw.githubusercontent.com/tensorflow/models/'
                     '17fa52864bfc7a7444a8b921d8a8eb1669e14ebd/README.md')
        expected = os.path.join(
            self.temp_dir.name, 'http', 'raw.githubusercontent.com',
            'tensorflow/models',
            '17fa52864bfc7a7444a8b921d8a8eb1669e14ebd/README.md')
        download_if_needed(http_path, self.temp_dir.name)

        self.assertTrue(file_exists(expected))
        os.remove(expected)
Ejemplo n.º 4
0
    def test_download_if_needed_s3(self):
        with self.assertRaises(NotReadableError):
            download_if_needed(self.s3_path, self.temp_dir.name)

        str_to_file(self.content_str, self.local_path)
        upload_or_copy(self.local_path, self.s3_path)
        local_path = download_if_needed(self.s3_path, self.temp_dir.name)
        content_str = file_to_str(local_path)
        self.assertEqual(self.content_str, content_str)

        wrong_path = 's3://wrongpath/x.txt'
        with self.assertRaises(NotWritableError):
            upload_or_copy(local_path, wrong_path)
 def load_model(self, tmp_dir):
     """Load the model in preparation for one or more prediction calls."""
     if self.inf_learner is None:
         model_uri = self.config['model_uri']
         model_path = download_if_needed(model_uri, tmp_dir)
         self.inf_learner = load_learner(dirname(model_path),
                                         basename(model_path))
Ejemplo n.º 6
0
    def load_model(self, tmp_dir):
        from rastervision.backend.keras_classification.builders \
            import model_builder

        if self.model is None:
            model_path = download_if_needed(self.config.model_uri, tmp_dir)
            self.model = model_builder.build_from_path(model_path)
            self.model._make_predict_function()
Ejemplo n.º 7
0
    def load_model(self, tmp_dir):
        import tensorflow as tf

        # Load and memoize the detection graph and TF session.
        if self.detection_graph is None:
            model_path = download_if_needed(self.config.model_uri, tmp_dir)
            self.detection_graph = load_frozen_graph(model_path)
            self.session = tf.Session(graph=self.detection_graph)
Ejemplo n.º 8
0
 def load_model(self, tmp_dir):
     """Load the model in preparation for one or more prediction calls."""
     if self.inf_learner is None:
         self.log_options()
         model_uri = self.backend_opts.model_uri
         model_path = download_if_needed(model_uri, tmp_dir)
         self.inf_learner = load_learner(
             dirname(model_path), basename(model_path))
Ejemplo n.º 9
0
    def from_model_bundle(model_bundle_uri, tmp_dir):
        model_bundle_path = download_if_needed(model_bundle_uri, tmp_dir)
        model_bundle_dir = join(tmp_dir, 'model-bundle')
        unzip(model_bundle_path, model_bundle_dir)

        config_path = join(model_bundle_dir, 'config.json')
        model_path = join(model_bundle_dir, 'model.pth')
        cfg = build_config(file_to_json(config_path))
        return cfg.get_learner()(cfg, tmp_dir, model_path=model_path)
 def load_model(self, tmp_dir):
     """Load the model in preparation for one or more prediction calls."""
     if self.inf_learner is None:
         self.print_options()
         model_uri = self.backend_opts.model_uri
         model_path = download_if_needed(model_uri, tmp_dir)
         self.inf_learner = load_learner(
             dirname(model_path), basename(model_path))
         self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
Ejemplo n.º 11
0
    def _download_data(self, temp_dir):
        """Download any data needed for this Raster Source.

        Return a single local path representing the image or a VRT of the data.
        """
        if len(self.uris) == 1:
            return download_if_needed(self.uris[0], temp_dir)
        else:
            return download_and_build_vrt(self.uris, temp_dir)
Ejemplo n.º 12
0
    def download_if_needed(self, uri):
        """Download file if it's remote.

        Args:
            uri: (string) URI of file, possibly remote

        Returns:
            (string) path of local file that was downloaded
        """
        return download_if_needed(uri, self.temp_dir)
 def load_model(self, tmp_dir):
     """Load the model in preparation for one or more prediction calls."""
     if self.raster_func is None:
         self.print_options()
         model_uri = self.backend_opts.model_uri
         model_path = download_if_needed(model_uri, tmp_dir)
         with open(model_path, 'r') as func_file:
             func_str = func_file.read()
             print('FUNC', func_str)
             parsed_func = self._toolbox.compile(expr=func_str)
             self.raster_func = parsed_func
    def load_model(self, tmp_dir, ctx=None):
        """Load ResNet50v2 and load trained weights.

        Args:
            tmp_dir: temporary directory

        Returns:
            None
        """
        if self.model is None:
            self.model = build_model(ctx, len(self.class_map))
            model_files = ModelFiles(self.config.training_output_uri, tmp_dir)
            model_path = download_if_needed(model_files.model_uri, tmp_dir)
            self.model.load_parameters(model_path)
Ejemplo n.º 15
0
    def load_model(self, tmp_dir):
        """Load the model in preparation for one or more prediction calls."""
        if self.model is None:
            model_uri = self.backend_opts.model_uri
            model_path = download_if_needed(model_uri, tmp_dir)

            num_classes = len(self.class_map)
            model = get_model(self.train_opts.model_arch,
                              num_classes,
                              pretrained=True)
            model = model.to(self.device)
            model.load_state_dict(
                torch.load(model_path, map_location=self.device))
            self.model = model
    def load_model(self, tmp_dir):
        """Load the model in preparation for one or more prediction calls."""
        if self.model is None:
            model_uri = self.backend_opts.model_uri
            model_path = download_if_needed(model_uri, tmp_dir)

            # add one for background class
            num_classes = len(self.task_config.class_map) + 1
            model = MyFasterRCNN(self.train_opts.model_arch,
                                 num_classes,
                                 self.task_config.chip_size,
                                 pretrained=False)
            model = model.to(self.device)
            model.load_state_dict(
                torch.load(model_path, map_location=self.device))
            self.model = model
Ejemplo n.º 17
0
    def load_model(self, tmp_dir: str):
        """Load the model in preparation for one or more prediction calls.

        Args:
             tmp_dir: (str) temporary directory to use
        """
        # noqa Courtesy of https://github.com/tensorflow/models/blob/cbbb2ffcde66e646d4a47628ffe2ece2322b64e8/research/deeplab/deeplab_demo.ipynb
        import tensorflow as tf
        if self.sess is None:
            FROZEN_GRAPH_NAME = download_if_needed(
                self.backend_config.model_uri, tmp_dir)
            graph = tf.Graph()
            with open(FROZEN_GRAPH_NAME, 'rb') as data:
                graph_def = tf.GraphDef.FromString(data.read())
            with graph.as_default():
                tf.import_graph_def(graph_def, name='')
            self.sess = tf.Session(graph=graph)
Ejemplo n.º 18
0
def _postprocess(pred_uri, experiment_id, root_uri):
    tmp_pred_uri = download_if_needed(pred_uri, "/opt/data/predict/")
    tmp_postprocess_uri = tmp_pred_uri.replace("/predict/", "/postprocess/")

    os.makedirs(dirname(tmp_postprocess_uri), exist_ok=True)
    out_uri = join(root_uri, "postprocess", experiment_id, basename(pred_uri))

    with rasterio.open(tmp_pred_uri) as src:
        img = src.read()
        img = np.where(img == 2, 0, img)
        profile = src.profile
    with rasterio.open(tmp_postprocess_uri, "w", **profile) as dst:
        dst.write(img)

    upload_or_copy(tmp_postprocess_uri, out_uri)
    for t in (tmp_pred_uri, tmp_postprocess_uri):
        os.remove(t)
Ejemplo n.º 19
0
    def load_model(self, tmp_dir):
        """Load the model in preparation for one or more prediction calls."""
        import torch
        from tile2vec.tilenet import make_tilenet

        if self.model is None:
            model_path = download_if_needed(self.backend_config.model_uri,
                                            tmp_dir)

            # TODO: config
            in_channels = len(
                self.backend_config.scenes[0].raster_source.channel_order)
            z_dim = 512

            self.model = make_tilenet(in_channels=in_channels, z_dim=z_dim)
            if self.cuda: self.model.cuda()
            self.model.load_state_dict(torch.load(model_path))
            self.model.eval()
Ejemplo n.º 20
0
    def _load_from_files(self, plugin_paths):
        if not plugin_paths:
            return

        self.plugin_sources = []

        plugin_base = PluginBase(package='rastervision.plugins')
        for uri in plugin_paths:
            plugin_name = os.path.splitext(os.path.basename(uri))[0]
            plugin_path = os.path.join(self.plugin_root_dir, plugin_name)
            fs = rv._registry.get_file_system(uri, search_plugins=False)
            local_path = download_if_needed(uri, plugin_path, fs=fs)
            local_dir = os.path.dirname(local_path)

            plugin_source = plugin_base.make_plugin_source(
                searchpath=[local_dir])

            # We're required to hang onto the source
            # to keep it from getting GC'd.
            self.plugin_sources.append(plugin_source)

            self._load_plugin(plugin_source.load_plugin(plugin_name), uri)
 def create_local(self, tmp_dir):
     new_uri = download_if_needed(self.uri, tmp_dir)
     return self.to_builder() \
                .with_uri(new_uri) \
                .build()
Ejemplo n.º 22
0
    def train(self, tmp_dir):
        """Train a model.

        This downloads any previous output saved to the train_uri,
        starts training (or resumes from a checkpoint), periodically
        syncs contents of train_dir to train_uri and after training finishes.

        Args:
            tmp_dir: (str) path to temp directory
        """
        self.log_options()

        # Sync output of previous training run from cloud.
        train_uri = self.backend_opts.train_uri
        train_dir = get_local_path(train_uri, tmp_dir)
        make_dir(train_dir)
        sync_from_dir(train_uri, train_dir)

        # Get zip file for each group, and unzip them into chip_dir.
        chip_dir = join(tmp_dir, 'chips')
        make_dir(chip_dir)
        for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
            zip_path = download_if_needed(zip_uri, tmp_dir)
            with zipfile.ZipFile(zip_path, 'r') as zipf:
                zipf.extractall(chip_dir)

        # Setup data loader.
        batch_size = self.train_opts.batch_size
        chip_size = self.task_config.chip_size
        class_names = self.class_map.get_class_names()
        databunch = build_databunch(chip_dir, chip_size, batch_size,
                                    class_names)
        log.info(databunch)
        num_labels = len(databunch.label_names)
        if self.train_opts.debug:
            make_debug_chips(databunch, self.class_map, tmp_dir, train_uri)

        # Setup model
        num_labels = len(databunch.label_names)
        model = get_model(self.train_opts.model_arch,
                          num_labels,
                          pretrained=True)
        model = model.to(self.device)
        model_path = join(train_dir, 'model')

        # Load weights from a pretrained model.
        pretrained_uri = self.backend_opts.pretrained_uri
        if pretrained_uri:
            log.info('Loading weights from pretrained_uri: {}'.format(
                pretrained_uri))
            pretrained_path = download_if_needed(pretrained_uri, tmp_dir)
            model.load_state_dict(
                torch.load(pretrained_path, map_location=self.device))

        # Possibly resume training from checkpoint.
        start_epoch = 0
        train_state_path = join(train_dir, 'train_state.json')
        if isfile(train_state_path):
            log.info('Resuming from checkpoint: {}\n'.format(model_path))
            train_state = file_to_json(train_state_path)
            start_epoch = train_state['epoch'] + 1
            model.load_state_dict(
                torch.load(model_path, map_location=self.device))

        # Write header of log CSV file.
        metric_names = ['precision', 'recall', 'f1']
        log_path = join(train_dir, 'log.csv')
        if not isfile(log_path):
            with open(log_path, 'w') as log_file:
                log_writer = csv.writer(log_file)
                row = ['epoch', 'time', 'train_loss'] + metric_names
                log_writer.writerow(row)

        # Setup Tensorboard logging.
        if self.train_opts.log_tensorboard:
            log_dir = join(train_dir, 'tb-logs')
            make_dir(log_dir)
            tb_writer = SummaryWriter(log_dir=log_dir)
            if self.train_opts.run_tensorboard:
                log.info('Starting tensorboard process')
                tensorboard_process = Popen(
                    ['tensorboard', '--logdir={}'.format(log_dir)])
                terminate_at_exit(tensorboard_process)

        # Setup optimizer, loss, and LR scheduler.
        loss_fn = torch.nn.CrossEntropyLoss()
        lr = self.train_opts.lr
        opt = optim.Adam(model.parameters(), lr=lr)
        step_scheduler, epoch_scheduler = None, None
        num_epochs = self.train_opts.num_epochs

        if self.train_opts.one_cycle and num_epochs > 1:
            steps_per_epoch = len(databunch.train_ds) // batch_size
            total_steps = num_epochs * steps_per_epoch
            step_size_up = (num_epochs // 2) * steps_per_epoch
            step_size_down = total_steps - step_size_up
            step_scheduler = CyclicLR(opt,
                                      base_lr=lr / 10,
                                      max_lr=lr,
                                      step_size_up=step_size_up,
                                      step_size_down=step_size_down,
                                      cycle_momentum=False)
            for _ in range(start_epoch * steps_per_epoch):
                step_scheduler.step()

        # Training loop.
        for epoch in range(start_epoch, num_epochs):
            # Train one epoch.
            log.info('-----------------------------------------------------')
            log.info('epoch: {}'.format(epoch))
            start = time.time()
            train_loss = train_epoch(model, self.device, databunch.train_dl,
                                     opt, loss_fn, step_scheduler)
            if epoch_scheduler:
                epoch_scheduler.step()
            log.info('train loss: {}'.format(train_loss))

            # Validate one epoch.
            metrics = validate_epoch(model, self.device, databunch.valid_dl,
                                     num_labels)
            log.info('validation metrics: {}'.format(metrics))

            # Print elapsed time for epoch.
            end = time.time()
            epoch_time = datetime.timedelta(seconds=end - start)
            log.info('epoch elapsed time: {}'.format(epoch_time))

            # Save model and state.
            torch.save(model.state_dict(), model_path)
            train_state = {'epoch': epoch}
            json_to_file(train_state, train_state_path)

            # Append to log CSV file.
            with open(log_path, 'a') as log_file:
                log_writer = csv.writer(log_file)
                row = [epoch, epoch_time, train_loss]
                row += [metrics[k] for k in metric_names]
                log_writer.writerow(row)

            # Write to Tensorboard log.
            if self.train_opts.log_tensorboard:
                for key, val in metrics.items():
                    tb_writer.add_scalar(key, val, epoch)
                tb_writer.add_scalar('train_loss', train_loss, epoch)
                for name, param in model.named_parameters():
                    tb_writer.add_histogram(name, param, epoch)

            if (train_uri.startswith('s3://')
                    and (((epoch + 1) % self.train_opts.sync_interval) == 0)):
                sync_to_dir(train_dir, train_uri)

        # Close Tensorboard.
        if self.train_opts.log_tensorboard:
            tb_writer.close()
            if self.train_opts.run_tensorboard:
                tensorboard_process.terminate()

        # Since model is exported every epoch, we need some other way to
        # show that training is finished.
        str_to_file('done!', self.backend_opts.train_done_uri)

        # Sync output to cloud.
        sync_to_dir(train_dir, self.backend_opts.train_uri)
Ejemplo n.º 23
0
    def build_data(self):
        cfg = self.cfg
        batch_sz = cfg.solver.batch_sz
        num_workers = cfg.data.num_workers
        label_names = cfg.data.labels

        # download and unzip data
        if cfg.data.data_format == 'image_folder':
            if cfg.data.uri.startswith('s3://') or cfg.data.uri.startswith(
                    '/'):
                data_uri = cfg.data.uri
            else:
                data_uri = join(cfg.base_uri, cfg.data.uri)

            data_dirs = []
            zip_uris = [data_uri] if data_uri.endswith('.zip') else list_paths(
                data_uri, 'zip')
            for zip_ind, zip_uri in enumerate(zip_uris):
                zip_path = get_local_path(zip_uri, self.data_cache_dir)
                if not isfile(zip_path):
                    zip_path = download_if_needed(zip_uri, self.data_cache_dir)
                with zipfile.ZipFile(zip_path, 'r') as zipf:
                    data_dir = join(self.tmp_dir, 'data', str(zip_ind))
                    data_dirs.append(data_dir)
                    zipf.extractall(data_dir)

        train_ds, valid_ds, test_ds = [], [], []
        for data_dir in data_dirs:
            train_dir = join(data_dir, 'train')
            valid_dir = join(data_dir, 'valid')

            # build datasets
            transform = Compose(
                [Resize((cfg.data.img_sz, cfg.data.img_sz)),
                 ToTensor()])
            aug_transform = Compose([
                RandomHorizontalFlip(),
                RandomVerticalFlip(),
                ColorJitter(0.1, 0.1, 0.1, 0.1),
                Resize((cfg.data.img_sz, cfg.data.img_sz)),
                ToTensor()
            ])

            if isdir(train_dir):
                if cfg.overfit_mode:
                    train_ds.append(
                        ImageFolder(
                            train_dir,
                            transform=transform,
                            classes=label_names))
                else:
                    train_ds.append(
                        ImageFolder(
                            train_dir,
                            transform=aug_transform,
                            classes=label_names))

            if isdir(valid_dir):
                valid_ds.append(
                    ImageFolder(
                        valid_dir, transform=transform, classes=label_names))
                test_ds.append(
                    ImageFolder(
                        valid_dir, transform=transform, classes=label_names))

        train_ds, valid_ds, test_ds = \
            ConcatDataset(train_ds), ConcatDataset(valid_ds), ConcatDataset(test_ds)

        if cfg.overfit_mode:
            train_ds = Subset(train_ds, range(batch_sz))
            valid_ds = train_ds
            test_ds = train_ds
        elif cfg.test_mode:
            train_ds = Subset(train_ds, range(batch_sz))
            valid_ds = Subset(valid_ds, range(batch_sz))
            test_ds = Subset(test_ds, range(batch_sz))

        train_dl = DataLoader(
            train_ds,
            shuffle=True,
            batch_size=batch_sz,
            num_workers=num_workers,
            pin_memory=True)
        valid_dl = DataLoader(
            valid_ds,
            shuffle=True,
            batch_size=batch_sz,
            num_workers=num_workers,
            pin_memory=True)
        test_dl = DataLoader(
            test_ds,
            shuffle=True,
            batch_size=batch_sz,
            num_workers=num_workers,
            pin_memory=True)

        self.train_ds, self.valid_ds, self.test_ds = (train_ds, valid_ds,
                                                      test_ds)
        self.train_dl, self.valid_dl, self.test_dl = (train_dl, valid_dl,
                                                      test_dl)
Ejemplo n.º 24
0
 def create_local(self, tmp_dir):
     new_uris = [download_if_needed(uri, tmp_dir) for uri in self.uris]
     return self.to_builder() \
                .with_uris(new_uris) \
                .build()
Ejemplo n.º 25
0
def _zxy2geotiff(tile_schema, zoom, bounds, output_uri, make_cog=False):
    """Generates a GeoTIFF of a bounded region from a ZXY tile server.

    Args:
        tile_schema: (str) the URI schema for zxy tiles (ie. a slippy map tile server)
            of the form /tileserver-uri/{z}/{x}/{y}.png. If {-y} is used, the tiles
            are assumed to be indexed using TMS coordinates, where the y axis starts
            at the southernmost point. The URI can be for http, S3, or the local
            file system.
        zoom: (int) the zoom level to use when retrieving tiles
        bounds: (list) a list of length 4 containing min_lat, min_lng,
            max_lat, max_lng
        output_uri: (str) where to save the GeoTIFF. The URI can be for http, S3, or the
            local file system
    """
    min_lat, min_lng, max_lat, max_lng = bounds
    if min_lat >= max_lat:
        raise ValueError('min_lat must be < max_lat')
    if min_lng >= max_lng:
        raise ValueError('min_lng must be < max_lng')

    is_tms = False
    if '{-y}' in tile_schema:
        tile_schema = tile_schema.replace('{-y}', '{y}')
        is_tms = True

    tmp_dir_obj = tempfile.TemporaryDirectory()
    tmp_dir = tmp_dir_obj.name

    # Get range of tiles that cover bounds.
    output_path = get_local_path(output_uri, tmp_dir)
    tile_sz = 256
    t = mercantile.tile(min_lng, max_lat, zoom)
    xmin, ymin = t.x, t.y
    t = mercantile.tile(max_lng, min_lat, zoom)
    xmax, ymax = t.x, t.y

    # The supplied bounds are contained within the "tile bounds" -- ie. the
    # bounds of the set of tiles that covers the supplied bounds. Therefore,
    # we need to crop out the imagery that lies within the supplied bounds.
    # We do this by computing a top, bottom, left, and right offset in pixel
    # units of the supplied bounds against the tile bounds. Getting the offsets
    # in pixel units involves converting lng/lat to web mercator units since we
    # assume that is the CRS of the tiles. These offsets are then used to crop
    # individual tiles and place them correctly into the output raster.
    nw_merc_x, nw_merc_y = lnglat2merc(min_lng, max_lat)
    left_pix_offset, top_pix_offset = merc2pixel(xmin, ymin, zoom, nw_merc_x,
                                                 nw_merc_y)

    se_merc_x, se_merc_y = lnglat2merc(max_lng, min_lat)
    se_left_pix_offset, se_top_pix_offset = merc2pixel(xmax, ymax, zoom,
                                                       se_merc_x, se_merc_y)
    right_pix_offset = tile_sz - se_left_pix_offset
    bottom_pix_offset = tile_sz - se_top_pix_offset

    uncropped_height = tile_sz * (ymax - ymin + 1)
    uncropped_width = tile_sz * (xmax - xmin + 1)
    height = uncropped_height - top_pix_offset - bottom_pix_offset
    width = uncropped_width - left_pix_offset - right_pix_offset

    transform = rasterio.transform.from_bounds(nw_merc_x, se_merc_y, se_merc_x,
                                               nw_merc_y, width, height)
    with rasterio.open(output_path,
                       'w',
                       driver='GTiff',
                       height=height,
                       width=width,
                       count=3,
                       crs='epsg:3857',
                       transform=transform,
                       dtype=rasterio.uint8) as dataset:
        out_x = 0
        for xi, x in enumerate(range(xmin, xmax + 1)):
            tile_xmin, tile_xmax = 0, tile_sz - 1
            if x == xmin:
                tile_xmin += left_pix_offset
            if x == xmax:
                tile_xmax -= right_pix_offset
            window_width = tile_xmax - tile_xmin + 1

            out_y = 0
            for yi, y in enumerate(range(ymin, ymax + 1)):
                tile_ymin, tile_ymax = 0, tile_sz - 1
                if y == ymin:
                    tile_ymin += top_pix_offset
                if y == ymax:
                    tile_ymax -= bottom_pix_offset
                window_height = tile_ymax - tile_ymin + 1

                # Convert from xyz to tms if needed.
                # https://gist.github.com/tmcw/4954720
                if is_tms:
                    y = (2**zoom) - y - 1
                tile_uri = tile_schema.format(x=x, y=y, z=zoom)
                tile_path = download_if_needed(tile_uri, tmp_dir)
                img = np.array(Image.open(tile_path))
                img = img[tile_ymin:tile_ymax + 1, tile_xmin:tile_xmax + 1, :]

                window = Window(out_x, out_y, window_width, window_height)
                dataset.write(np.transpose(img[:, :, 0:3], (2, 0, 1)),
                              window=window)
                out_y += window_height
            out_x += window_width

    if make_cog:
        create_cog(output_path, output_uri, tmp_dir)
    else:
        upload_or_copy(output_path, output_uri)
Ejemplo n.º 26
0
    def train(self, tmp_dir):
        """Train a model."""
        self.print_options()

        # Sync output of previous training run from cloud.
        train_uri = self.backend_opts.train_uri
        train_dir = get_local_path(train_uri, tmp_dir)
        make_dir(train_dir)
        sync_from_dir(train_uri, train_dir)
        '''
            Get zip file for each group, and unzip them into chip_dir in a
            way that works well with FastAI.

            The resulting directory structure would be:
            <chip_dir>/
                train/
                    training-<uuid1>/
                        <class1>/
                            ...
                        <class2>/
                            ...
                        ...
                    training-<uuid2>/
                        <class1>/
                            ...
                        <class2>/
                            ...
                        ...
                    ...
                val/
                    validation-<uuid1>/
                        <class1>/
                            ...
                        <class2>/
                            ...
                        ...
                    validation-<uuid2>/
                        <class1>/
                            ...
                        <class2>/
                            ...
                        ...
                    ...

        '''
        chip_dir = join(tmp_dir, 'chips/')
        make_dir(chip_dir)
        for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
            zip_name = Path(zip_uri).name
            if zip_name.startswith('train'):
                extract_dir = chip_dir + 'train/'
            elif zip_name.startswith('val'):
                extract_dir = chip_dir + 'val/'
            else:
                continue
            zip_path = download_if_needed(zip_uri, tmp_dir)
            with zipfile.ZipFile(zip_path, 'r') as zipf:
                zipf.extractall(extract_dir)

        # Setup data loader.
        def get_label_path(im_path):
            return Path(str(im_path.parent)[:-4] + '-labels') / im_path.name

        size = self.task_config.chip_size
        class_map = self.task_config.class_map
        classes = class_map.get_class_names()
        num_workers = 0 if self.train_opts.debug else 4
        tfms = get_transforms(flip_vert=self.train_opts.flip_vert)

        def get_data(train_sampler=None):
            data = (ImageList.from_folder(chip_dir).split_by_folder(
                train='train', valid='val').label_from_folder().transform(
                    tfms, size=size).databunch(
                        bs=self.train_opts.batch_sz,
                        num_workers=num_workers,
                    ))
            return data

        data = get_data()

        if self.train_opts.debug:
            make_debug_chips(data, class_map, tmp_dir, train_uri)

        # Setup learner.
        ignore_idx = -1
        metrics = [
            Precision(average='weighted', clas_idx=1, ignore_idx=ignore_idx),
            Recall(average='weighted', clas_idx=1, ignore_idx=ignore_idx),
            FBeta(average='weighted',
                  clas_idx=1,
                  beta=1,
                  ignore_idx=ignore_idx)
        ]
        model_arch = getattr(models, self.train_opts.model_arch)
        learn = cnn_learner(data,
                            model_arch,
                            metrics=metrics,
                            wd=self.train_opts.weight_decay,
                            path=train_dir)

        learn.unfreeze()

        if self.train_opts.fp16 and torch.cuda.is_available():
            # This loss_scale works for Resnet 34 and 50. You might need to adjust this
            # for other models.
            learn = learn.to_fp16(loss_scale=256)

        # Setup callbacks and train model.
        model_path = get_local_path(self.backend_opts.model_uri, tmp_dir)

        pretrained_uri = self.backend_opts.pretrained_uri
        if pretrained_uri:
            print('Loading weights from pretrained_uri: {}'.format(
                pretrained_uri))
            pretrained_path = download_if_needed(pretrained_uri, tmp_dir)
            learn.model.load_state_dict(torch.load(
                pretrained_path, map_location=learn.data.device),
                                        strict=False)

        # Save every epoch so that resume functionality provided by
        # TrackEpochCallback will work.
        callbacks = [
            TrackEpochCallback(learn),
            MySaveModelCallback(learn, every='epoch'),
            MyCSVLogger(learn, filename='log'),
            ExportCallback(learn, model_path, monitor='f_beta'),
            SyncCallback(train_dir, self.backend_opts.train_uri,
                         self.train_opts.sync_interval)
        ]

        lr = self.train_opts.lr
        num_epochs = self.train_opts.num_epochs
        if self.train_opts.one_cycle:
            if lr is None:
                learn.lr_find()
                learn.recorder.plot(suggestion=True, return_fig=True)
                lr = learn.recorder.min_grad_lr
                print('lr_find() found lr: {}'.format(lr))
            learn.fit_one_cycle(num_epochs, lr, callbacks=callbacks)
        else:
            learn.fit(num_epochs, lr, callbacks=callbacks)

        # Since model is exported every epoch, we need some other way to
        # show that training is finished.
        str_to_file('done!', self.backend_opts.train_done_uri)

        # Sync output to cloud.
        sync_to_dir(train_dir, self.backend_opts.train_uri)
Ejemplo n.º 27
0
 def download_if_needed(self, uri):
     return download_if_needed(uri, self.tmp_dir)
Ejemplo n.º 28
0
    def train(self, tmp_dir: str) -> None:
        """Train a DeepLab model the task and backend config.

        Args:
            tmp_dir: (str) temporary directory to use

        Returns:
             None
        """
        train_py = self.backend_config.script_locations.train_py
        eval_py = self.backend_config.script_locations.eval_py
        export_py = self.backend_config.script_locations.export_py

        # Setup local input and output directories
        log.info('Setting up local input and output directories')
        train_logdir = self.backend_config.training_output_uri
        train_logdir_local = get_local_path(train_logdir, tmp_dir)
        dataset_dir = get_record_dir(self.backend_config.training_data_uri,
                                     TRAIN)
        dataset_dir_local = get_local_path(dataset_dir, tmp_dir)
        make_dir(tmp_dir)
        make_dir(train_logdir_local)
        make_dir(dataset_dir_local)

        # Download training data
        log.info('Downloading training data')
        for i, record_file in enumerate(list_paths(dataset_dir)):
            download_if_needed(record_file, tmp_dir)

        # Download and untar initial checkpoint.
        log.info('Downloading and untarring initial checkpoint')
        tf_initial_checkpoints_uri = self.backend_config.pretrained_model_uri
        download_if_needed(tf_initial_checkpoints_uri, tmp_dir)
        tfic_tarball = get_local_path(tf_initial_checkpoints_uri, tmp_dir)
        tfic_dir = os.path.dirname(tfic_tarball)
        with tarfile.open(tfic_tarball, 'r:gz') as tar:
            tar.extractall(tfic_dir)
        tfic_ckpt = glob.glob('{}/*/*.index'.format(tfic_dir))[0]
        tfic_ckpt = tfic_ckpt[0:-len('.index')]

        # Restart support
        train_restart_dir = self.backend_config.train_options.train_restart_dir
        if type(train_restart_dir) is not str or len(train_restart_dir) == 0:
            train_restart_dir = train_logdir

        # Get output from potential previous run so we can resume training.
        if type(train_restart_dir) is str and len(
                train_restart_dir
        ) > 0 and not self.backend_config.train_options.replace_model:
            sync_from_dir(train_restart_dir, train_logdir_local)
        else:
            if self.backend_config.train_options.replace_model:
                if os.path.exists(train_logdir_local):
                    shutil.rmtree(train_logdir_local)
                make_dir(train_logdir_local)

        # Periodically synchronize with remote
        sync = start_sync(
            train_logdir_local,
            train_logdir,
            sync_interval=self.backend_config.train_options.sync_interval)

        with sync:
            # Setup TFDL config
            tfdl_config = json_format.ParseDict(
                self.backend_config.tfdl_config, TrainingParametersMsg())
            log.info('tfdl_config={}'.format(tfdl_config))
            log.info('Training steps={}'.format(
                tfdl_config.training_number_of_steps))

            # Additional training options
            max_class = max(
                list(map(lambda c: c.id, self.class_map.get_items())))
            num_classes = len(self.class_map.get_items())
            num_classes = max(max_class, num_classes) + 1
            (train_args, train_env) = get_training_args(
                train_py, train_logdir_local, tfic_ckpt, dataset_dir_local,
                num_classes, tfdl_config)

            # Start training
            log.info('Starting training process')
            log.info(' '.join(train_args))
            train_process = Popen(train_args, env=train_env)
            terminate_at_exit(train_process)

            if self.backend_config.train_options.do_monitoring:
                # Start tensorboard
                log.info('Starting tensorboard process')
                tensorboard_process = Popen(
                    ['tensorboard', '--logdir={}'.format(train_logdir_local)])
                terminate_at_exit(tensorboard_process)

            if self.backend_config.train_options.do_eval:
                # Start eval script
                log.info('Starting eval script')
                eval_logdir = train_logdir_local
                eval_args = get_evaluation_args(eval_py, train_logdir_local,
                                                dataset_dir_local, eval_logdir,
                                                tfdl_config)
                eval_process = Popen(eval_args, env=train_env)
                terminate_at_exit(eval_process)

            # Wait for training and tensorboard
            log.info('Waiting for training and tensorboard processes')
            train_process.wait()
            if self.backend_config.train_options.do_monitoring:
                tensorboard_process.terminate()

            # Export frozen graph
            log.info(
                'Exporting frozen graph ({}/model)'.format(train_logdir_local))
            export_args = get_export_args(export_py, train_logdir_local,
                                          num_classes, tfdl_config)
            export_process = Popen(export_args)
            terminate_at_exit(export_process)
            export_process.wait()

            # Package up the model files for usage as fine tuning checkpoints
            fine_tune_checkpoint_name = self.backend_config.fine_tune_checkpoint_name
            latest_checkpoints = get_latest_checkpoint(train_logdir_local)
            model_checkpoint_files = glob.glob(
                '{}*'.format(latest_checkpoints))
            inference_graph_path = os.path.join(train_logdir_local, 'model')

            with RVConfig.get_tmp_dir() as tmp_dir:
                model_dir = os.path.join(tmp_dir, fine_tune_checkpoint_name)
                make_dir(model_dir)
                model_tar = os.path.join(
                    train_logdir_local,
                    '{}.tar.gz'.format(fine_tune_checkpoint_name))
                shutil.copy(inference_graph_path,
                            '{}/frozen_inference_graph.pb'.format(model_dir))
                for path in model_checkpoint_files:
                    shutil.copy(path, model_dir)
                with tarfile.open(model_tar, 'w:gz') as tar:
                    tar.add(model_dir, arcname=os.path.basename(model_dir))

        # Perform final sync
        sync_to_dir(train_logdir_local, train_logdir, delete=False)
Ejemplo n.º 29
0
 def load_init_weights(self):
     if self.cfg.model.init_weights:
         weights_path = download_if_needed(self.cfg.model.init_weights,
                                           self.tmp_dir)
         self.model.load_state_dict(
             torch.load(weights_path, map_location=self.device))
    def train(self, tmp_dir):
        """Train a model."""
        self.print_options()

        # Sync output of previous training run from cloud.
        train_uri = self.backend_opts.train_uri
        train_dir = get_local_path(train_uri, tmp_dir)
        make_dir(train_dir)
        sync_from_dir(train_uri, train_dir)

        # Get zip file for each group, and unzip them into chip_dir.
        chip_dir = join(tmp_dir, 'chips')
        make_dir(chip_dir)
        for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
            zip_path = download_if_needed(zip_uri, tmp_dir)
            with zipfile.ZipFile(zip_path, 'r') as zipf:
                zipf.extractall(chip_dir)

        # Setup data loader.
        train_images = []
        train_lbl_bbox = []
        for annotation_path in glob.glob(join(chip_dir, 'train/*.json')):
            images, lbl_bbox = get_annotations(annotation_path)
            train_images += images
            train_lbl_bbox += lbl_bbox

        val_images = []
        val_lbl_bbox = []
        for annotation_path in glob.glob(join(chip_dir, 'valid/*.json')):
            images, lbl_bbox = get_annotations(annotation_path)
            val_images += images
            val_lbl_bbox += lbl_bbox

        images = train_images + val_images
        lbl_bbox = train_lbl_bbox + val_lbl_bbox

        img2bbox = dict(zip(images, lbl_bbox))
        get_y_func = lambda o: img2bbox[o.name]
        num_workers = 0 if self.train_opts.debug else 4
        data = ObjectItemList.from_folder(chip_dir)
        data = data.split_by_folder()
        data = data.label_from_func(get_y_func)
        data = data.transform(
            get_transforms(), size=self.task_config.chip_size, tfm_y=True)
        data = data.databunch(
            bs=self.train_opts.batch_sz, collate_fn=bb_pad_collate,
            num_workers=num_workers)
        print(data)

        if self.train_opts.debug:
            make_debug_chips(
                data, self.task_config.class_map, tmp_dir, train_uri)

        # Setup callbacks and train model.
        ratios = [1/2, 1, 2]
        scales = [1, 2**(-1/3), 2**(-2/3)]
        model_arch = getattr(models, self.train_opts.model_arch)
        encoder = create_body(model_arch, cut=-2)
        model = RetinaNet(encoder, data.c, final_bias=-4)
        crit = RetinaNetFocalLoss(scales=scales, ratios=ratios)
        learn = Learner(data, model, loss_func=crit, path=train_dir)
        learn = learn.split(retina_net_split)

        model_path = get_local_path(self.backend_opts.model_uri, tmp_dir)

        pretrained_uri = self.backend_opts.pretrained_uri
        if pretrained_uri:
            print('Loading weights from pretrained_uri: {}'.format(
                pretrained_uri))
            pretrained_path = download_if_needed(pretrained_uri, tmp_dir)
            learn.load(pretrained_path[:-4])

        callbacks = [
            TrackEpochCallback(learn),
            SaveModelCallback(learn, every='epoch'),
            MyCSVLogger(learn, filename='log'),
            ExportCallback(learn, model_path),
            SyncCallback(train_dir, self.backend_opts.train_uri,
                         self.train_opts.sync_interval)
        ]
        learn.unfreeze()
        learn.fit(self.train_opts.num_epochs, self.train_opts.lr,
                  callbacks=callbacks)

        # Since model is exported every epoch, we need some other way to
        # show that training is finished.
        str_to_file('done!', self.backend_opts.train_done_uri)

        # Sync output to cloud.
        sync_to_dir(train_dir, self.backend_opts.train_uri)