def sample_tiles_from_candidates(self, tiles_samplers):
    """
    Apply a sampler over each satellite image's candidate tiles to generate a list of tiles
    (= regions of interest)
    Args:
        tiles_samplers(list[TilesSampler]|TilesSampler):
    Returns:
        In place: assigns self.sampled_tiles
    """
    sampled_tiles = []
    if not isinstance(tiles_samplers, (list, tuple)):
        tiles_samplers = [tiles_samplers]
    self.tiles_samplers = tiles_samplers
    LOGGER.info("Sampling tiles")
    for tiles_sampler in tqdm(self.tiles_samplers, desc="Sampling tiles"):
        sampled_tiles.extend(
            tiles_sampler.sample_tiles_from_candidates(self.candidate_tiles))
    LOGGER.info(
        "Tiles sampled, now generate the dataset using Dataset.generate_tiles_dataset"
    )
    self.sampled_tiles = sampled_tiles
def items_dataset_from_path(path: Optional[str] = None) -> Dataset:
    """
    Get a Dataset of SatelliteImage items from path
    Args:
        path: folder where to look
    Returns:
        Dataset: the parsed items, sorted by key
    """
    assert path is not None, "Please set the path argument, likely ${TP_DATA}/raw/trainval/"
    LOGGER.info("Looking in {}".format(path))
    list_images = glob.glob(os.path.join(path, "*.jpg"))

    def _parse_image(image_file):
        image_id = os.path.splitext(os.path.basename(image_file))[0]
        item = SatelliteImage.from_image_id_and_path(image_id, path)
        # Read the image and labels when initialising to put the data into cache
        assert isinstance(item.image, np.ndarray)
        assert isinstance(item.labels, list)
        return item

    dataset = Dataset(items=list_images)
    dataset = dataset.map(_parse_image, n_jobs=8, desc="Parsing items")
    # Assuming sorted() returns a new Dataset like map/apply do, keep the result
    dataset = dataset.sorted(key=lambda item: item.key)
    LOGGER.info("Found {} items".format(len(dataset)))
    return dataset
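# Illustrative usage sketch (not part of the original source): load the items
# dataset from the standard trainval folder. The path below is a placeholder
# and assumes the TP_DATA environment variable is set.
def _example_items_dataset():
    path = os.path.expandvars("${TP_DATA}/raw/trainval/")
    dataset = items_dataset_from_path(path=path)
    # Every item is a SatelliteImage whose image and labels are already cached
    return dataset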
@classmethod
def list_items_from_path(cls, path=None):
    """
    Get a list of SatelliteImage items from path
    Args:
        path: folder where to look
    Returns:
        list[SatelliteImage]:
    """
    assert path is not None, "Please set the path argument, likely ${TP_ISAE_DATA}/raw/trainval/"
    LOGGER.info("Looking in {}".format(path))
    items = []
    list_images = glob.glob(os.path.join(path, "*.jpg"))
    for image_file in list_images:
        image_id = os.path.splitext(os.path.basename(image_file))[0]
        item = SatelliteImage.from_image_id_and_path(image_id, path=path)
        # Read the image and labels when initialising to put the data into cache
        assert isinstance(item.image, np.ndarray)
        assert isinstance(item.labels, list)
        items.append(item)
    items = list(sorted(items, key=lambda item: item.key))
    return items
def generate_candidates_tiles(self, sliding_windows):
    """
    Apply a sliding window over each satellite image to generate a list of tiles
    (= regions of interest) to sample from
    Args:
        sliding_windows(list[SlidingWindow]|SlidingWindow):
    Returns:
        In place: assigns self.candidate_tiles
    """
    if not isinstance(sliding_windows, (list, tuple)):
        sliding_windows = [sliding_windows]
    self.sliding_windows = sliding_windows
    LOGGER.info("Generating a pool of candidate tiles")
    candidate_tiles = []
    for sliding_window in tqdm(self.sliding_windows, position=0, desc="Applying slider"):
        for item in tqdm(self.items, position=1, desc="On item"):
            candidate_tiles.extend(sliding_window.get_tiles_for_item(item))
    LOGGER.info(
        "Candidate tiles generated! Now sample them using Dataset.sample_tiles_from_candidates"
    )
    self.candidate_tiles = list(set(candidate_tiles))
    self.found_labels = list_utils.get_labels_in_list(self.candidate_tiles)
    # Initialise sampled tiles by default (copy candidate tiles)
    self.sampled_tiles = self.candidate_tiles[:]
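# Illustrative end-to-end sketch for the in-place Dataset API above (not part
# of the original source); the SlidingWindow and TilesSampler arguments are
# assumed to be configured elsewhere.
def _example_in_place_pipeline(dataset, sliding_window, tiles_sampler):
    dataset.generate_candidates_tiles(sliding_window)    # fills dataset.candidate_tiles
    dataset.sample_tiles_from_candidates(tiles_sampler)  # fills dataset.sampled_tiles
    dataset.generate_tiles_dataset(output_dir="tiles/")  # writes one sub-folder per label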
def download_eval_data(data_dir=None):
    """
    Download the raw eval data to data_dir and extract it
    Args:
        data_dir: target directory (defaults to $TP_DATA)
    Returns:
    """
    data_dir = data_dir or os.path.expandvars(os.environ.get("TP_DATA"))
    LOGGER.info("Downloading evaluation data")
    _download_data(archive="tp_isae_eval_data.tar.gz",
                   data_dir=data_dir,
                   check_dir="raw/eval")
    LOGGER.info("Done. Your data is located here {}\n".format(
        os.path.join(data_dir, "raw", "eval")))
def predict_on_item(item, predictor=None, sliding_windows=None):
    """
    Run a predictor over all tiles generated on an item by the sliding window(s)
    Args:
        item(SatelliteImage): the item on which to apply the prediction
        predictor(Predictor): a Predictor object that encapsulates our model
        sliding_windows(list[SlidingWindow]|SlidingWindow): the sliding window(s) used to generate candidates
    Returns:
        list[PredictionTile]: one prediction per tile
    """
    if not isinstance(sliding_windows, (list, tuple)):
        sliding_windows = [sliding_windows]
    LOGGER.info("Generating tiles to predict")
    tiles_to_predict = []
    for sliding_window in tqdm(sliding_windows, position=0, desc="Applying slider"):
        tiles_to_predict.extend(sliding_window.get_tiles_for_item(item))
    tiles_to_predict = list(set(tiles_to_predict))
    LOGGER.info("Predicting on item {} with {} tiles".format(
        item.key, len(tiles_to_predict)))
    image = item.image
    tiles_results = []
    if hasattr(predictor, "batch_size") and predictor.batch_size > 1:
        batches = [
            tiles_to_predict[i:i + predictor.batch_size]
            for i in range(0, len(tiles_to_predict), predictor.batch_size)
        ]
        for batch in tqdm(batches,
                          desc="Calling .predict_on_batch() with batch_size {}".format(
                              predictor.batch_size)):
            batch_data = [tile.get_data(image) for tile in batch]
            batch_results = predictor.predict_on_batch(batch_data)
            for tile, result in zip(batch, batch_results):
                tiles_results.append(
                    PredictionTile.from_labelled_tile_and_prediction(tile, result))
    else:
        for tile in tqdm(tiles_to_predict, desc="Calling .predict() with one tile"):
            prediction = predictor.predict(tile.get_data(image))
            tiles_results.append(
                PredictionTile.from_labelled_tile_and_prediction(tile, prediction))
    return tiles_results
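# Illustrative sketch (not part of the original source): predicting on a single
# SatelliteImage. `predictor` is any object exposing .predict(), and optionally
# .predict_on_batch() together with a batch_size attribute, which triggers the
# batched path above.
def _example_predict_on_item(item, predictor, sliding_window):
    tiles_results = predict_on_item(item,
                                    predictor=predictor,
                                    sliding_windows=sliding_window)
    # tiles_results is a list of PredictionTile, one per candidate tile
    return tiles_results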
def predict_on_item(self, item):
    """
    Run self.predictor over all tiles generated on an item by self.sliding_windows
    Args:
        item(SatelliteImage): the item on which to apply the prediction
    Returns:
        Dataset: PredictionTile results, one per tile
    """
    LOGGER.info("Generating tiles to predict")
    item_dataset = Dataset(items=[item])
    tiles = Dataset(items=[])
    for sliding_window in tqdm(self.sliding_windows, position=0, desc="Applying slider"):
        tiles = tiles.extend(item_dataset.flatmap(sliding_window))
    # Deduplicate tiles generated by several sliding windows
    tiles = tiles.apply(lambda items: list(set(items)))
    LOGGER.info("Predicting on item {} with {} tiles".format(item.key, len(tiles)))
    image = item.image

    def _batch(items):
        # Split the tiles into chunks of predictor.batch_size
        return [
            items[i:i + self.predictor.batch_size]
            for i in range(0, len(items), self.predictor.batch_size)
        ]

    batches = tiles.apply(_batch)

    def _predict(batch):
        batch_data = [tile.get_data(image) for tile in batch]
        batch_results = self.predictor.predict_on_batch(batch_data)
        return [
            PredictionTile.from_labelled_tile_and_prediction(tile, result)
            for tile, result in zip(batch, batch_results)
        ]

    tiles_results = batches.flatmap(_predict, desc="Predicting on batch")
    return tiles_results
def generate_candidate_tiles_from_items(items_dataset: Dataset,
                                        sliding_windows: List[SlidingWindow],
                                        n_jobs: int = 1) -> Dataset:
    """
    High-level helper function
    Apply a sliding window over each satellite image to generate a list of tiles
    (= regions of interest) to sample from
    Args:
        items_dataset(Dataset):
        sliding_windows(list[SlidingWindow]|SlidingWindow):
        n_jobs(int):
    Returns:
        tiles_dataset(Dataset):
    """
    LOGGER.info("Generating a pool of candidate tiles")
    tiles_dataset = Dataset(items=[])
    if not isinstance(sliding_windows, (list, tuple)):
        sliding_windows = [sliding_windows]
    for sliding_window in sliding_windows:
        LOGGER.info(sliding_window)
        tiles_dataset = tiles_dataset.extend(
            items_dataset.flatmap(sliding_window,
                                  desc="Applying sliding window",
                                  n_jobs=n_jobs))
    # Deduplicate tiles generated by several sliding windows
    tiles_dataset = tiles_dataset.apply(lambda items: list(set(items)))
    LOGGER.info("State of dataset")
    LOGGER.info(roi_list_utils.get_state(tiles_dataset.items))
    return tiles_dataset
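# Illustrative sketch (not part of the original source): several sliding
# windows can be applied over the same items; tiles produced by more than one
# window are deduplicated by the list(set(items)) step above.
def _example_candidate_generation(items_dataset, small_window, large_window):
    return generate_candidate_tiles_from_items(items_dataset,
                                               sliding_windows=[small_window, large_window],
                                               n_jobs=4)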
def sample_tiles_from_candidates(tiles_dataset: Dataset,
                                 tiles_samplers: List[TilesSampler]) -> Dataset:
    """
    High-level helper function
    Apply a sampler over each satellite image's candidate tiles to generate a list of tiles
    (= regions of interest)
    Args:
        tiles_dataset(Dataset):
        tiles_samplers(list[TilesSampler]|TilesSampler):
    Returns:
        sampled_dataset(Dataset):
    """
    sampled_dataset = Dataset(items=[])
    LOGGER.info("Sampling tiles")
    if not isinstance(tiles_samplers, (list, tuple)):
        tiles_samplers = [tiles_samplers]
    for tiles_sampler in tiles_samplers:
        LOGGER.info(tiles_sampler)
        sampled_dataset = sampled_dataset.extend(tiles_dataset.apply(tiles_sampler))
    LOGGER.info(
        "Tiles sampled, now generate the dataset using Dataset.generate_tiles_dataset"
    )
    LOGGER.info(roi_list_utils.get_state(sampled_dataset.items))
    return sampled_dataset
def download_train_data(data_dir=None):
    """
    Download the raw training data to data_dir and extract it
    Args:
        data_dir: target directory (defaults to $TP_ISAE_DATA)
    Returns:
    """
    data_dir = data_dir or os.environ.get("TP_ISAE_DATA")
    LOGGER.info("Downloading training data")
    _download_data(archive="tp_isae_train_data.tar.gz", data_dir=data_dir)
    LOGGER.info("Done. Your training data is located here {}".format(
        os.path.join(data_dir, "raw", "trainval")))
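# Illustrative sketch (not part of the original source): both download helpers
# fetch their archive only when it is not already present, and skip
# re-extraction when check_dir already exists. Note that download_train_data
# reads TP_ISAE_DATA while download_eval_data reads TP_DATA.
def _example_download_all():
    download_train_data()  # -> $TP_ISAE_DATA/raw/trainval
    download_eval_data()   # -> $TP_DATA/raw/eval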
def _download_test_ci_data(data_dir=None):
    """
    Reserved for CI
    Args:
        data_dir: target directory (defaults to $TP_ISAE_DATA)
    Returns:
    """
    data_dir = data_dir or os.environ.get("TP_ISAE_DATA")
    LOGGER.info("Downloading test ci data")
    _download_data(archive="tp_isae_test_ci.tar.gz", data_dir=data_dir)
    LOGGER.info(
        "Done. Your data is located here {}\nYour eval data is located here {}".format(
            os.path.join(data_dir, "raw"), os.path.join(data_dir, "raw", "eval")))
def sample_tiles_from_candidates(self, candidate_tiles):
    """
    Apply the sampling logic of this class to a list of `candidates`
    Args:
        candidate_tiles(list[Tiles]): List of regions of interest to apply the sampler on
    Returns:
        list[Tiles]: Sampled list
    """
    LOGGER.info("Sampling")
    if self.target_label is not None:
        candidate_tiles = roi_list_utils.filter_tiles_by_label(
            candidate_tiles, self.target_label)
    nb_tiles_max = self.nb_tiles_max or len(candidate_tiles)
    return self._sample_n_tiles_from_list(candidate_tiles, nb_tiles_max)
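# Illustrative sketch (not part of the original source): a sampler whose
# target_label is set keeps only tiles with that label, then caps the result
# at nb_tiles_max. These attributes are presumably set through the sampler's
# constructor; the label value in the comment below is a placeholder.
def _example_sampler(sampler, candidate_tiles):
    # e.g. sampler = TilesSampler(target_label="aircrafts", nb_tiles_max=1000)
    sampled = sampler.sample_tiles_from_candidates(candidate_tiles)
    assert len(sampled) <= (sampler.nb_tiles_max or len(candidate_tiles))
    return sampled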
def _download_data(archive="tp_isae_data.tar.gz", data_dir=None, check_dir=None):
    """
    Download `archive` from ROOT_URL to data_dir and extract it there
    Args:
        archive: name of the tar.gz archive to download
        data_dir: target directory
        check_dir: if this subdirectory of data_dir already exists, skip the extraction
    Returns:
    """
    assert data_dir is not None, "please specify a download dir, or better, set the TP_DATA env variable"
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    LOGGER.info("Downloading data from {} to {}".format(archive, data_dir))
    if not os.path.exists(os.path.join(data_dir, archive)):
        # Download the tar.gz archive
        LOGGER.info("Downloading {}/{}".format(ROOT_URL, archive))
        cmd = [
            "curl", "-X", "GET", "{}/{}".format(ROOT_URL, archive),
            "--output", os.path.join(data_dir, archive)
        ]
        subprocess.check_call(cmd)
    if check_dir is None or not os.path.exists(os.path.join(data_dir, check_dir)):
        # Extract the archive into data_dir
        LOGGER.info("Extracting tar gz")
        cmd = ["tar", "-zxvf", os.path.join(data_dir, archive), "-C", data_dir]
        subprocess.check_call(cmd)
def dump_dataset_tiles(tiles_dataset: Dataset,
                       items_dataset: Dataset,
                       output_dir=None,
                       remove_first=False,
                       save_format="jpg") -> Dataset:
    """
    High-level helper function
    Actually generates training images from the sampled tiles (= regions of interest)
    The file structure is compatible with the keras.ImageDataGenerator.flow_from_directory() method
    For more information on how to parse this, check this script:
    https://gist.github.com/fchollet/0830affa1f7f19fd47b06d4cf89ed44d
    In summary, this is our directory structure:
    ```markdown
    output_dir/
        aircrafts/
            ac001.jpg
            ac002.jpg
            ...
        background/
            bg001.jpg
            bg002.jpg
            ...
    ```
    Args:
        tiles_dataset(Dataset):
        items_dataset(Dataset):
        output_dir(str): the output path
        save_format(str): the image format, "jpg" by default
        remove_first(bool): erase output_dir first?
    Returns:
        Dataset: the dumped tiles
    """
    LOGGER.info("Generating a dataset of tiles at location {}".format(output_dir))
    if remove_first and output_dir is not None:
        try:
            shutil.rmtree(output_dir)
        except FileNotFoundError:
            pass

    def _dump_tiles(item):
        LOGGER.info("Dumping for item {}".format(item.key))
        if output_dir is not None:
            tiles_dumper = ImageItemTileDumper(item,
                                               output_dir=output_dir,
                                               save_format=save_format)
        else:
            tiles_dumper = NpArrayTileDumper(item)
        # Keep only the tiles belonging to this item, then dump them
        tiles_dataset_ = tiles_dataset.filter(
            lambda tile: tile.item_id == item.key, desc="Filtering")
        tiles_dataset_ = tiles_dataset_.map(
            tiles_dumper, desc="Saving tiles to {}".format(output_dir))
        return tiles_dataset_.items

    dumped_tiles = items_dataset.flatmap(_dump_tiles)
    return dumped_tiles
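# Illustrative end-to-end sketch of the functional pipeline (not part of the
# original source): items -> candidate tiles -> sampled tiles -> dumped tiles.
# The sliding_window and tiles_sampler arguments are assumed to be configured
# elsewhere, and the paths below are placeholders.
def _example_functional_pipeline(sliding_window, tiles_sampler):
    items_dataset = items_dataset_from_path(
        path=os.path.expandvars("${TP_DATA}/raw/trainval/"))
    tiles_dataset = generate_candidate_tiles_from_items(items_dataset,
                                                        sliding_window,
                                                        n_jobs=4)
    sampled_dataset = sample_tiles_from_candidates(tiles_dataset, tiles_sampler)
    return dump_dataset_tiles(sampled_dataset,
                              items_dataset,
                              output_dir="tiles/",
                              remove_first=True)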
def generate_tiles_dataset(self, output_dir=None, save_format="jpg", remove_first=True):
    """
    Actually generates training images from the dataset.sampled_tiles (= regions of interest)
    The file structure is compatible with the keras.ImageDataGenerator.flow_from_directory() method
    For more information on how to parse this, check this script:
    https://gist.github.com/fchollet/0830affa1f7f19fd47b06d4cf89ed44d
    In summary, this is our directory structure:
    ```markdown
    output_dir/
        aircrafts/
            ac001.jpg
            ac002.jpg
            ...
        background/
            bg001.jpg
            bg002.jpg
            ...
    ```
    Args:
        output_dir(str): the output path
        save_format(str): the image format, "jpg" by default
        remove_first(bool): erase output_dir first?
    Returns:
    """
    LOGGER.info("Generating a dataset of tiles at location {}".format(output_dir))
    for label in self.found_labels:
        # Guard the rmtree: the label directory may not exist yet
        if remove_first and os.path.exists(os.path.join(output_dir, label)):
            shutil.rmtree(os.path.join(output_dir, label))
        if not os.path.exists(os.path.join(output_dir, label)):
            os.makedirs(os.path.join(output_dir, label))

    def _generate_tiles(item, tiles):
        image = item.image
        tiles = list_utils.filter_tiles_by_item(tiles, item.key)
        for tile in tiles:
            tile_data = tile.get_data(image)
            tile_label = tile.label
            tile_basename = "{}_{}.{}".format(item.key, tile.key, save_format)
            io.imsave(os.path.join(output_dir, tile_label, tile_basename), tile_data)

    LOGGER.info("Dumping tiles to {}".format(output_dir))
    for item in tqdm(self.items, desc="Saving tiles to {}".format(output_dir)):
        _generate_tiles(item, self.sampled_tiles)
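# Illustrative follow-up (not part of the original source): the directory
# structure written above is designed for keras' flow_from_directory, as in
# the fchollet gist referenced in the docstring. The target_size, batch_size
# and class_mode below are placeholder values.
def _example_flow_from_directory(output_dir):
    from keras.preprocessing.image import ImageDataGenerator
    generator = ImageDataGenerator(rescale=1. / 255).flow_from_directory(
        output_dir,
        target_size=(64, 64),
        batch_size=32,
        class_mode="categorical")
    return generator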