def query_cells(xcells, ycells, satellites, min_date, max_date, dataset_types, output_dir):
    """
    Query the DB for each cell.
    Currently the config file for this workflow requires the user to
    specify a rectangular region. I have another workflow that takes
    a vector file as input.
    """
    base_out_fname = CONFIG.get('outputs', 'query_filename')
    cell_queries = []
    for ycell in ycells:
        for xcell in xcells:
            # create the output directory
            cell_dir = '{}_{}'.format(int(xcell), int(ycell))
            out_cell_dir = pjoin(output_dir, cell_dir)
            if not exists(out_cell_dir):
                os.makedirs(out_cell_dir)

            tiles = list_tiles_as_list(x=[xcell], y=[ycell],
                                       acq_min=min_date,
                                       acq_max=max_date,
                                       dataset_types=dataset_types,
                                       satellites=satellites)

            out_fname = pjoin(out_cell_dir, base_out_fname)
            cell_queries.append(out_fname)
            with open(out_fname, 'w') as outf:
                pickle.dump(tiles, outf)

    return cell_queries
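For reference, a minimal sketch of how a downstream task might read one of these pickled query results back. The helper name and the 'query.pkl' default are illustrative only; the real file name comes from CONFIG.get('outputs', 'query_filename').

import pickle
from os.path import join as pjoin

def load_cell_query(out_cell_dir, query_fname='query.pkl'):
    # Hypothetical reader for the files written by query_cells() above;
    # returns the list of tile objects pickled for the cell.
    with open(pjoin(out_cell_dir, query_fname), 'rb') as infile:
        return pickle.load(infile)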
def test_list_tiles_120_020_2005_ls578_no_ls8_pre_wrs_2(config=None):
    dataset_types = [DatasetType.ARG25, DatasetType.PQ25, DatasetType.FC25]

    tiles = list_tiles_as_list(x=[TEST_CELL_X], y=[TEST_CELL_Y],
                               acq_min=parse_date_min(TEST_YEAR_STR),
                               acq_max=parse_date_max(TEST_YEAR_STR),
                               satellites=[Satellite.LS5, Satellite.LS7, Satellite.LS8],
                               dataset_types=dataset_types,
                               exclude=[LS8_PRE_WRS_2_EXCLUSION],
                               config=config)

    assert tiles and len(list(tiles)) > 0

    for tile in tiles:
        _log.info("Found tile xy = %s", tile.xy)

        dataset = tile.datasets[DatasetType.ARG25]
        assert dataset
        _log.info("Found ARG25 dataset [%s]", dataset.path)

        assert (tile.x == TEST_CELL_X and tile.y == TEST_CELL_Y
                and tile.xy == (TEST_CELL_X, TEST_CELL_Y)
                and tile.end_datetime_year == TEST_YEAR
                # all() is required here: a bare generator expression is always truthy
                and all(ds in tile.datasets for ds in dataset_types)
                and (dataset.satellite != Satellite.LS8
                     or tile.end_datetime.date() >= LS8_PRE_WRS_2_ACQ_MAX))
def query_cells2(xcells, ycells, satellites, min_date, max_date, dataset_types, output_dir):
    """
    Query the DB for each cell.
    Currently the config file for this workflow requires the user to
    specify a rectangular region. I have another workflow that takes
    a vector file as input.
    """
    base_out_fname = CONFIG.get('outputs', 'query_filename')
    cell_queries = []
    for i in range(len(ycells)):
        ycell = ycells[i]
        xcell = xcells[i]

        # create the output directory
        #cell_dir = '{}_{}'.format(int(xcell), int(ycell))
        cell_dir = str(xcell) + "_" + str(ycell).zfill(4)
        out_cell_dir = pjoin(output_dir, cell_dir)
        if not exists(out_cell_dir):
            os.makedirs(out_cell_dir)

        tiles = list_tiles_as_list(x=[xcell], y=[ycell],
                                   acq_min=min_date,
                                   acq_max=max_date,
                                   dataset_types=dataset_types,
                                   satellites=satellites)

        out_fname = pjoin(out_cell_dir, base_out_fname)
        cell_queries.append(out_fname)
        with open(out_fname, 'w') as outf:
            pickle.dump(tiles, outf)

    return cell_queries
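The two directory-naming schemes above differ for negative cell indices; a quick illustration with made-up values:

>>> '{}_{}'.format(int(120), int(-20))   # scheme used by query_cells()
'120_-20'
>>> str(120) + "_" + str(-20).zfill(4)   # scheme used by query_cells2(); zfill pads after the sign
'120_-020'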
def test_retrieve_data_ls5_mndwi(config=None):
    filename = "LS5_TM_MNDWI_{x:03d}_{y:04d}_{date}.{x_offset:04d}_{y_offset:04d}.{x_size:04d}x{y_size:04d}.tif".format(
        x=CELL_X, y=CELL_Y, date=DATE, x_offset=X_OFFSET, y_offset=Y_OFFSET, x_size=X_SIZE, y_size=Y_SIZE)

    tiles = list_tiles_as_list(x=[CELL_X], y=[CELL_Y], acq_min=ACQ_LS5, acq_max=ACQ_LS5,
                               satellites=[Satellite.LS5], dataset_types=[MNDWI_DATASET_TYPE],
                               config=config)
    assert len(tiles) == 1

    dataset = tiles[0].datasets[MNDWI_DATASET_TYPE]

    data = get_dataset_data(dataset=dataset, x=X_OFFSET, y=Y_OFFSET, x_size=X_SIZE, y_size=Y_SIZE)
    assert data
    _log.info("data is [%s]\n%s", numpy.shape(data), data)

    ndv = get_dataset_ndv(dataset)
    assert is_ndv(ndv, MNDWI_NDV)

    data_type = get_dataset_datatype(dataset)
    assert data_type == MNDWI_DATA_TYPE

    metadata = generate_dataset_metadata(x=CELL_X, y=CELL_Y, acq_dt=ACQ_LS5, dataset=dataset,
                                         bands=None, mask_pqa_apply=False, mask_pqa_mask=None,
                                         mask_wofs_apply=False, mask_wofs_mask=None)

    raster_create_geotiff(filename, [data[b] for b in dataset.bands], CELL_GEO_TRANSFORM, CELL_PROJECTION,
                          ndv, data_type, dataset_metadata=metadata,
                          band_ids=[b.name for b in dataset.bands])

    assert filecmp.cmp(filename, get_test_data_path(filename))
def test_get_dataset_stack():
    tiles = list_tiles_as_list(x=[120], y=[-20],
                               satellites=[Satellite.LS5, Satellite.LS7],
                               acq_min=date(2014, 1, 1), acq_max=date(2014, 12, 31),
                               dataset_types=[DatasetType.ARG25, DatasetType.PQ25])
    _log.info("\nFound %d tiles", len(tiles))

    stack = get_dataset_data_stack(tiles, DatasetType.ARG25, Ls57Arg25Bands.BLUE.name, ndv=NDV,
                                   mask_pqa_apply=True, mask_pqa_mask=[PqaMask.PQ_MASK_CLEAR])
    _log.info("\nStack is %s", numpy.shape(stack))
def test_retrieve_data_ls5_arg_with_pqa_water_mask_dry(config=None):
    filename = "LS5_TM_NBAR_WITH_PQA_WATER_DRY_{x:03d}_{y:04d}_{date}.{x_offset:04d}_{y_offset:04d}.{x_size:04d}x{y_size:04d}.tif".format(
        x=CELL_X, y=CELL_Y, date=DATE, x_offset=X_OFFSET, y_offset=Y_OFFSET, x_size=X_SIZE, y_size=Y_SIZE)

    tiles = list_tiles_as_list(x=[CELL_X], y=[CELL_Y], acq_min=ACQ_LS5, acq_max=ACQ_LS5,
                               satellites=[Satellite.LS5],
                               dataset_types=[ARG_DATASET_TYPE, PQ_DATASET_TYPE, WOFS_DATASET_TYPE],
                               config=config)
    assert len(tiles) == 1

    tile = tiles[0]

    assert ARG_DATASET_TYPE in tile.datasets
    dataset = tile.datasets[ARG_DATASET_TYPE]

    assert PQ_DATASET_TYPE in tile.datasets
    pqa = tile.datasets[PQ_DATASET_TYPE]

    assert WOFS_DATASET_TYPE in tile.datasets
    wofs = tile.datasets[WOFS_DATASET_TYPE]

    mask = get_mask_pqa(pqa, x=X_OFFSET, y=Y_OFFSET, x_size=X_SIZE, y_size=Y_SIZE)
    mask = get_mask_wofs(wofs,
                         wofs_masks=[WofsMask.DRY, WofsMask.NO_DATA, WofsMask.SATURATION_CONTIGUITY,
                                     WofsMask.SEA_WATER, WofsMask.TERRAIN_SHADOW, WofsMask.HIGH_SLOPE,
                                     WofsMask.CLOUD_SHADOW, WofsMask.CLOUD],
                         x=X_OFFSET, y=Y_OFFSET, x_size=X_SIZE, y_size=Y_SIZE, mask=mask)

    data = get_dataset_data_masked(dataset=dataset, x=X_OFFSET, y=Y_OFFSET,
                                   x_size=X_SIZE, y_size=Y_SIZE, mask=mask)
    assert data
    _log.info("data is [%s]\n%s", numpy.shape(data), data)

    ndv = get_dataset_ndv(dataset)
    assert is_ndv(ndv, ARG_NDV)

    data_type = get_dataset_datatype(dataset)
    assert data_type == ARG_DATA_TYPE

    metadata = generate_dataset_metadata(x=CELL_X, y=CELL_Y, acq_dt=ACQ_LS5, dataset=dataset,
                                         bands=None, mask_pqa_apply=False, mask_pqa_mask=None,
                                         mask_wofs_apply=False, mask_wofs_mask=None)

    raster_create_geotiff(filename, [data[b] for b in dataset.bands], CELL_GEO_TRANSFORM, CELL_PROJECTION,
                          ndv, data_type, dataset_metadata=metadata,
                          band_ids=[b.name for b in dataset.bands])

    assert filecmp.cmp(filename, get_test_data_path(filename))
def get_tiles(x, y, satellites, acq_min, acq_max, season, dataset_type, mask_pqa_apply):
    acq_min, acq_max, criteria = build_season_date_criteria(acq_min, acq_max, season,
                                                            seasons=SEASONS, extend=True)

    dataset_types = [dataset_type]
    if mask_pqa_apply:
        dataset_types.append(DatasetType.PQ25)

    tiles = list_tiles_as_list(x=[x], y=[y], satellites=satellites,
                               acq_min=acq_min, acq_max=acq_max,
                               dataset_types=dataset_types, include=criteria)
    return tiles
def get_tiles(self):
    acq_min, acq_max, criteria = build_season_date_criteria(self.acq_min, self.acq_max, self.season,
                                                            seasons=SEASONS, extend=True)
    _log.info("\tcriteria is %s", criteria)

    dataset_types = [self.dataset_type]
    if self.mask_pqa_apply:
        dataset_types.append(DatasetType.PQ25)

    tiles = list_tiles_as_list(x=[self.x], y=[self.y], satellites=self.satellites,
                               acq_min=acq_min, acq_max=acq_max,
                               dataset_types=dataset_types, include=criteria)
    return tiles
def query_cells(cell_list, satellites, min_date, max_date, dataset_types, output_dir):
    """
    Query the DB for each (x, y) cell in cell_list and pickle the tile list
    for each cell under output_dir.
    """
    base_out_fname = CONFIG.get('outputs', 'query_filename')
    for cell in cell_list:
        x_cell = [int(cell[0])]
        y_cell = [int(cell[1])]

        tiles = list_tiles_as_list(x=x_cell, y=y_cell,
                                   acq_min=min_date,
                                   acq_max=max_date,
                                   dataset_types=dataset_types,
                                   satellites=satellites)

        out_dir = pjoin(output_dir, '{}_{}'.format(cell[0], cell[1]))
        # ensure the cell output directory exists; the other variants of this
        # helper create it, and open() below fails if it is missing
        if not exists(out_dir):
            os.makedirs(out_dir)

        out_fname = pjoin(out_dir, base_out_fname)
        with open(out_fname, 'w') as outf:
            pickle.dump(tiles, outf)
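A minimal usage sketch for this variant. The cell coordinates and output path are illustrative, and the import paths for the enums are assumed (only datacube.api.query and datacube.config imports are shown elsewhere in this file):

from datetime import date
from datacube.api.model import DatasetType, Satellite  # assumed module path

cells = [(146, -34), (146, -35)]  # illustrative (x, y) cell indices
query_cells(cells,
            satellites=[Satellite.LS5, Satellite.LS7],
            min_date=date(1987, 1, 1),
            max_date=date(2015, 12, 31),
            dataset_types=[DatasetType.ARG25, DatasetType.PQ25],
            output_dir='/tmp/queries')  # illustrative path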
def test_list_tiles_120_020_2005_ls578(config=None):
    dataset_types = [DatasetType.ARG25, DatasetType.PQ25, DatasetType.FC25]

    tiles = list_tiles_as_list(x=[TEST_CELL_X], y=[TEST_CELL_Y],
                               acq_min=parse_date_min(TEST_YEAR_STR),
                               acq_max=parse_date_max(TEST_YEAR_STR),
                               satellites=[Satellite.LS5, Satellite.LS7, Satellite.LS8],
                               dataset_types=dataset_types, config=config)

    assert tiles and len(list(tiles)) > 0

    for tile in tiles:
        _log.info("Found tile xy = %s", tile.xy)
        assert (tile.x == TEST_CELL_X and tile.y == TEST_CELL_Y
                and tile.xy == (TEST_CELL_X, TEST_CELL_Y)
                and tile.end_datetime_year == TEST_YEAR
                # all() is required here: a bare generator expression is always truthy
                and all(ds in tile.datasets for ds in dataset_types))
def query_cells2(xcells, ycells, satellites, min_date, max_date, dataset_types, output_dir):
    """
    Query the DB for each cell.
    Currently the config file for this workflow requires the user to
    specify a rectangular region. I have another workflow that takes
    a vector file as input.
    """
    base_out_fname = CONFIG.get('outputs', 'query_filename')
    cell_queries = []
    for i in range(len(ycells)):
        ycell = ycells[i]
        xcell = xcells[i]

        # create the output directory
        #cell_dir = '{}_{}'.format(int(xcell), int(ycell))
        cell_dir = str(xcell) + "_" + str(ycell).zfill(4)
        out_cell_dir = pjoin(output_dir, cell_dir)
        if not exists(out_cell_dir):
            os.makedirs(out_cell_dir)

        tiles = list_tiles_as_list(x=[xcell], y=[ycell],
                                   acq_min=min_date,
                                   acq_max=max_date,
                                   dataset_types=dataset_types,
                                   satellites=satellites)

        # keep only the tiles whose acquisition date appears in the per-cell filter file
        file_tiles = []
        lines = load_filterfile(xcell, ycell)

        print "\tlength of original tiles is ", len(tiles)
        for tile in tiles:
            tdate = str(tile.end_datetime.strftime("%Y-%m-%d"))
            if tdate in lines:
                file_tiles.append(tile)

        out_fname = pjoin(out_cell_dir, base_out_fname)
        cell_queries.append(out_fname)
        with open(out_fname, 'w') as outf:
            pickle.dump(file_tiles, outf)
        print "\tlength of new filtered tiles is %d" % len(file_tiles)

    return cell_queries
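load_filterfile() is not defined in this file; a hypothetical sketch of what it might look like, assuming a plain text file holding one YYYY-MM-DD acquisition date per line for each cell:

def load_filterfile(xcell, ycell):
    # Hypothetical implementation - the real one is not shown in this file.
    # Assumes one YYYY-MM-DD date per line in a per-cell filter file.
    fname = '{}_{}.dates'.format(xcell, str(ycell).zfill(4))  # illustrative naming
    with open(fname) as f:
        return set(line.strip() for line in f)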
def test_query():
    workflow = Arg25BandStatisticsWorkflow()

    workflow.x_min = workflow.x_max = TEST_X
    workflow.y_min = workflow.y_max = TEST_Y

    workflow.acq_min = parse_date_min("1985")
    workflow.acq_max = parse_date_max("2014")

    workflow.epoch = EpochParameter(5, 6)
    workflow.seasons = [Season.SUMMER]
    workflow.satellites = [Satellite.LS5, Satellite.LS7]

    workflow.mask_pqa_apply = True
    workflow.mask_pqa_mask = [PqaMask.PQ_MASK_SATURATION, PqaMask.PQ_MASK_CONTIGUITY, PqaMask.PQ_MASK_CLOUD]

    workflow.dataset_type = DatasetType.ARG25
    workflow.bands = Ls57Arg25Bands

    epochs = list(workflow.get_epochs())

    print ""
    print "epochs are", epochs

    for season, epoch in product(workflow.seasons, epochs):
        print season, epoch

        from datacube.api.utils import build_season_date_criteria
        acq_min, acq_max, criteria = build_season_date_criteria(epoch[0], epoch[1], season,
                                                                seasons=SEASONS, extend=True)
        print acq_min, acq_max, criteria

        from datacube.api.query import list_tiles_as_list
        tiles = list_tiles_as_list(x=[workflow.x_min], y=[workflow.y_min],
                                   satellites=workflow.satellites,
                                   acq_min=acq_min, acq_max=acq_max,
                                   dataset_types=[workflow.dataset_type],
                                   include=criteria)
        print "Tiles found is ", len(tiles)
def go(self):
    import numpy
    from datacube.api.query import list_cells_as_list, list_tiles_as_list
    from datacube.config import Config

    x_min, x_max, y_max, y_min = self.extract_bounds_from_vector()
    _log.debug("The bounds are [%s]", (x_min, x_max, y_min, y_max))

    cells_vector = self.extract_cells_from_vector()
    _log.debug("Intersecting cells_vector are [%d] [%s]", len(cells_vector), cells_vector)

    config = Config()
    _log.debug(config.to_str())

    x_list = range(x_min, x_max + 1)
    y_list = range(y_min, y_max + 1)
    _log.debug("x = [%s] y=[%s]", x_list, y_list)

    cells_db = list()
    for cell in list_cells_as_list(x=x_list, y=y_list, acq_min=self.acq_min, acq_max=self.acq_max,
                                   satellites=[satellite for satellite in self.satellites],
                                   dataset_types=[self.dataset_type]):
        cells_db.append((cell.x, cell.y))
    _log.debug("Cells from DB are [%d] [%s]", len(cells_db), cells_db)

    cells = intersection(cells_vector, cells_db)
    _log.debug("Combined cells are [%d] [%s]", len(cells), cells)

    for (x, y) in cells:
        _log.info("Processing cell [%3d/%4d]", x, y)

        tiles = list_tiles_as_list(x=x_list, y=y_list, acq_min=self.acq_min, acq_max=self.acq_max,
                                   satellites=[satellite for satellite in self.satellites],
                                   dataset_types=[self.dataset_type])
        _log.info("There are [%d] tiles", len(tiles))

        if self.list_only:
            for tile in tiles:
                _log.info("Would process [%s]", tile.datasets[self.dataset_type].path)
            continue

        # Calculate the mask for the cell
        mask_aoi = self.get_mask_aoi_cell(x, y)

        pixel_count = 4000 * 4000
        pixel_count_aoi = (mask_aoi == False).sum()
        _log.debug("mask_aoi is [%s]\n[%s]", numpy.shape(mask_aoi), mask_aoi)

        metadata = None

        with self.get_output_file() as csv_file:
            csv_writer = csv.writer(csv_file)

            import operator

            header = reduce(operator.add, [["DATE", "INSTRUMENT", "# PIXELS", "# PIXELS IN AOI"]] + [
                ["%s - # DATA PIXELS" % band_name,
                 "%s - # DATA PIXELS AFTER PQA" % band_name,
                 "%s - # DATA PIXELS AFTER PQA WOFS" % band_name,
                 "%s - # DATA PIXELS AFTER PQA WOFS AOI" % band_name,
                 "%s - MIN" % band_name,
                 "%s - MAX" % band_name,
                 "%s - MEAN" % band_name] for band_name in self.bands])

            csv_writer.writerow(header)

            for tile in tiles:
                _log.info("Processing tile [%s]", tile.datasets[self.dataset_type].path)

                if self.list_only:
                    continue

                if not metadata:
                    metadata = get_dataset_metadata(tile.datasets[self.dataset_type])

                # Apply PQA if specified
                pqa = None
                mask_pqa = None
                if self.mask_pqa_apply and DatasetType.PQ25 in tile.datasets:
                    pqa = tile.datasets[DatasetType.PQ25]
                    mask_pqa = get_mask_pqa(pqa, self.mask_pqa_mask)
                    _log.debug("mask_pqa is [%s]\n[%s]", numpy.shape(mask_pqa), mask_pqa)

                # Apply WOFS if specified
                wofs = None
                mask_wofs = None
                if self.mask_wofs_apply and DatasetType.WATER in tile.datasets:
                    wofs = tile.datasets[DatasetType.WATER]
                    mask_wofs = get_mask_wofs(wofs, self.mask_wofs_mask)
                    _log.debug("mask_wofs is [%s]\n[%s]", numpy.shape(mask_wofs), mask_wofs)

                dataset = tile.datasets[self.dataset_type]

                bands = []
                dataset_band_names = [b.name for b in dataset.bands]
                for b in self.bands:
                    if b in dataset_band_names:
                        bands.append(dataset.bands[b])

                data = get_dataset_data(tile.datasets[self.dataset_type], bands=bands)
                _log.debug("data is [%s]\n[%s]", numpy.shape(data), data)

                pixel_count_data = dict()
                pixel_count_data_pqa = dict()
                pixel_count_data_pqa_wofs = dict()
                pixel_count_data_pqa_wofs_aoi = dict()
                mmin = dict()
                mmax = dict()
                mmean = dict()

                for band_name in self.bands:
                    # Add "zeroed" entries for non-present bands - should only be if outputs
                    # for those bands have been explicitly requested
                    if band_name not in dataset_band_names:
                        pixel_count_data[band_name] = 0
                        pixel_count_data_pqa[band_name] = 0
                        pixel_count_data_pqa_wofs[band_name] = 0
                        pixel_count_data_pqa_wofs_aoi[band_name] = 0
                        mmin[band_name] = numpy.ma.masked
                        mmax[band_name] = numpy.ma.masked
                        mmean[band_name] = numpy.ma.masked
                        continue

                    band = dataset.bands[band_name]

                    data[band] = numpy.ma.masked_equal(data[band], NDV)
                    _log.debug("masked data is [%s] [%d]\n[%s]", numpy.shape(data), numpy.ma.count(data), data)
                    pixel_count_data[band_name] = numpy.ma.count(data[band])

                    if pqa:
                        data[band].mask = numpy.ma.mask_or(data[band].mask, mask_pqa)
                        _log.debug("PQA masked data is [%s] [%d]\n[%s]", numpy.shape(data[band]),
                                   numpy.ma.count(data[band]), data[band])
                    pixel_count_data_pqa[band_name] = numpy.ma.count(data[band])

                    if wofs:
                        data[band].mask = numpy.ma.mask_or(data[band].mask, mask_wofs)
                        _log.debug("WOFS masked data is [%s] [%d]\n[%s]", numpy.shape(data[band]),
                                   numpy.ma.count(data[band]), data[band])
                    pixel_count_data_pqa_wofs[band_name] = numpy.ma.count(data[band])

                    data[band].mask = numpy.ma.mask_or(data[band].mask, mask_aoi)
                    _log.debug("AOI masked data is [%s] [%d]\n[%s]", numpy.shape(data[band]),
                               numpy.ma.count(data[band]), data[band])
                    pixel_count_data_pqa_wofs_aoi[band_name] = numpy.ma.count(data[band])

                    mmin[band_name] = numpy.ma.min(data[band])
                    mmax[band_name] = numpy.ma.max(data[band])
                    mmean[band_name] = numpy.ma.mean(data[band])

                    # Convert the mean to an int...taking into account masking....
                    if not numpy.ma.is_masked(mmean[band_name]):
                        mmean[band_name] = mmean[band_name].astype(numpy.int16)

                pixel_count_data_pqa_wofs_aoi_all_bands = reduce(operator.add,
                                                                 pixel_count_data_pqa_wofs_aoi.itervalues())
                if pixel_count_data_pqa_wofs_aoi_all_bands == 0 and not self.output_no_data:
                    _log.info("Skipping dataset with no non-masked data values in ANY band")
                    continue

                row = reduce(operator.add,
                             [[tile.end_datetime,
                               self.decode_satellite_as_instrument(tile.datasets[self.dataset_type].satellite),
                               pixel_count, pixel_count_aoi]] +
                             [[pixel_count_data[band_name], pixel_count_data_pqa[band_name],
                               pixel_count_data_pqa_wofs[band_name], pixel_count_data_pqa_wofs_aoi[band_name],
                               mmin[band_name], mmax[band_name], mmean[band_name]] for band_name in self.bands])

                csv_writer.writerow(row)
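The reduce(operator.add, ...) calls above simply flatten a list of per-band column groups into one flat CSV row; a toy example of the pattern (band names are made up):

import operator

groups = [["DATE", "INSTRUMENT"]] + [["%s - MIN" % b, "%s - MAX" % b] for b in ("BLUE", "GREEN")]
header = reduce(operator.add, groups)  # Python 2 builtin reduce
# header == ['DATE', 'INSTRUMENT', 'BLUE - MIN', 'BLUE - MAX', 'GREEN - MIN', 'GREEN - MAX']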
def run(self):
    _log.info("Creating stack for band [%s]", self.band.name)

    data_type = get_dataset_type_datatype(self.dataset_type)
    ndv = get_dataset_type_ndv(self.dataset_type)
    metadata = None
    driver = None
    raster = None

    acq_min, acq_max, criteria = build_season_date_criteria(self.acq_min, self.acq_max, self.season,
                                                            seasons=SEASONS, extend=True)
    _log.info("\tacq %s to %s criteria is %s", acq_min, acq_max, criteria)

    dataset_types = [self.dataset_type]
    if self.mask_pqa_apply:
        dataset_types.append(DatasetType.PQ25)

    tiles = list_tiles_as_list(x=[self.x], y=[self.y], satellites=self.satellites,
                               acq_min=acq_min, acq_max=acq_max,
                               dataset_types=dataset_types, include=criteria)

    for index, tile in enumerate(tiles, start=1):
        dataset = tile.datasets[self.dataset_type]
        assert dataset

        # band = dataset.bands[self.band]
        # assert band
        band = self.band

        pqa = (self.mask_pqa_apply and DatasetType.PQ25 in tile.datasets) and tile.datasets[DatasetType.PQ25] or None

        if self.dataset_type not in tile.datasets:
            _log.debug("No [%s] dataset present for [%s] - skipping", self.dataset_type.name, tile.end_datetime)
            continue

        filename = self.output().path

        if not metadata:
            metadata = get_dataset_metadata(dataset)
            assert metadata

        if not driver:
            if self.output_format == OutputFormat.GEOTIFF:
                driver = gdal.GetDriverByName("GTiff")
            elif self.output_format == OutputFormat.ENVI:
                driver = gdal.GetDriverByName("ENVI")
            assert driver

        if not raster:
            if self.output_format == OutputFormat.GEOTIFF:
                raster = driver.Create(filename, metadata.shape[0], metadata.shape[1], len(tiles), data_type,
                                       options=["BIGTIFF=YES", "INTERLEAVE=BAND"])
            elif self.output_format == OutputFormat.ENVI:
                raster = driver.Create(filename, metadata.shape[0], metadata.shape[1], len(tiles), data_type,
                                       options=["INTERLEAVE=BSQ"])
            assert raster

            # NOTE: could do this without the metadata!!
            raster.SetGeoTransform(metadata.transform)
            raster.SetProjection(metadata.projection)
            raster.SetMetadata(self.generate_raster_metadata())

        mask = None
        if pqa:
            mask = get_mask_pqa(pqa, self.mask_pqa_mask, mask=mask)

        _log.info("Stacking [%s] band data from [%s] with PQA [%s] and PQA mask [%s] to [%s]",
                  band.name, dataset.path, pqa and pqa.path or "", pqa and self.mask_pqa_mask or "", filename)

        data = get_dataset_data_masked(dataset, mask=mask, ndv=ndv)
        _log.debug("data is [%s]", data)

        stack_band = raster.GetRasterBand(index)
        stack_band.SetDescription(os.path.basename(dataset.path))
        stack_band.SetNoDataValue(ndv)
        stack_band.WriteArray(data[band])
        stack_band.ComputeStatistics(True)
        stack_band.SetMetadata({"ACQ_DATE": format_date(tile.end_datetime),
                                "SATELLITE": dataset.satellite.name})
        stack_band.FlushCache()
        del stack_band

    if raster:
        raster.FlushCache()
        del raster
        raster = None
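A minimal sketch of reading one layer of the resulting stack back with the GDAL Python bindings; the file name is illustrative:

from osgeo import gdal

ds = gdal.Open('stack.tif')  # illustrative path to a stack produced by run()
band = ds.GetRasterBand(1)   # bands are indexed from 1, one per acquisition
print band.GetDescription(), band.GetMetadata().get('ACQ_DATE'), band.GetMetadata().get('SATELLITE')
arr = band.ReadAsArray()     # 2-D numpy array; no-data pixels hold the band's NoDataValue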
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

    config = Config()
    satellites = [Satellite(i) for i in ['LS5', 'LS7', 'LS8']]
    min_date = date(1995, 1, 1)
    max_date = date(2015, 12, 31)
    ds_type = DatasetType.ARG25

    x_cell = [146]
    y_cell = [-34]

    tiles = list_tiles_as_list(x=x_cell, y=y_cell,
                               acq_min=min_date,
                               acq_max=max_date,
                               satellites=satellites,
                               # every other call site uses dataset_types=; 'datasets=ds_type' looked like a typo
                               dataset_types=[ds_type],
                               database=config.get_db_database(),
                               user=config.get_db_username(),
                               password=config.get_db_password(),
                               host=config.get_db_host(),
                               port=config.get_db_port())

    #bs_workflow(tiles, outdir='/g/data/v10/testing_ground/jps547/percentiles')
    #bs_workflow(tiles, outdir='/g/data/v10/testing_ground/jps547/percentiles/pct_95', percentile=95)
    tidal_workflow(tiles, outdir='/g/data2/v10/ARG25-tidal-analysis/test', percentile=10)
    # Close the output files
    nbar_outds.close()
    all_outds.close()


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

    config = Config()
    #satellites = [Satellite(i) for i in ['LS5', 'LS7', 'LS8']]
    satellites = [Satellite(i) for i in ['LS5', 'LS7']]
    min_date = date(1987, 1, 1)
    max_date = date(2015, 12, 31)
    ds_type = DatasetType.ARG25

    x_cell = [146]
    y_cell = [-34]

    tiles = list_tiles_as_list(x=x_cell, y=y_cell,
                               acq_min=min_date,
                               acq_max=max_date,
                               satellites=satellites,
                               # as above, 'datasets=ds_type' appeared to be a typo for dataset_types
                               dataset_types=[ds_type],
                               database=config.get_db_database(),
                               user=config.get_db_username(),
                               password=config.get_db_password(),
                               host=config.get_db_host(),
                               port=config.get_db_port())

    #bs_workflow(tiles, outdir='/g/data/v10/testing_ground/jps547/percentiles')
    #bs_workflow(tiles, outdir='/g/data/v10/testing_ground/jps547/percentiles/pct_95', percentile=95)
    bs_workflow(tiles, outdir='/g/data/v10/testing_ground/jps547/percentiles/test', percentile=95)
def run(self):
    self.parse_arguments()

    config = Config()
    _log.debug(config.to_str())

    path = self.get_output_filename(self.dataset_type)
    _log.info("Output file is [%s]", path)

    if os.path.exists(path):
        if self.overwrite:
            _log.info("Removing existing output file [%s]", path)
            os.remove(path)
        else:
            _log.error("Output file [%s] exists", path)
            raise Exception("Output file [%s] already exists" % path)

    # TODO
    bands = get_bands(self.dataset_type, self.satellites[0])

    # TODO once WOFS is in the cube
    tiles = list_tiles_as_list(x=[self.x], y=[self.y],
                               acq_min=self.acq_min, acq_max=self.acq_max,
                               satellites=[satellite for satellite in self.satellites],
                               dataset_types=[self.dataset_type],
                               database=config.get_db_database(),
                               user=config.get_db_username(),
                               password=config.get_db_password(),
                               host=config.get_db_host(),
                               port=config.get_db_port())

    raster = None
    metadata = None

    # TODO - PQ is UNIT16 (others are INT16) and so -999 NDV doesn't work
    ndv = self.dataset_type == DatasetType.PQ25 and UINT16_MAX or NDV

    _log.debug("Current MAX RSS usage is [%d] MB", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024)

    import itertools

    for x, y in itertools.product(range(0, 4000, self.chunk_size_x), range(0, 4000, self.chunk_size_y)):
        _log.info("About to read data chunk ({xmin:4d},{ymin:4d}) to ({xmax:4d},{ymax:4d})".format(
            xmin=x, ymin=y, xmax=x + self.chunk_size_x - 1, ymax=y + self.chunk_size_y - 1))
        _log.debug("Current MAX RSS usage is [%d] MB", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024)

        stack = dict()

        for tile in tiles:
            if self.list_only:
                _log.info("Would summarise dataset [%s]", tile.datasets[self.dataset_type].path)
                continue

            pqa = None

            _log.debug("Reading dataset [%s]", tile.datasets[self.dataset_type].path)

            if not metadata:
                metadata = get_dataset_metadata(tile.datasets[self.dataset_type])

            # Apply PQA if specified
            if self.apply_pqa_filter:
                data = get_dataset_data_with_pq(tile.datasets[self.dataset_type],
                                                tile.datasets[DatasetType.PQ25],
                                                bands=bands, x=x, y=y,
                                                x_size=self.chunk_size_x, y_size=self.chunk_size_y,
                                                pq_masks=self.pqa_mask, ndv=ndv)
            else:
                data = get_dataset_data(tile.datasets[self.dataset_type], bands=bands, x=x, y=y,
                                        x_size=self.chunk_size_x, y_size=self.chunk_size_y)

            for band in bands:
                if band in stack:
                    stack[band].append(data[band])
                else:
                    stack[band] = [data[band]]

                _log.debug("data[%s] has shape [%s] and MB [%s]", band.name,
                           numpy.shape(data[band]), data[band].nbytes / 1000 / 1000)
                _log.debug("stack[%s] has [%s] elements", band.name, len(stack[band]))

        # Apply summary method
        _log.info("Finished reading {count} datasets for chunk ({xmin:4d},{ymin:4d}) to ({xmax:4d},{ymax:4d}) - about to summarise them".format(
            count=len(tiles), xmin=x, ymin=y, xmax=x + self.chunk_size_x - 1, ymax=y + self.chunk_size_y - 1))
        _log.debug("Current MAX RSS usage is [%d] MB", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024)

        masked_stack = dict()

        for band in bands:
            masked_stack[band] = numpy.ma.masked_equal(stack[band], ndv)
            _log.debug("masked_stack[%s] is %s", band.name, masked_stack[band])
            _log.debug("masked stack[%s] has shape [%s] and MB [%s]", band.name,
                       numpy.shape(masked_stack[band]), masked_stack[band].nbytes / 1000 / 1000)
            _log.debug("Current MAX RSS usage is [%d] MB", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024)

            if self.summary_method == TimeSeriesSummaryMethod.MIN:
                masked_summary = numpy.min(masked_stack[band], axis=0)

            elif self.summary_method == TimeSeriesSummaryMethod.MAX:
                masked_summary = numpy.max(masked_stack[band], axis=0)

            elif self.summary_method == TimeSeriesSummaryMethod.MEAN:
                masked_summary = numpy.mean(masked_stack[band], axis=0)

            elif self.summary_method == TimeSeriesSummaryMethod.MEDIAN:
                # numpy.ma.median is required here - plain numpy.median ignores the mask
                masked_summary = numpy.ma.median(masked_stack[band], axis=0)

            # aka 50th percentile
            elif self.summary_method == TimeSeriesSummaryMethod.MEDIAN_NON_INTERPOLATED:
                masked_sorted = numpy.ma.sort(masked_stack[band], axis=0)
                # 0.5 for the median - the original used 0.95, apparently copied from the PERCENTILE branch
                masked_percentile_index = numpy.ma.floor(
                    numpy.ma.count(masked_sorted, axis=0) * 0.5).astype(numpy.int16)
                masked_summary = numpy.ma.choose(masked_percentile_index, masked_sorted)

            elif self.summary_method == TimeSeriesSummaryMethod.COUNT:
                # TODO Need to artificially create masked array here since it is being expected/filled below!!!
                masked_summary = numpy.ma.masked_equal(masked_stack[band].count(axis=0), ndv)

            elif self.summary_method == TimeSeriesSummaryMethod.SUM:
                masked_summary = numpy.sum(masked_stack[band], axis=0)

            elif self.summary_method == TimeSeriesSummaryMethod.STANDARD_DEVIATION:
                masked_summary = numpy.std(masked_stack[band], axis=0)

            elif self.summary_method == TimeSeriesSummaryMethod.VARIANCE:
                masked_summary = numpy.var(masked_stack[band], axis=0)

            # currently 95th percentile
            elif self.summary_method == TimeSeriesSummaryMethod.PERCENTILE:
                masked_sorted = numpy.ma.sort(masked_stack[band], axis=0)
                masked_percentile_index = numpy.ma.floor(
                    numpy.ma.count(masked_sorted, axis=0) * 0.95).astype(numpy.int16)
                masked_summary = numpy.ma.choose(masked_percentile_index, masked_sorted)

            elif self.summary_method == TimeSeriesSummaryMethod.YOUNGEST_PIXEL:
                # TODO the fact that this is band at a time might be problematic.  We really should be
                # considering all bands at once (that is what the landsat_mosaic logic did).  If PQA is
                # being applied then it's probably all good but if not then we might get odd results....

                # (rows, cols) = (y, x); the original used chunk_size_x for both dimensions
                masked_summary = empty_array(shape=(self.chunk_size_y, self.chunk_size_x),
                                             dtype=numpy.int16, ndv=ndv)

                # Note the reversed as the stack is created oldest first
                for d in reversed(stack[band]):
                    masked_summary = numpy.where(masked_summary == ndv, d, masked_summary)

                    # If the summary doesn't contain any no data values then we can stop
                    if not numpy.any(masked_summary == ndv):
                        break

                # TODO Need to artificially create masked array here since it is being expected/filled below!!!
                masked_summary = numpy.ma.masked_equal(masked_summary, ndv)

            elif self.summary_method == TimeSeriesSummaryMethod.OLDEST_PIXEL:
                # TODO the fact that this is band at a time might be problematic.  We really should be
                # considering all bands at once (that is what the landsat_mosaic logic did).  If PQA is
                # being applied then it's probably all good but if not then we might get odd results....

                # (rows, cols) = (y, x); the original used chunk_size_x for both dimensions
                masked_summary = empty_array(shape=(self.chunk_size_y, self.chunk_size_x),
                                             dtype=numpy.int16, ndv=ndv)

                # Note the NOT reversed as the stack is created oldest first
                for d in stack[band]:
                    masked_summary = numpy.where(masked_summary == ndv, d, masked_summary)

                    # If the summary doesn't contain any no data values then we can stop
                    if not numpy.any(masked_summary == ndv):
                        break

                # TODO Need to artificially create masked array here since it is being expected/filled below!!!
                masked_summary = numpy.ma.masked_equal(masked_summary, ndv)

            masked_stack[band] = None
            _log.debug("NONE-ing masked stack[%s]", band.name)
            _log.debug("Current MAX RSS usage is [%d] MB", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024)

            _log.debug("masked summary is [%s]", masked_summary)
            _log.debug("Current MAX RSS usage is [%d] MB", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024)

            # Create the output file
            if not os.path.exists(path):
                _log.info("Creating raster [%s]", path)

                driver = gdal.GetDriverByName("GTiff")
                assert driver

                raster = driver.Create(path, metadata.shape[0], metadata.shape[1], len(bands), gdal.GDT_Int16)
                assert raster

                raster.SetGeoTransform(metadata.transform)
                raster.SetProjection(metadata.projection)

                for b in bands:
                    raster.GetRasterBand(b.value).SetNoDataValue(ndv)

            _log.info("Writing band [%s] data to raster [%s]", band.name, path)
            _log.debug("Current MAX RSS usage is [%d] MB", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024)

            raster.GetRasterBand(band.value).WriteArray(masked_summary.filled(ndv), xoff=x, yoff=y)
            raster.GetRasterBand(band.value).ComputeStatistics(True)

            raster.FlushCache()

            masked_summary = None
            _log.debug("NONE-ing the masked summary")
            _log.debug("Current MAX RSS usage is [%d] MB", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024)

        stack = None
        _log.debug("Just NONE-ed the stack")
        _log.debug("Current MAX RSS usage is [%d] MB", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024)

    raster = None
    _log.debug("Just NONE'd the raster")
    _log.debug("Current MAX RSS usage is [%d] MB", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024)

    _log.info("Memory usage was [%d MB]", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024)
    _log.info("CPU time used [%s]", timedelta(seconds=int(resource.getrusage(resource.RUSAGE_SELF).ru_utime)))
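The non-interpolated percentile branches above rely on sorting along the time axis (masked values sort to the end of each column) and then choosing the floor(valid_count * q)-th value per pixel. A small self-contained illustration with made-up values:

import numpy

stack = numpy.ma.masked_equal(numpy.array([[5, -999],
                                           [1,    7],
                                           [3, -999]]), -999)  # 3 dates x 2 pixels, ndv = -999
srt = numpy.ma.sort(stack, axis=0)  # per pixel: valid values first, masked values last
idx = numpy.ma.floor(numpy.ma.count(srt, axis=0) * 0.95).astype(numpy.int16)
p95 = numpy.ma.choose(idx, srt)     # picks srt[idx[j], j] for each pixel j -> [5, 7]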
def go(self):
    import numpy
    from datacube.api.query import list_cells_as_list, list_tiles_as_list
    from datacube.config import Config

    # Verify that all the requested satellites have the same band combinations
    dataset_bands = get_bands(self.dataset_type, self.satellites[0])
    _log.info("dataset bands is [%s]", " ".join([b.name for b in dataset_bands]))

    for satellite in self.satellites:
        if dataset_bands != get_bands(self.dataset_type, satellite):
            _log.error("Satellites [%s] have differing bands",
                       " ".join([satellite.name for satellite in self.satellites]))
            raise Exception("Satellites with different band combinations selected")

    bands = []
    dataset_bands_list = list(dataset_bands)

    if not self.bands:
        bands = dataset_bands_list
    else:
        for b in self.bands:
            bands.append(dataset_bands_list[b - 1])

    _log.info("Using bands [%s]", " ".join(band.name for band in bands))

    x_min, x_max, y_max, y_min = self.extract_bounds_from_vector()
    _log.debug("The bounds are [%s]", (x_min, x_max, y_min, y_max))

    cells_vector = self.extract_cells_from_vector()
    _log.debug("Intersecting cells_vector are [%d] [%s]", len(cells_vector), cells_vector)

    config = Config(os.path.expanduser("~/.datacube/config"))
    _log.debug(config.to_str())

    x_list = range(x_min, x_max + 1)
    y_list = range(y_min, y_max + 1)
    _log.debug("x = [%s] y=[%s]", x_list, y_list)

    cells_db = list()
    for cell in list_cells_as_list(x=x_list, y=y_list, acq_min=self.acq_min, acq_max=self.acq_max,
                                   satellites=[satellite for satellite in self.satellites],
                                   dataset_types=[self.dataset_type]):
        cells_db.append((cell.x, cell.y))
    _log.debug("Cells from DB are [%d] [%s]", len(cells_db), cells_db)

    cells = intersection(cells_vector, cells_db)
    _log.debug("Combined cells are [%d] [%s]", len(cells), cells)

    for (x, y) in cells:
        _log.info("Processing cell [%3d/%4d]", x, y)

        tiles = list_tiles_as_list(x=x_list, y=y_list, acq_min=self.acq_min, acq_max=self.acq_max,
                                   satellites=[satellite for satellite in self.satellites],
                                   dataset_types=[self.dataset_type])
        _log.info("There are [%d] tiles", len(tiles))

        if self.list_only:
            for tile in tiles:
                _log.info("Would process [%s]", tile.datasets[self.dataset_type].path)
            continue

        # Calculate the mask for the cell
        mask_aoi = self.get_mask_aoi_cell(x, y)

        pixel_count = 4000 * 4000
        pixel_count_aoi = (mask_aoi == False).sum()
        _log.debug("mask_aoi is [%s]\n[%s]", numpy.shape(mask_aoi), mask_aoi)

        metadata = None

        with self.get_output_file() as csv_file:
            csv_writer = csv.writer(csv_file)

            import operator

            header = reduce(operator.add, [["DATE", "INSTRUMENT", "# PIXELS", "# PIXELS IN AOI"]] + [
                ["%s - # DATA PIXELS" % b.name,
                 "%s - # DATA PIXELS AFTER PQA" % b.name,
                 "%s - # DATA PIXELS AFTER PQA WOFS" % b.name,
                 "%s - # DATA PIXELS AFTER PQA WOFS AOI" % b.name,
                 "%s - MIN" % b.name,
                 "%s - MAX" % b.name,
                 "%s - MEAN" % b.name] for b in bands])

            csv_writer.writerow(header)

            for tile in tiles:
                _log.info("Processing tile [%s]", tile.datasets[self.dataset_type].path)

                if self.list_only:
                    continue

                if not metadata:
                    metadata = get_dataset_metadata(tile.datasets[self.dataset_type])

                # Apply PQA if specified
                pqa = None
                mask_pqa = None
                if self.mask_pqa_apply and DatasetType.PQ25 in tile.datasets:
                    pqa = tile.datasets[DatasetType.PQ25]
                    mask_pqa = get_mask_pqa(pqa, self.mask_pqa_mask)
                    _log.debug("mask_pqa is [%s]\n[%s]", numpy.shape(mask_pqa), mask_pqa)

                # Apply WOFS if specified
                wofs = None
                mask_wofs = None
                if self.mask_wofs_apply and DatasetType.WATER in tile.datasets:
                    wofs = tile.datasets[DatasetType.WATER]
                    mask_wofs = get_mask_wofs(wofs, self.mask_wofs_mask)
                    _log.debug("mask_wofs is [%s]\n[%s]", numpy.shape(mask_wofs), mask_wofs)

                data = get_dataset_data(tile.datasets[self.dataset_type], bands=bands)
                _log.debug("data is [%s]\n[%s]", numpy.shape(data), data)

                pixel_count_data = dict()
                pixel_count_data_pqa = dict()
                pixel_count_data_pqa_wofs = dict()
                pixel_count_data_pqa_wofs_aoi = dict()
                mmin = dict()
                mmax = dict()
                mmean = dict()

                for band in bands:
                    data[band] = numpy.ma.masked_equal(data[band], NDV)
                    _log.debug("masked data is [%s] [%d]\n[%s]", numpy.shape(data), numpy.ma.count(data), data)
                    pixel_count_data[band] = numpy.ma.count(data[band])

                    if pqa:
                        data[band].mask = numpy.ma.mask_or(data[band].mask, mask_pqa)
                        _log.debug("PQA masked data is [%s] [%d]\n[%s]", numpy.shape(data[band]),
                                   numpy.ma.count(data[band]), data[band])
                    pixel_count_data_pqa[band] = numpy.ma.count(data[band])

                    if wofs:
                        data[band].mask = numpy.ma.mask_or(data[band].mask, mask_wofs)
                        _log.debug("WOFS masked data is [%s] [%d]\n[%s]", numpy.shape(data[band]),
                                   numpy.ma.count(data[band]), data[band])
                    pixel_count_data_pqa_wofs[band] = numpy.ma.count(data[band])

                    data[band].mask = numpy.ma.mask_or(data[band].mask, mask_aoi)
                    _log.debug("AOI masked data is [%s] [%d]\n[%s]", numpy.shape(data[band]),
                               numpy.ma.count(data[band]), data[band])
                    pixel_count_data_pqa_wofs_aoi[band] = numpy.ma.count(data[band])

                    mmin[band] = numpy.ma.min(data[band])
                    mmax[band] = numpy.ma.max(data[band])
                    mmean[band] = numpy.ma.mean(data[band])

                    # Convert the mean to an int...which is actually trickier than you would expect due to masking....
                    if numpy.ma.count(mmean[band]) != 0:
                        mmean[band] = mmean[band].astype(numpy.int16)

                # Should we output if no data values found?
                pixel_count_data_pqa_wofs_aoi_all_bands = reduce(operator.add,
                                                                 pixel_count_data_pqa_wofs_aoi.itervalues())
                if pixel_count_data_pqa_wofs_aoi_all_bands == 0 and not self.output_no_data:
                    _log.info("Skipping dataset with no non-masked data values in ANY band")
                    continue

                row = reduce(operator.add,
                             [[tile.end_datetime,
                               self.decode_satellite_as_instrument(tile.datasets[self.dataset_type].satellite),
                               pixel_count, pixel_count_aoi]] +
                             [[pixel_count_data[band], pixel_count_data_pqa[band],
                               pixel_count_data_pqa_wofs[band], pixel_count_data_pqa_wofs_aoi[band],
                               mmin[band], mmax[band], mmean[band]] for band in bands])

                csv_writer.writerow(row)