def file_pyramid(pyramid_factory):
    opensearch_endpoint = layer_source_info.get('opensearch_endpoint',
                                                ConfigParams().default_opensearch_endpoint)
    opensearch_collection_id = layer_source_info['opensearch_collection_id']
    opensearch_link_titles = metadata.opensearch_link_titles
    root_path = layer_source_info['root_path']

    factory = pyramid_factory(opensearch_endpoint, opensearch_collection_id, opensearch_link_titles, root_path)
    if single_level:
        # TODO EP-3561: UTM is not always the native projection of a layer (PROBA-V),
        #  need to determine the optimal projection.
        return factory.datacube_seq(projected_polygons_native_crs, from_date, to_date,
                                    metadata_properties(), correlation_id, datacubeParams)
    elif geometries:
        return factory.pyramid_seq(projected_polygons.polygons(), projected_polygons.crs(),
                                   from_date, to_date, metadata_properties(), correlation_id)
    else:
        return factory.pyramid_seq(extent, srs, from_date, to_date, metadata_properties(), correlation_id)
def __init__(self):
    master_str = "local[*]"
    conf = geopyspark_conf(master=master_str, appName="test")
    conf.set('spark.kryoserializer.buffer.max', value='1G')
    conf.set('spark.ui.enabled', True)

    if ConfigParams().is_ci_context:
        conf.set(key='spark.driver.memory', value='2G')
        conf.set(key='spark.executor.memory', value='2G')

    self.pysc = SparkContext.getOrCreate(conf)

    self.first = np.zeros((1, 4, 4))
    self.first.fill(1)

    self.second = np.zeros((1, 4, 4))
    self.second.fill(2)

    self.extent = {'xmin': 0.0, 'ymin': 0.0, 'xmax': 4.0, 'ymax': 4.0}
    self.layout = {'layoutCols': 1, 'layoutRows': 1, 'tileCols': 4, 'tileRows': 4}

    self.now = datetime.datetime.strptime("2017-09-25T11:37:00Z", '%Y-%m-%dT%H:%M:%SZ')
def _schedule_task(task_id: str, arguments: dict):
    task = {
        'task_id': task_id,
        'arguments': arguments
    }

    env = ConfigParams().async_task_handler_environment

    def encode(s: str) -> bytes:
        return s.encode('utf-8')

    producer = KafkaProducer(
        bootstrap_servers="epod-master1.vgt.vito.be:6668,epod-master2.vgt.vito.be:6668,epod-master3.vgt.vito.be:6668",
        security_protocol='PLAINTEXT',
        acks='all'
    )

    try:
        task_message = json.dumps(task)

        producer.send(topic="openeo-async-tasks",
                      value=encode(task_message),
                      headers=[('env', encode(env))] if env else None).get(timeout=120)

        _log.info(f"scheduled task {task_message} on env {env}")
    finally:
        producer.close()
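# Hedged usage sketch (not part of the original module): the task id and arguments below are
# illustrative assumptions, not verified constants; they only show the shape of the payload
# that ends up on the "openeo-async-tasks" topic.
#
#     _schedule_task(
#         task_id="delete_batch_process_dependency_sources",
#         arguments={"batch_job_id": "j-20220101-abc123",
#                    "dependency_sources": ["s3://some-bucket/some-subfolder"]},
#     )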
def __init__(self):
    self._root = '/openeo/services'
    self._hosts = ','.join(ConfigParams().zookeepernodes)
    with self._zk_client() as zk:
        zk.ensure_path(self._root)
    # Additional in-memory storage of server instances that were registered in the current process.
    self._services = {}
def main():
    repo = ZooKeeperUserDefinedProcessRepository(hosts=ConfigParams().zookeepernodes)

    user_id = 'vdboschj'
    process_graph_id = 'evi'
    udp_spec = {
        'id': process_graph_id,
        'process_graph': {
            'loadcollection1': {
                'process_id': 'load_collection',
                'arguments': {
                    'id': 'PROBAV_L3_S10_TOC_NDVI_333M'
                }
            }
        }
    }

    repo.save(user_id=user_id, process_id=process_graph_id, spec=udp_spec)

    udps = repo.get_for_user(user_id)
    for udp in udps:
        print(udp)

    repo.delete(user_id, process_graph_id)
    print(repo.get(user_id, process_graph_id))
def main():
    _log.info("ConfigParams(): {c}".format(c=ConfigParams()))

    parser = argparse.ArgumentParser(usage="OpenEO Cleaner",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--py4j-jarpath", default="venv/share/py4j/py4j0.10.7.jar",
                        help='Path to the Py4J jar')
    parser.add_argument("--py4j-classpath", default="geotrellis-extensions-2.2.0-SNAPSHOT.jar",
                        help='Classpath used to launch the Java Gateway')

    args = parser.parse_args()

    java_opts = [
        "-client",
        "-Dsoftware.amazon.awssdk.http.service.impl=software.amazon.awssdk.http.urlconnection.UrlConnectionSdkHttpService"
    ]

    java_gateway = JavaGateway.launch_gateway(jarpath=args.py4j_jarpath,
                                              classpath=args.py4j_classpath,
                                              javaopts=java_opts,
                                              die_on_exit=True)

    max_date = datetime.today() - timedelta(days=60)

    remove_batch_jobs_before(max_date, java_gateway.jvm)
    remove_secondary_services_before(max_date)
def file_probav_pyramid():
    opensearch_endpoint = layer_source_info.get('opensearch_endpoint',
                                                ConfigParams().default_opensearch_endpoint)
    return jvm.org.openeo.geotrellis.file.ProbaVPyramidFactory(
        opensearch_endpoint,
        layer_source_info.get('opensearch_collection_id'),
        layer_source_info.get('root_path'),
        jvm.geotrellis.raster.CellSize(cell_width, cell_height)
    ).pyramid_seq(extent, srs, from_date, to_date, band_indices, correlation_id)
def setup_batch_jobs():
    if not ConfigParams().is_ci_context:
        with JobRegistry() as job_registry:
            job_registry.ensure_paths()

        job_tracker = JobTracker(JobRegistry, principal="", keytab="")
        threading.Thread(target=job_tracker.loop_update_statuses, daemon=True).start()
def zk_client(hosts: str = ','.join(ConfigParams().zookeepernodes)):
    zk = KazooClient(hosts)
    zk.start()
    try:
        yield zk
    finally:
        zk.stop()
        zk.close()
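# Hedged usage sketch (assumption: `zk_client` is decorated with `contextlib.contextmanager`
# elsewhere in this module, as the `yield` and the try/finally cleanup suggest):
#
#     with zk_client() as zk:
#         zk.ensure_path("/openeo/jobs")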
def _create_job_dir(job_dir: Path):
    logger.info("creating job dir {j!r} (parent dir: {p})".format(j=job_dir, p=describe_path(job_dir.parent)))
    ensure_dir(job_dir)
    if not ConfigParams().is_kube_deploy:
        shutil.chown(job_dir, user=None, group='eodata')

    _add_permissions(job_dir, stat.S_ISGID | stat.S_IWGRP)  # make children inherit this group
def __init__(self):
    # TODO: do this with a config instead of hardcoding rules?
    self._service_registry = (InMemoryServiceRegistry() if ConfigParams().is_ci_context
                              else ZooKeeperServiceRegistry())

    super().__init__(
        secondary_services=GpsSecondaryServices(service_registry=self._service_registry),
        catalog=get_layer_catalog(service_registry=self._service_registry),
        batch_jobs=GpsBatchJobs(),
    )
def accumulo_pyramid():
    pyramidFactory = jvm.org.openeo.geotrellisaccumulo.PyramidFactory(
        "hdp-accumulo-instance", ','.join(ConfigParams().zookeepernodes))
    if layer_source_info.get("split", False):
        pyramidFactory.setSplitRanges(True)

    accumulo_layer_name = layer_source_info['data_id']
    nonlocal still_needs_band_filter
    still_needs_band_filter = bool(band_indices)

    return pyramidFactory.pyramid_seq(accumulo_layer_name, extent, srs, from_date, to_date)
def update_zookeeper(host: str, port: int, env: str) -> None:
    from kazoo.client import KazooClient
    from openeogeotrellis.configparams import ConfigParams

    cluster_id = 'openeo-' + env
    zk = KazooClient(hosts=','.join(ConfigParams().zookeepernodes))
    zk.start()

    try:
        Traefik(zk).add_load_balanced_server(cluster_id=cluster_id, server_id="0",
                                             host=host, port=port, environment=env)
    finally:
        zk.stop()
        zk.close()
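# Hedged usage sketch (hypothetical host/port values, for illustration only): registering the
# running web app as a load-balanced Traefik backend of the "openeo-dev" cluster.
#
#     update_zookeeper(host="myhost.example.com", port=43210, env="dev")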
def accumulo_pyramid():
    pyramidFactory = jvm.org.openeo.geotrellisaccumulo.PyramidFactory(
        "hdp-accumulo-instance", ','.join(ConfigParams().zookeepernodes))
    if layer_source_info.get("split", False):
        pyramidFactory.setSplitRanges(True)

    accumulo_layer_name = layer_source_info['data_id']
    nonlocal still_needs_band_filter
    still_needs_band_filter = bool(band_indices)

    polygons = load_params.aggregate_spatial_geometries

    if polygons:
        projected_polygons = to_projected_polygons(jvm, polygons)
        return pyramidFactory.pyramid_seq(accumulo_layer_name, projected_polygons.polygons(),
                                          projected_polygons.crs(), from_date, to_date)
    else:
        return pyramidFactory.pyramid_seq(accumulo_layer_name, extent, srs, from_date, to_date)
def get_layer_catalog(service_registry: AbstractServiceRegistry = None) -> GeoPySparkLayerCatalog:
    """
    Get layer catalog (from JSON files)
    """
    catalog_files = ConfigParams().layer_catalog_metadata_files
    logger.info("Reading layer catalog metadata from {f!r}".format(f=catalog_files[0]))
    metadata = read_json(catalog_files[0])
    if len(catalog_files) > 1:
        # Merge metadata recursively
        metadata = {l["id"]: l for l in metadata}
        for path in catalog_files[1:]:
            logger.info("Updating layer catalog metadata from {f!r}".format(f=path))
            updates = {l["id"]: l for l in read_json(path)}
            metadata = dict_merge_recursive(metadata, updates, overwrite=True)
        metadata = list(metadata.values())

    return GeoPySparkLayerCatalog(
        all_metadata=metadata,
        service_registry=service_registry or InMemoryServiceRegistry()
    )
def update_statuses(self) -> None:
    with self._job_registry() as registry:
        registry.ensure_paths()

        jobs_to_track = registry.get_running_jobs()

        for job_info in jobs_to_track:
            try:
                job_id, user_id = job_info['job_id'], job_info['user_id']
                application_id, current_status = job_info['application_id'], job_info['status']

                if application_id:
                    try:
                        if ConfigParams().is_kube_deploy:
                            from openeogeotrellis.utils import s3_client, download_s3_dir

                            state, start_time, finish_time = JobTracker._kube_status(job_id, user_id)
                            new_status = JobTracker._kube_status_parser(state)

                            registry.patch(job_id, user_id,
                                           status=new_status,
                                           started=start_time,
                                           finished=finish_time)

                            if current_status != new_status:
                                _log.info("changed job %s status from %s to %s"
                                          % (job_id, current_status, new_status), extra={'job_id': job_id})

                            if state == "COMPLETED":
                                # TODO: do we support SHub batch processes in this environment? The AWS
                                #  credentials conflict.
                                download_s3_dir("OpenEO-data", "batch_jobs/{j}".format(j=job_id))

                                result_metadata = self._batch_jobs.get_results_metadata(job_id, user_id)
                                registry.patch(job_id, user_id, **result_metadata)

                                registry.mark_done(job_id, user_id)
                                _log.info("marked %s as done" % job_id, extra={'job_id': job_id})
                        else:
                            state, final_state, start_time, finish_time, aggregate_resource_allocation = \
                                JobTracker._yarn_status(application_id)

                            memory_time_megabyte_seconds, cpu_time_seconds = \
                                JobTracker._parse_resource_allocation(aggregate_resource_allocation)

                            new_status = JobTracker._to_openeo_status(state, final_state)

                            registry.patch(job_id, user_id,
                                           status=new_status,
                                           started=JobTracker._to_serializable_datetime(start_time),
                                           finished=JobTracker._to_serializable_datetime(finish_time),
                                           memory_time_megabyte_seconds=memory_time_megabyte_seconds,
                                           cpu_time_seconds=cpu_time_seconds)

                            if current_status != new_status:
                                _log.info("changed job %s status from %s to %s"
                                          % (job_id, current_status, new_status), extra={'job_id': job_id})

                            if final_state != "UNDEFINED":
                                result_metadata = self._batch_jobs.get_results_metadata(job_id, user_id)
                                # TODO: skip patching the job znode and read from this file directly?
                                registry.patch(job_id, user_id, **result_metadata)

                                if new_status == 'finished':
                                    registry.remove_dependencies(job_id, user_id)

                                    dependency_sources = JobRegistry.get_dependency_sources(job_info)
                                    if dependency_sources:
                                        async_task.schedule_delete_batch_process_dependency_sources(
                                            job_id, dependency_sources)

                                registry.mark_done(job_id, user_id)

                                _log.info("marked %s as done" % job_id, extra={
                                    'job_id': job_id,
                                    'area': result_metadata.get('area'),
                                    'unique_process_ids': result_metadata.get('unique_process_ids'),
                                    'cpu_time_seconds': cpu_time_seconds
                                })
                    except JobTracker._UnknownApplicationIdException:
                        registry.mark_done(job_id, user_id)
            except Exception:
                _log.warning("resuming with remaining jobs after failing to handle batch job {j}:\n{e}"
                             .format(j=job_id, e=traceback.format_exc()), extra={'job_id': job_id})
                registry.set_status(job_id, user_id, 'error')
                registry.mark_done(job_id, user_id)
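# Hedged usage sketch, based on the setup_batch_jobs() snippet above: a tracker constructed with
# the JobRegistry class and (empty) Kerberos credentials can run a single status sweep.
#
#     JobTracker(JobRegistry, principal="", keytab="").update_statuses()
#
# In the web app, loop_update_statuses() is started in a daemon thread instead of calling
# update_statuses() once.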
if __name__ == '__main__':
    import argparse

    logging.basicConfig(level=logging.INFO)
    openeogeotrellis.backend.logger.setLevel(logging.DEBUG)

    handler = logging.StreamHandler(stream=sys.stdout)
    handler.formatter = JsonFormatter("%(asctime)s %(name)s %(levelname)s %(message)s",
                                      datefmt="%Y-%m-%dT%H:%M:%S%z")

    root_logger = logging.getLogger()
    root_logger.addHandler(handler)

    _log.info("ConfigParams(): {c}".format(c=ConfigParams()))

    parser = argparse.ArgumentParser(usage="OpenEO JobTracker",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--principal", default="*****@*****.**",
                        help="Principal to be used to login to KDC")
    parser.add_argument("--keytab", default="openeo-deploy/mep/openeo.keytab",
                        help="The full path to the file that contains the keytab for the principal")
    args = parser.parse_args()

    try:
def creodias(
        self,
        projected_polygons,
        from_date: str, to_date: str,
        collection_id: str = "Sentinel1",
        correlation_id: str = "NA",
        sar_backscatter_arguments: SarBackscatterArgs = SarBackscatterArgs(),
        bands=None,
        zoom=0,  # TODO: what to do with zoom? It is not used at the moment.
        result_dtype="float32",
        extra_properties={}
) -> Dict[int, geopyspark.TiledRasterLayer]:
    """
    Implementation of S1 backscatter calculation with Orfeo in the Creodias environment
    """
    # Initial argument checking
    bands = bands or ["VH", "VV"]
    sar_calibration_lut = self._get_sar_calibration_lut(sar_backscatter_arguments.coefficient)
    if sar_backscatter_arguments.mask:
        raise FeatureUnsupportedException("sar_backscatter: mask band is not supported")
    if sar_backscatter_arguments.contributing_area:
        raise FeatureUnsupportedException("sar_backscatter: contributing_area band is not supported")
    if sar_backscatter_arguments.local_incidence_angle:
        raise FeatureUnsupportedException("sar_backscatter: local_incidence_angle band is not supported")
    if sar_backscatter_arguments.ellipsoid_incidence_angle:
        raise FeatureUnsupportedException("sar_backscatter: ellipsoid_incidence_angle band is not supported")

    # Tile size to use in the TiledRasterLayer.
    tile_size = sar_backscatter_arguments.options.get("tile_size", self._DEFAULT_TILE_SIZE)
    orfeo_memory = sar_backscatter_arguments.options.get("otb_memory", 256)

    # Geoid for orthorectification: get from options, fall back on config.
    elev_geoid = sar_backscatter_arguments.options.get("elev_geoid") or ConfigParams().s1backscatter_elev_geoid
    elev_default = sar_backscatter_arguments.options.get("elev_default")
    logger.info(f"elev_geoid: {elev_geoid!r}")

    noise_removal = bool(sar_backscatter_arguments.noise_removal)
    debug_mode = smart_bool(sar_backscatter_arguments.options.get("debug"))

    feature_pyrdd, layer_metadata_py = self._build_feature_rdd(
        collection_id=collection_id, projected_polygons=projected_polygons,
        from_date=from_date, to_date=to_date, extra_properties=extra_properties,
        tile_size=tile_size, zoom=zoom, correlation_id=correlation_id)
    if debug_mode:
        self._debug_show_rdd_info(feature_pyrdd)

    # Group multiple tiles by product id
    def process_feature(feature: dict) -> Tuple[str, dict]:
        creo_path = feature["feature"]["id"]
        return creo_path, {
            "key": feature["key"],
            "key_extent": feature["key_extent"],
            "bbox": feature["feature"]["bbox"],
            "key_epsg": feature["metadata"]["crs_epsg"]
        }

    per_product = feature_pyrdd.map(process_feature).groupByKey().mapValues(list)

    # TODO: still split if full layout extent is too large for processing as a whole?

    # Apply Orfeo processing over whole product files and split up into tiles after that.
    @epsel.ensure_info_logging
    @TimingLogger(title="process_product", logger=logger)
    def process_product(product: Tuple[str, List[dict]]):
        import faulthandler
        faulthandler.enable()
        creo_path, features = product

        # Short ad-hoc product id for logging purposes.
        prod_id = re.sub(r"[^A-Z0-9]", "", creo_path.upper())[-10:]
        log_prefix = f"p{os.getpid()}-prod{prod_id}"
        logger.info(f"{log_prefix} creo path {creo_path}")
        logger.info(f"{log_prefix} sar_backscatter_arguments: {sar_backscatter_arguments!r}")

        creo_path = pathlib.Path(creo_path)
        if not creo_path.exists():
            raise OpenEOApiException("Creo path does not exist")

        # Get whole extent of tile layout
        col_min = min(f["key"]["col"] for f in features)
        col_max = max(f["key"]["col"] for f in features)
        cols = col_max - col_min + 1
        row_min = min(f["key"]["row"] for f in features)
        row_max = max(f["key"]["row"] for f in features)
        rows = row_max - row_min + 1
        instants = set(f["key"]["instant"] for f in features)
        assert len(instants) == 1, f"Not single instant: {instants}"
        instant = instants.pop()
        logger.info(
            f"{log_prefix} Layout key extent: col[{col_min}:{col_max}] row[{row_min}:{row_max}]"
            f" ({cols}x{rows}={cols * rows} tiles) instant[{instant}]."
        )

        layout_extent = get_total_extent(features)

        key_epsgs = set(f["key_epsg"] for f in features)
        assert len(key_epsgs) == 1, f"Multiple key CRSs {key_epsgs}"
        layout_epsg = key_epsgs.pop()
        layout_width_px = tile_size * (col_max - col_min + 1)
        layout_height_px = tile_size * (row_max - row_min + 1)
        logger.info(
            f"{log_prefix} Layout extent {layout_extent} EPSG {layout_epsg}:"
            f" {layout_width_px}x{layout_height_px}px"
        )

        band_tiffs = S1BackscatterOrfeo._creo_scan_for_band_tiffs(creo_path, log_prefix)

        dem_dir_context = S1BackscatterOrfeo._get_dem_dir_context(
            sar_backscatter_arguments=sar_backscatter_arguments,
            extent=layout_extent,
            epsg=layout_epsg
        )

        msg = f"{log_prefix} Process {creo_path}"
        with TimingLogger(title=msg, logger=logger), dem_dir_context as dem_dir:
            # Allocate numpy array covering the whole layout extent
            orfeo_bands = numpy.zeros((len(bands), layout_height_px, layout_width_px), dtype=result_dtype)

            for b, band in enumerate(bands):
                if band.lower() not in band_tiffs:
                    raise OpenEOApiException(f"No tiff for band {band}")
                data, nodata = S1BackscatterOrfeoV2._orfeo_pipeline(
                    input_tiff=band_tiffs[band.lower()],
                    extent=layout_extent, extent_epsg=layout_epsg,
                    dem_dir=dem_dir,
                    extent_width_px=layout_width_px, extent_height_px=layout_height_px,
                    sar_calibration_lut=sar_calibration_lut,
                    noise_removal=noise_removal,
                    elev_geoid=elev_geoid, elev_default=elev_default,
                    log_prefix=f"{log_prefix}-{band}",
                    orfeo_memory=orfeo_memory
                )
                orfeo_bands[b] = data

            if sar_backscatter_arguments.options.get("to_db", False):
                # TODO: keep this "to_db" shortcut feature or drop it
                #  and require the user to use standard openEO functionality (`apply` based conversion)?
                logger.info(f"{log_prefix} Converting backscatter intensity to decibel")
                orfeo_bands = 10 * numpy.log10(orfeo_bands)

            # Split Orfeo output in tiles
            logger.info(f"{log_prefix} Split {orfeo_bands.shape} in tiles of {tile_size}")
            cell_type = geopyspark.CellType(orfeo_bands.dtype.name)
            tiles = []
            for c in range(col_max - col_min + 1):
                for r in range(row_max - row_min + 1):
                    col = col_min + c
                    row = row_min + r
                    key = geopyspark.SpaceTimeKey(col=col, row=row, instant=_instant_ms_to_day(instant))
                    tile = orfeo_bands[:, r * tile_size:(r + 1) * tile_size, c * tile_size:(c + 1) * tile_size]
                    if not (tile == nodata).all():
                        logger.info(f"{log_prefix} Create Tile for key {key} from {tile.shape}")
                        tile = geopyspark.Tile(tile, cell_type, no_data_value=nodata)
                        tiles.append((key, tile))

            logger.info(f"{log_prefix} Layout extent split in {len(tiles)} tiles")
            return tiles

    paths = list(per_product.keys().collect())

    def partitionByPath(tuple):
        try:
            return paths.index(tuple)
        except Exception:
            hashPartitioner = pyspark.rdd.portable_hash
            return hashPartitioner(tuple)

    grouped = per_product.partitionBy(per_product.count(), partitionByPath)
    tile_rdd = grouped.flatMap(process_product)

    if result_dtype:
        layer_metadata_py.cell_type = result_dtype

    logger.info("Constructing TiledRasterLayer from numpy rdd, with metadata {m!r}".format(m=layer_metadata_py))
    tile_layer = geopyspark.TiledRasterLayer.from_numpy_rdd(
        layer_type=geopyspark.LayerType.SPACETIME,
        numpy_rdd=tile_rdd,
        metadata=layer_metadata_py
    )
    return {zoom: tile_layer}
def _get_process_function(sar_backscatter_arguments, result_dtype, bands):
    # Tile size to use in the TiledRasterLayer.
    tile_size = sar_backscatter_arguments.options.get("tile_size", S1BackscatterOrfeo._DEFAULT_TILE_SIZE)
    noise_removal = bool(sar_backscatter_arguments.noise_removal)

    # Geoid for orthorectification: get from options, fall back on config.
    elev_geoid = sar_backscatter_arguments.options.get("elev_geoid") or ConfigParams().s1backscatter_elev_geoid
    elev_default = sar_backscatter_arguments.options.get("elev_default")
    logger.info(f"elev_geoid: {elev_geoid!r}")

    sar_calibration_lut = S1BackscatterOrfeo._get_sar_calibration_lut(sar_backscatter_arguments.coefficient)

    @epsel.ensure_info_logging
    @TimingLogger(title="process_feature", logger=logger)
    def process_feature(product: Tuple[str, List[dict]]):
        import faulthandler
        faulthandler.enable()
        creo_path, features = product
        prod_id = re.sub(r"[^A-Z0-9]", "", creo_path.upper())[-10:]
        log_prefix = f"p{os.getpid()}-prod{prod_id}"
        print(f"{log_prefix} creo path {creo_path}")
        logger.info(f"{log_prefix} sar_backscatter_arguments: {sar_backscatter_arguments!r}")

        layout_extent = get_total_extent(features)
        key_epsgs = set(f["key_epsg"] for f in features)
        assert len(key_epsgs) == 1, f"Multiple key CRSs {key_epsgs}"
        layout_epsg = key_epsgs.pop()

        dem_dir_context = S1BackscatterOrfeo._get_dem_dir_context(
            sar_backscatter_arguments=sar_backscatter_arguments,
            extent=layout_extent,
            epsg=layout_epsg
        )

        creo_path = pathlib.Path(creo_path)
        band_tiffs = S1BackscatterOrfeo._creo_scan_for_band_tiffs(creo_path, log_prefix)

        resultlist = []

        with dem_dir_context as dem_dir:
            for feature in features:
                col, row, instant = (feature["key"][k] for k in ["col", "row", "instant"])

                key_ext = feature["key_extent"]
                key_epsg = layout_epsg

                logger.info(f"{log_prefix} Feature creo path: {creo_path}, key {key_ext} (EPSG {key_epsg})")
                logger.info(f"{log_prefix} sar_backscatter_arguments: {sar_backscatter_arguments!r}")

                if not creo_path.exists():
                    raise OpenEOApiException("Creo path does not exist")

                msg = f"{log_prefix} Process {creo_path} and load into geopyspark Tile"
                with TimingLogger(title=msg, logger=logger):
                    # Allocate numpy array tile
                    tile_data = numpy.zeros((len(bands), tile_size, tile_size), dtype=result_dtype)
                    for b, band in enumerate(bands):
                        if band.lower() not in band_tiffs:
                            raise OpenEOApiException(f"No tiff for band {band}")
                        data, nodata = S1BackscatterOrfeo._orfeo_pipeline(
                            input_tiff=band_tiffs[band.lower()],
                            extent=key_ext, extent_epsg=key_epsg,
                            dem_dir=dem_dir,
                            extent_width_px=tile_size, extent_height_px=tile_size,
                            sar_calibration_lut=sar_calibration_lut,
                            noise_removal=noise_removal,
                            elev_geoid=elev_geoid, elev_default=elev_default,
                            log_prefix=f"{log_prefix}-{band}"
                        )
                        tile_data[b] = data

                    if sar_backscatter_arguments.options.get("to_db", False):
                        # TODO: keep this "to_db" shortcut feature or drop it
                        #  and require the user to use standard openEO functionality (`apply` based conversion)?
                        logger.info(f"{log_prefix} Converting backscatter intensity to decibel")
                        tile_data = 10 * numpy.log10(tile_data)

                    key = geopyspark.SpaceTimeKey(row=row, col=col, instant=_instant_ms_to_day(instant))
                    cell_type = geopyspark.CellType(tile_data.dtype.name)
                    logger.info(f"{log_prefix} Create Tile for key {key} from {tile_data.shape}")
                    tile = geopyspark.Tile(tile_data, cell_type, no_data_value=nodata)
                    resultlist.append((key, tile))

        return resultlist

    return process_feature
def __init__(
        self,
        root_path: str = ConfigParams().batch_jobs_zookeeper_root_path,
        zookeeper_hosts: str = ','.join(ConfigParams().zookeepernodes)):
    self._root = root_path
    self._zk = KazooClient(hosts=zookeeper_hosts)
def __init__(self, zookeeper_hosts: str = ','.join(ConfigParams().zookeepernodes)):
    self._root = '/openeo/jobs'
    self._zk = KazooClient(hosts=zookeeper_hosts)
def run_job(job_specification, output_file: Path, metadata_file: Path, api_version, job_dir,
            dependencies: dict, user_id: str = None):
    logger.info(f"Job spec: {json.dumps(job_specification, indent=1)}")
    process_graph = job_specification['process_graph']

    backend_implementation = GeoPySparkBackendImplementation()
    logger.info(f"Using backend implementation {backend_implementation}")
    correlation_id = str(uuid.uuid4())
    logger.info(f"Correlation id: {correlation_id}")

    env = EvalEnv({
        'version': api_version or "1.0.0",
        'pyramid_levels': 'highest',
        'user': User(user_id=user_id),
        'require_bounds': True,
        'correlation_id': correlation_id,
        'dependencies': dependencies,
        "backend_implementation": backend_implementation,
    })

    tracer = DryRunDataTracer()
    logger.info("Starting process graph evaluation")
    result = ProcessGraphDeserializer.evaluate(process_graph, env=env, do_dry_run=tracer)
    logger.info("Evaluated process graph, result (type {t}): {r!r}".format(t=type(result), r=result))

    if isinstance(result, DelayedVector):
        geojsons = (mapping(geometry) for geometry in result.geometries)
        result = JSONResult(geojsons)

    if isinstance(result, DriverDataCube):
        format_options = job_specification.get('output', {})
        format_options["batch_mode"] = True
        result = ImageCollectionResult(cube=result, format='GTiff', options=format_options)

    if not isinstance(result, SaveResult):  # Assume generic JSON result
        result = JSONResult(result)

    global_metadata_attributes = {
        "title": job_specification.get("title", ""),
        "description": job_specification.get("description", ""),
        "institution": "openEO platform - Geotrellis backend: " + __version__
    }

    assets_metadata = None
    if 'write_assets' in dir(result):
        result.options["batch_mode"] = True
        result.options["file_metadata"] = global_metadata_attributes
        if result.options.get("sample_by_feature"):
            geoms = tracer.get_geometries("filter_spatial")
            if len(geoms) > 1:
                logger.warning("Multiple aggregate_spatial geometries: {c}".format(c=len(geoms)))
            elif len(geoms) == 0:
                logger.warning("sample_by_feature enabled, but no geometries found. "
                               "They can be specified using filter_spatial.")
            else:
                result.options["geometries"] = geoms[0]
            if result.options["geometries"] is None:
                logger.error("sample_by_feature was set, but no geometries were provided through filter_spatial. "
                             "Make sure to provide geometries.")
        assets_metadata = result.write_assets(str(output_file))
        for name, asset in assets_metadata.items():
            _add_permissions(Path(asset["href"]), stat.S_IWGRP)
        logger.info("wrote image collection to %s" % output_file)
    elif isinstance(result, ImageCollectionResult):
        result.options["batch_mode"] = True
        result.save_result(filename=str(output_file))
        _add_permissions(output_file, stat.S_IWGRP)
        logger.info("wrote image collection to %s" % output_file)
    elif isinstance(result, MultipleFilesResult):
        result.reduce(output_file, delete_originals=True)
        _add_permissions(output_file, stat.S_IWGRP)
        logger.info("reduced %d files to %s" % (len(result.files), output_file))
    elif isinstance(result, NullResult):
        logger.info("skipping output file %s" % output_file)
    else:
        raise NotImplementedError("unsupported result type {r}".format(r=type(result)))

    if any(card4l for _, card4l in dependencies.values()):  # TODO: clean this up
        logger.debug("awaiting Sentinel Hub CARD4L data...")

        s3_service = get_jvm().org.openeo.geotrellissentinelhub.S3Service()

        poll_interval_secs = 10
        max_delay_secs = 600

        card4l_dependencies = [
            (collection_id, source_location)
            for (collection_id, metadata_properties), (source_location, card4l) in dependencies.items()
            if card4l
        ]

        for collection_id, source_location in card4l_dependencies:
            uri_parts = urlparse(source_location)
            bucket_name = uri_parts.hostname
            request_group_id = uri_parts.path[1:]

            try:
                # FIXME: incorporate collection_id and metadata_properties to make sure the files don't clash
                s3_service.download_stac_data(bucket_name, request_group_id, str(job_dir),
                                              poll_interval_secs, max_delay_secs)
                logger.info("downloaded CARD4L data in {b}/{g} to {d}".format(
                    b=bucket_name, g=request_group_id, d=job_dir))
            except Py4JJavaError as e:
                java_exception = e.java_exception

                if (java_exception.getClass().getName() ==
                        'org.openeo.geotrellissentinelhub.S3Service$StacMetadataUnavailableException'):
                    logger.warning("could not find CARD4L metadata to download from s3://{b}/{r} after {d}s"
                                   .format(b=bucket_name, r=request_group_id, d=max_delay_secs))
                else:
                    raise e

    _transform_stac_metadata(job_dir)

    unique_process_ids = CollectUniqueProcessIdsVisitor().accept_process_graph(process_graph).process_ids

    _export_result_metadata(tracer=tracer, result=result, output_file=output_file, metadata_file=metadata_file,
                            unique_process_ids=unique_process_ids, asset_metadata=assets_metadata)

    if ConfigParams().is_kube_deploy:
        import boto3
        from openeogeotrellis.utils import s3_client

        bucket = os.environ.get('SWIFT_BUCKET')
        s3_instance = s3_client()

        logger.info("Writing results to object storage")
        for file in os.listdir(job_dir):
            full_path = str(job_dir) + "/" + file
            s3_instance.upload_file(full_path, bucket, full_path.strip("/"))
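# Hedged invocation sketch for run_job (paths, job id and the minimal process graph are
# hypothetical; the collection id is reused from the UDP example earlier in this section):
#
#     run_job(
#         job_specification={
#             "process_graph": {
#                 "loadcollection1": {
#                     "process_id": "load_collection",
#                     "arguments": {"id": "PROBAV_L3_S10_TOC_NDVI_333M"},
#                     "result": True,
#                 }
#             }
#         },
#         output_file=Path("/data/jobs/j-1234/out"),
#         metadata_file=Path("/data/jobs/j-1234/job_metadata.json"),
#         api_version="1.0.0",
#         job_dir=Path("/data/jobs/j-1234"),
#         dependencies={},
#         user_id="someuser",
#     )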
def get_layer_catalog(opensearch_enrich=False) -> GeoPySparkLayerCatalog:
    """
    Get layer catalog (from JSON files)
    """
    metadata: Dict[str, dict] = {}

    def read_catalog_file(catalog_file) -> Dict[str, dict]:
        return {coll["id"]: coll for coll in read_json(catalog_file)}

    catalog_files = ConfigParams().layer_catalog_metadata_files
    for path in catalog_files:
        logger.info(f"Reading layer catalog metadata from {path}")
        metadata = dict_merge_recursive(metadata, read_catalog_file(path), overwrite=True)

    if opensearch_enrich:
        opensearch_metadata = {}
        sh_collection_metadatas = None
        opensearch_instances = {}

        def opensearch_instance(endpoint: str) -> OpenSearch:
            endpoint = endpoint.lower()
            opensearch = opensearch_instances.get(endpoint)

            if opensearch is not None:
                return opensearch

            if "oscars" in endpoint or "terrascope" in endpoint or "vito.be" in endpoint:
                opensearch = OpenSearchOscars(endpoint=endpoint)
            elif "creodias" in endpoint:
                opensearch = OpenSearchCreodias(endpoint=endpoint)
            else:
                raise ValueError(endpoint)

            opensearch_instances[endpoint] = opensearch
            return opensearch

        for cid, collection_metadata in metadata.items():
            data_source = deep_get(collection_metadata, "_vito", "data_source", default={})
            os_cid = data_source.get("opensearch_collection_id")
            if os_cid:
                os_endpoint = data_source.get("opensearch_endpoint") or ConfigParams().default_opensearch_endpoint
                logger.info(f"Updating {cid} metadata from {os_endpoint}:{os_cid}")
                try:
                    opensearch_metadata[cid] = opensearch_instance(os_endpoint).get_metadata(collection_id=os_cid)
                except Exception:
                    logger.warning(traceback.format_exc())
            elif data_source.get("type") == "sentinel-hub":
                sh_cid = data_source.get("collection_id")

                if sh_cid is None:
                    continue

                try:
                    sh_stac_endpoint = "https://collections.eurodatacube.com/stac/index.json"

                    if sh_collection_metadatas is None:
                        sh_collections = requests.get(sh_stac_endpoint).json()
                        sh_collection_metadatas = [requests.get(c["link"]).json() for c in sh_collections]

                    sh_metadata = next(filter(lambda m: m["datasource_type"] == sh_cid, sh_collection_metadatas))
                    logger.info(f"Updating {cid} metadata from {sh_stac_endpoint}:{sh_metadata['id']}")
                    opensearch_metadata[cid] = sh_metadata
                    if not data_source.get("endpoint"):
                        endpoint = opensearch_metadata[cid]["providers"][0]["url"]
                        endpoint = endpoint if endpoint.startswith("http") else "https://{}".format(endpoint)
                        data_source["endpoint"] = endpoint
                    data_source["dataset_id"] = (data_source.get("dataset_id")
                                                 or opensearch_metadata[cid]["datasource_type"])
                except StopIteration:
                    logger.warning(f"No STAC data available for collection with id {sh_cid}")

        if opensearch_metadata:
            metadata = dict_merge_recursive(opensearch_metadata, metadata, overwrite=True)

    metadata = _merge_layers_with_common_name(metadata)

    return GeoPySparkLayerCatalog(all_metadata=list(metadata.values()))
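# Hedged usage sketch (mirrors the async task handler below, which builds its GpsBatchJobs with
# an enriched catalog):
#
#     catalog = get_layer_catalog(opensearch_enrich=True)  # enrich from OpenSearch / Sentinel Hub STAC
#     catalog = get_layer_catalog()                        # JSON catalog metadata files only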
def load_collection(self, collection_id: str, viewing_parameters: dict) -> 'GeotrellisTimeSeriesImageCollection':
    logger.info("Creating layer for {c} with viewingParameters {v}".format(c=collection_id, v=viewing_parameters))

    # TODO is it necessary to do this kerberos stuff here?
    kerberos()

    metadata = CollectionMetadata(self.get_collection_metadata(collection_id, strip_private=False))
    layer_source_info = metadata.get("_vito", "data_source", default={})
    layer_source_type = layer_source_info.get("type", "Accumulo").lower()
    logger.info("Layer source type: {s!r}".format(s=layer_source_type))

    import geopyspark as gps
    from_date = normalize_date(viewing_parameters.get("from", None))
    to_date = normalize_date(viewing_parameters.get("to", None))

    left = viewing_parameters.get("left", None)
    right = viewing_parameters.get("right", None)
    top = viewing_parameters.get("top", None)
    bottom = viewing_parameters.get("bottom", None)
    srs = viewing_parameters.get("srs", None)
    bands = viewing_parameters.get("bands", None)
    band_indices = [metadata.get_band_index(b) for b in bands] if bands else None
    logger.info("band_indices: {b!r}".format(b=band_indices))
    # TODO: avoid this `still_needs_band_filter` ugliness.
    #       Also see https://github.com/Open-EO/openeo-geopyspark-driver/issues/29
    still_needs_band_filter = False
    pysc = gps.get_spark_context()
    extent = None

    gateway = JavaGateway(eager_load=True, gateway_parameters=pysc._gateway.gateway_parameters)
    jvm = gateway.jvm

    spatial_bounds_present = left is not None and right is not None and top is not None and bottom is not None

    if spatial_bounds_present:
        extent = jvm.geotrellis.vector.Extent(float(left), float(bottom), float(right), float(top))
    elif ConfigParams().require_bounds:
        raise ProcessGraphComplexityException
    else:
        srs = "EPSG:4326"
        extent = jvm.geotrellis.vector.Extent(-180.0, -90.0, 180.0, 90.0)

    def accumulo_pyramid():
        pyramidFactory = jvm.org.openeo.geotrellisaccumulo.PyramidFactory(
            "hdp-accumulo-instance", ','.join(ConfigParams().zookeepernodes))
        if layer_source_info.get("split", False):
            pyramidFactory.setSplitRanges(True)

        accumulo_layer_name = layer_source_info['data_id']
        nonlocal still_needs_band_filter
        still_needs_band_filter = bool(band_indices)
        return pyramidFactory.pyramid_seq(accumulo_layer_name, extent, srs, from_date, to_date)

    def s3_pyramid():
        endpoint = layer_source_info['endpoint']
        region = layer_source_info['region']
        bucket_name = layer_source_info['bucket_name']
        nonlocal still_needs_band_filter
        still_needs_band_filter = bool(band_indices)
        return jvm.org.openeo.geotrelliss3.PyramidFactory(endpoint, region, bucket_name) \
            .pyramid_seq(extent, srs, from_date, to_date)

    def s3_jp2_pyramid():
        endpoint = layer_source_info['endpoint']
        region = layer_source_info['region']
        return jvm.org.openeo.geotrelliss3.Jp2PyramidFactory(endpoint, region) \
            .pyramid_seq(extent, srs, from_date, to_date, band_indices)

    def file_s2_radiometry_pyramid():
        return jvm.org.openeo.geotrellis.file.Sentinel2RadiometryPyramidFactory() \
            .pyramid_seq(extent, srs, from_date, to_date, band_indices)

    def file_s2_pyramid():
        oscars_collection_id = layer_source_info['oscars_collection_id']
        oscars_link_titles = metadata.band_names
        root_path = layer_source_info['root_path']

        filtered_link_titles = [oscars_link_titles[i] for i in band_indices] if band_indices else oscars_link_titles

        return jvm.org.openeo.geotrellis.file.Sentinel2PyramidFactory(
            oscars_collection_id, filtered_link_titles, root_path
        ).pyramid_seq(extent, srs, from_date, to_date)

    def geotiff_pyramid():
        glob_pattern = layer_source_info['glob_pattern']
        date_regex = layer_source_info['date_regex']

        new_pyramid_factory = jvm.org.openeo.geotrellis.geotiff.PyramidFactory.from_disk(glob_pattern, date_regex)

        return self._geotiff_pyramid_factories.setdefault(collection_id, new_pyramid_factory) \
            .pyramid_seq(extent, srs, from_date, to_date)

    def file_s1_coherence_pyramid():
        return jvm.org.openeo.geotrellis.file.Sentinel1CoherencePyramidFactory() \
            .pyramid_seq(extent, srs, from_date, to_date, band_indices)

    def sentinel_hub_s1_pyramid():
        return jvm.org.openeo.geotrellissentinelhub.S1PyramidFactory(layer_source_info.get('uuid')) \
            .pyramid_seq(extent, srs, from_date, to_date, band_indices)

    def sentinel_hub_s2_l1c_pyramid():
        return jvm.org.openeo.geotrellissentinelhub.S2L1CPyramidFactory(layer_source_info.get('uuid')) \
            .pyramid_seq(extent, srs, from_date, to_date, band_indices)

    def sentinel_hub_s2_l2a_pyramid():
        return jvm.org.openeo.geotrellissentinelhub.S2L2APyramidFactory(layer_source_info.get('uuid')) \
            .pyramid_seq(extent, srs, from_date, to_date, band_indices)

    def sentinel_hub_l8_pyramid():
        return jvm.org.openeo.geotrellissentinelhub.L8PyramidFactory(layer_source_info.get('uuid')) \
            .pyramid_seq(extent, srs, from_date, to_date, band_indices)

    logger.info("loading pyramid {s}".format(s=layer_source_type))
    if layer_source_type == 's3':
        pyramid = s3_pyramid()
    elif layer_source_type == 's3-jp2':
        pyramid = s3_jp2_pyramid()
    elif layer_source_type == 'file-s2-radiometry':
        pyramid = file_s2_radiometry_pyramid()
    elif layer_source_type == 'file-s2':
        pyramid = file_s2_pyramid()
    elif layer_source_type == 'geotiff':
        pyramid = geotiff_pyramid()
    elif layer_source_type == 'file-s1-coherence':
        pyramid = file_s1_coherence_pyramid()
    elif layer_source_type == 'sentinel-hub-s1':
        pyramid = sentinel_hub_s1_pyramid()
    elif layer_source_type == 'sentinel-hub-s2-l1c':
        pyramid = sentinel_hub_s2_l1c_pyramid()
    elif layer_source_type == 'sentinel-hub-s2-l2a':
        pyramid = sentinel_hub_s2_l2a_pyramid()
    elif layer_source_type == 'sentinel-hub-l8':
        pyramid = sentinel_hub_l8_pyramid()
    else:
        pyramid = accumulo_pyramid()

    temporal_tiled_raster_layer = jvm.geopyspark.geotrellis.TemporalTiledRasterLayer
    option = jvm.scala.Option
    levels = {
        pyramid.apply(index)._1(): TiledRasterLayer(
            LayerType.SPACETIME,
            temporal_tiled_raster_layer(option.apply(pyramid.apply(index)._1()), pyramid.apply(index)._2())
        )
        for index in range(0, pyramid.size())
    }

    image_collection = GeotrellisTimeSeriesImageCollection(
        pyramid=gps.Pyramid(levels),
        service_registry=self._service_registry,
        metadata=metadata
    )

    if still_needs_band_filter:
        # TODO: avoid this `still_needs_band_filter` ugliness.
        #       Also see https://github.com/Open-EO/openeo-geopyspark-driver/issues/29
        image_collection = image_collection.band_filter(band_indices)

    return image_collection
def main():
    import argparse

    logging.basicConfig(level=logging.INFO)
    openeogeotrellis.backend.logger.setLevel(logging.DEBUG)

    handler = logging.StreamHandler(stream=sys.stdout)
    handler.formatter = JsonFormatter("%(asctime)s %(name)s %(levelname)s %(message)s",
                                      datefmt="%Y-%m-%dT%H:%M:%S%z")

    root_logger = logging.getLogger()
    root_logger.addHandler(handler)

    _log.info("argv: {a!r}".format(a=sys.argv))
    _log.info("ConfigParams(): {c}".format(c=ConfigParams()))

    # FIXME: there's no Java output because Py4J redirects the JVM's stdout/stderr to /dev/null unless JavaGateway's
    #  redirect_stdout/redirect_stderr are set (EP-4018)

    try:
        parser = argparse.ArgumentParser(usage="OpenEO AsyncTask --task <task>",
                                         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        parser.add_argument("--py4j-jarpath", default="venv/share/py4j/py4j0.10.7.jar",
                            help='Path to the Py4J jar')
        parser.add_argument("--py4j-classpath", default="geotrellis-extensions-2.2.0-SNAPSHOT.jar",
                            help='Classpath used to launch the Java Gateway')
        parser.add_argument("--principal", default="*****@*****.**",
                            help="Principal to be used to login to KDC")
        parser.add_argument("--keytab", default="openeo-deploy/mep/openeo.keytab",
                            help="The full path to the file that contains the keytab for the principal")
        parser.add_argument("--task", required=True, dest="task_json", help="The task description in JSON")

        args = parser.parse_args()

        task = json.loads(args.task_json)
        task_id = task['task_id']
        if task_id not in [TASK_DELETE_BATCH_PROCESS_RESULTS,
                           TASK_POLL_SENTINELHUB_BATCH_PROCESSES,
                           TASK_DELETE_BATCH_PROCESS_DEPENDENCY_SOURCES]:
            raise ValueError(f'unsupported task_id "{task_id}"')

        arguments: dict = task.get('arguments', {})

        def batch_jobs() -> GpsBatchJobs:
            java_opts = [
                "-client",
                "-Dsoftware.amazon.awssdk.http.service.impl=software.amazon.awssdk.http.urlconnection.UrlConnectionSdkHttpService"
            ]

            java_gateway = JavaGateway.launch_gateway(jarpath=args.py4j_jarpath,
                                                      classpath=args.py4j_classpath,
                                                      javaopts=java_opts,
                                                      die_on_exit=True)

            return GpsBatchJobs(get_layer_catalog(opensearch_enrich=True), java_gateway.jvm,
                                args.principal, args.keytab)

        if task_id in [TASK_DELETE_BATCH_PROCESS_RESULTS, TASK_DELETE_BATCH_PROCESS_DEPENDENCY_SOURCES]:
            batch_job_id = arguments['batch_job_id']
            dependency_sources = (arguments.get('dependency_sources')
                                  or [f"s3://{sentinel_hub.OG_BATCH_RESULTS_BUCKET}/{subfolder}"
                                      for subfolder in arguments['subfolders']])

            _log.info(f"removing dependency sources {dependency_sources} for batch job {batch_job_id}...",
                      extra={'job_id': batch_job_id})
            batch_jobs().delete_batch_process_dependency_sources(job_id=batch_job_id,
                                                                 dependency_sources=dependency_sources,
                                                                 propagate_errors=True)
        elif task_id == TASK_POLL_SENTINELHUB_BATCH_PROCESSES:
            batch_job_id = arguments['batch_job_id']
            user_id = arguments['user_id']

            while True:
                time.sleep(SENTINEL_HUB_BATCH_PROCESSES_POLL_INTERVAL_S)

                with JobRegistry() as registry:
                    job_info = registry.get_job(batch_job_id, user_id)

                if job_info.get('dependency_status') not in ['awaiting', "awaiting_retry"]:
                    break
                else:
                    try:
                        batch_jobs().poll_sentinelhub_batch_processes(job_info)
                    except Exception:
                        # TODO: retry in Nifi? How to mark this job as 'error' then?
                        _log.error("failed to handle polling batch processes for batch job {j}:\n{e}"
                                   .format(j=batch_job_id, e=traceback.format_exc()),
                                   extra={'job_id': batch_job_id})

                        with JobRegistry() as registry:
                            registry.set_status(batch_job_id, user_id, 'error')
                            registry.mark_done(batch_job_id, user_id)

                        raise
        else:
            raise AssertionError(f'unexpected task_id "{task_id}"')
    except Exception as e:
        _log.error(e, exc_info=True)
        raise e
def main(argv: List[str]) -> None:
    logger.info("argv: {a!r}".format(a=argv))
    logger.info("pid {p}; ppid {pp}; cwd {c}".format(p=os.getpid(), pp=os.getppid(), c=os.getcwd()))

    if len(argv) < 6:
        print("usage: %s "
              "<job specification input file> <job directory> <results output file name> <user log file name> "
              "<metadata file name> [api version] [dependencies] [user id]" % argv[0],
              file=sys.stderr)
        exit(1)

    job_specification_file = argv[1]
    job_dir = Path(argv[2])
    output_file = job_dir / argv[3]
    log_file = job_dir / argv[4]
    metadata_file = job_dir / argv[5]
    api_version = argv[6] if len(argv) >= 7 else None
    dependencies = _deserialize_dependencies(argv[7]) if len(argv) >= 8 else {}
    user_id = argv[8] if len(argv) >= 9 else None

    _create_job_dir(job_dir)

    _setup_user_logging(log_file)

    # Override default temp dir (under CWD). Original default temp dir `/tmp` might be cleaned up unexpectedly.
    temp_dir = Path(os.getcwd()) / "tmp"
    temp_dir.mkdir(parents=True, exist_ok=True)
    logger.info("Using temp dir {t}".format(t=temp_dir))
    os.environ["TMPDIR"] = str(temp_dir)

    try:
        if ConfigParams().is_kube_deploy:
            from openeogeotrellis.utils import s3_client

            bucket = os.environ.get('SWIFT_BUCKET')
            s3_instance = s3_client()

            s3_instance.download_file(bucket, job_specification_file.strip("/"), job_specification_file)

        job_specification = _parse(job_specification_file)
        load_custom_processes()

        conf = (SparkConf()
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                .set(key='spark.kryo.registrator', value='geopyspark.geotools.kryo.ExpandedKryoRegistrator')
                .set("spark.kryo.classesToRegister",
                     "org.openeo.geotrellisaccumulo.SerializableConfiguration,"
                     "ar.com.hjg.pngj.ImageInfo,ar.com.hjg.pngj.ImageLineInt,"
                     "geotrellis.raster.RasterRegion$GridBoundsRasterRegion"))

        with SparkContext(conf=conf) as sc:
            principal = sc.getConf().get("spark.yarn.principal")
            key_tab = sc.getConf().get("spark.yarn.keytab")

            kerberos(principal, key_tab)

            def run_driver():
                run_job(job_specification=job_specification, output_file=output_file, metadata_file=metadata_file,
                        api_version=api_version, job_dir=job_dir, dependencies=dependencies, user_id=user_id)

            if sc.getConf().get('spark.python.profile', 'false').lower() == 'true':
                # Including the driver in the profiling: a bit hacky, but the Spark profiler API
                # does not allow passing args & kwargs.
                driver_profile = BasicProfiler(sc)
                driver_profile.profile(run_driver)
                # Running the driver code and adding the driver's profiling results as "RDD==-1".
                sc.profiler_collector.add_profiler(-1, driver_profile)
                # Collect profiles into a zip file.
                profile_dumps_dir = job_dir / 'profile_dumps'
                sc.dump_profiles(profile_dumps_dir)

                profile_zip = shutil.make_archive(base_name=str(profile_dumps_dir), format='gztar',
                                                  root_dir=profile_dumps_dir)
                _add_permissions(Path(profile_zip), stat.S_IWGRP)

                shutil.rmtree(profile_dumps_dir,
                              onerror=lambda func, path, exc_info:
                              logger.warning(f"could not recursively delete {profile_dumps_dir}: {func} {path} failed",
                                             exc_info=exc_info))

                logger.info("Saved profiling info to: " + profile_zip)
            else:
                run_driver()
    except Exception as e:
        logger.exception("error processing batch job")
        user_facing_logger.exception("error processing batch job")
        if "Container killed on request. Exit code is 143" in str(e):
            user_facing_logger.error("Your batch job failed because workers used too much Python memory. "
                                     "The same task was attempted multiple times. Consider increasing "
                                     "executor-memoryOverhead or contact the developers to investigate.")
        raise e