async def s3_head_object(url, s3, **kw):
    """Run head_object return Result or Error

    (Result, None) -- on success
    (None, error) -- on failure
    """
    from botocore.exceptions import ClientError, BotoCoreError

    def unpack(url, rr):
        return SimpleNamespace(
            url=url,
            size=rr.get("ContentLength", 0),
            etag=rr.get("ETag", ""),
            last_modified=rr.get("LastModified"),
            expiration=rr.get("Expiration"),
        )

    bucket, key = s3_url_parse(url)

    try:
        rr = await s3.head_object(Bucket=bucket, Key=key, **kw)
    except (ClientError, BotoCoreError) as e:
        return (None, e)

    return (unpack(url, rr), None)

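
# Usage sketch for s3_head_object (not part of the module above): the bucket and
# key are placeholders, and the client is assumed to come from aiobotocore's
# get_session()/create_client() pattern rather than any wrapper this module may
# provide. Call the coroutine from a running event loop, e.g. asyncio.run().
async def _example_head_object():
    from aiobotocore.session import get_session

    session = get_session()
    async with session.create_client("s3") as s3:
        info, err = await s3_head_object("s3://example-bucket/path/file.tif", s3)
        if err is not None:
            print("head_object failed:", err)
        else:
            print(info.url, info.size, info.etag, info.last_modified)
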
async def _s3_find_via_cbk(url, cbk, s3, pred=None, glob=None):
    """
    List all objects under certain path

    each s3 object is represented by a SimpleNamespace with attributes:
    - url
    - size
    - last_modified
    - etag
    """
    pred = norm_predicate(pred=pred, glob=glob)

    bucket, prefix = s3_url_parse(url)

    if len(prefix) > 0 and not prefix.endswith('/'):
        prefix = prefix + '/'

    pp = s3.get_paginator('list_objects_v2')

    n_total, n = 0, 0

    async for o in pp.paginate(Bucket=bucket, Prefix=prefix):
        for f in o.get('Contents', []):
            n_total += 1
            f = s3_file_info(f, bucket)
            if pred is None or pred(f):
                n += 1
                await cbk(f)

    return n_total, n

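
# Illustrative driver for _s3_find_via_cbk (names are made up; `s3` is assumed
# to be an async S3 client such as one from aiobotocore): collect every *.tif
# object under a prefix by passing an async callback that appends to a list.
async def _example_find_via_cbk(s3):
    found = []

    async def collect(f):
        found.append(f)

    n_total, n_matched = await _s3_find_via_cbk(
        "s3://example-bucket/some/prefix/", collect, s3=s3, glob="*.tif"
    )
    print(f"Scanned {n_total} objects, kept {n_matched}")
    return found
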
async def s3_dir(url, s3, pred=None, glob=None):
    """
    List s3 "directory" without descending into sub directories.

    pred: predicate for file objects file_info -> True|False
    glob: glob pattern for files only

    Returns: (dirs, files)

    where
      dirs  -- list of subdirectories in `s3://bucket/path/` format
      files -- list of objects with attributes: url, size, last_modified, etag
    """
    bucket, prefix = s3_url_parse(url)
    pred = norm_predicate(pred=pred, glob=glob)

    if len(prefix) > 0 and not prefix.endswith('/'):
        prefix = prefix + '/'

    pp = s3.get_paginator('list_objects_v2')

    _dirs = []
    _files = []

    async for o in pp.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/'):
        for d in o.get('CommonPrefixes', []):
            d = d.get('Prefix')
            _dirs.append('s3://{}/{}'.format(bucket, d))
        for f in o.get('Contents', []):
            f = s3_file_info(f, bucket)
            if pred is None or pred(f):
                _files.append(f)

    return _dirs, _files

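
# Minimal sketch of one level of listing with s3_dir (placeholder URL; `s3` is
# assumed to be an async client): sub-directories and matching files come back
# separately, and nothing below the first level is visited.
async def _example_list_one_level(s3):
    dirs, files = await s3_dir("s3://example-bucket/some/prefix/", s3, glob="*.yaml")
    for d in dirs:
        print("dir :", d)
    for f in files:
        print("file:", f.url, f.size)
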
async def s3_dir_dir(url, depth, dst_q, s3):
    """
    Find directories certain depth from the base, push them to the `dst_q`

    ```
    s3://bucket/a
      |- b1
         |- c1/...
         |- c2/...
         |- some_file.txt
      |- b2
         |- c3/...
    ```

    Given a bucket structure above, calling this function with

    - url s3://bucket/a/
    - depth=1 will produce
      - s3://bucket/a/b1/
      - s3://bucket/a/b2/
    - depth=2 will produce
      - s3://bucket/a/b1/c1/
      - s3://bucket/a/b1/c2/
      - s3://bucket/a/b2/c3/

    Any files are ignored.
    """
    if not url.endswith('/'):
        url = url + '/'

    pp = s3.get_paginator('list_objects_v2')

    async def step(bucket, prefix, depth, work_q, dst_q):
        async for o in pp.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/'):
            for d in o.get('CommonPrefixes', []):
                d = d.get('Prefix')
                if depth > 1:
                    await work_q.put((d, depth - 1))
                else:
                    d = 's3://{}/{}'.format(bucket, d)
                    await dst_q.put(d)

    bucket, prefix = s3_url_parse(url)

    work_q = asyncio.LifoQueue()
    work_q.put_nowait((prefix, depth))

    while work_q.qsize() > 0:
        _dir, depth = work_q.get_nowait()
        await step(bucket, _dir, depth, work_q, dst_q)

async def s3_fetch_object(url, s3, range=None):
    """
    returns object with

    On success:
      .url = url
      .data = bytes
      .last_modified -- last modified timestamp
      .range = None | (in, out)
      .error = None

    On failure:
      .url = url
      .data = None
      .last_modified = None
      .range = None | (in, out)
      .error = str | botocore.Exception class
    """
    from botocore.exceptions import ClientError, BotoCoreError

    def result(data=None, last_modified=None, error=None):
        return SimpleNamespace(url=url, data=data, error=error,
                               last_modified=last_modified, range=range)

    bucket, key = s3_url_parse(url)
    extra_args = {}

    if range is not None:
        try:
            extra_args['Range'] = s3_fmt_range(range)
        except Exception:
            return result(error='Bad range passed in: ' + str(range))

    try:
        obj = await s3.get_object(Bucket=bucket, Key=key, **extra_args)
        stream = obj.get('Body', None)
        if stream is None:
            return result(error='Missing Body in response')
        async with stream:
            data = await stream.read()
    except (ClientError, BotoCoreError) as e:
        return result(error=e)
    except Exception as e:
        return result(error="Some Error: " + str(e))

    last_modified = obj.get('LastModified', None)
    return result(data=data, last_modified=last_modified)

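
# Sketch of a ranged fetch with s3_fetch_object (placeholder URL; `s3` is an
# assumed async client). Here range=(0, 1024) is taken to mean the first 1024
# bytes; note that failures are reported on the returned object, not raised.
async def _example_fetch_first_kb(s3):
    rr = await s3_fetch_object("s3://example-bucket/some/file.bin", s3, range=(0, 1024))
    if rr.error is not None:
        print("fetch failed:", rr.error)
    else:
        print(f"got {len(rr.data)} bytes, last modified {rr.last_modified}")
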
def dump_to_s3(self, url, creds=None, **kw):
    import boto3
    from boto3.s3.transfer import TransferConfig
    from odc.aws import s3_url_parse

    assert self._mem is not None

    GB = 1 << 30
    transfer_config = TransferConfig(multipart_threshold=5 * GB)

    bucket, key = s3_url_parse(url)
    creds_opts = ({} if creds is None else dict(
        aws_access_key_id=creds.access_key,
        aws_secret_access_key=creds.secret_key,
        aws_session_token=creds.token,
    ))
    s3 = boto3.client("s3", **creds_opts)

    return s3.upload_fileobj(self._mem, bucket, key,
                             ExtraArgs=kw,
                             Config=transfer_config)

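
# Hedged usage sketch for dump_to_s3: `writer` is a hypothetical object whose
# self._mem holds a seekable file-like buffer and which exposes the method above.
# Any extra keyword arguments are forwarded to boto3's upload_fileobj as
# ExtraArgs, e.g. to set the object ACL or content type.
def _example_dump_to_s3(writer):
    writer.dump_to_s3(
        "s3://example-bucket/output/cog.tif",
        ACL="bucket-owner-full-control",
        ContentType="image/tiff",
    )
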
async def s3_dir_dir(url, depth, dst_q, s3, pred=None):
    """Find directories certain depth from the base, push them to the `dst_q`

    ```
    s3://bucket/a
      |- b1
         |- c1/...
         |- c2/...
         |- some_file.txt
      |- b2
         |- c3/...
    ```

    Given a bucket structure above, calling this function with

    - url s3://bucket/a/
    - depth=1 will produce
      - s3://bucket/a/b1/
      - s3://bucket/a/b2/
    - depth=2 will produce
      - s3://bucket/a/b1/c1/
      - s3://bucket/a/b1/c2/
      - s3://bucket/a/b2/c3/

    Any files are ignored.

    If `pred` is supplied it is expected to be a `str -> bool` mapping, on
    input full path of the sub-directory is given (e.g `a/b1/`) starting from
    root, but not including bucket name. Sub-directory is only traversed
    further if predicate returns True.
    """
    if not url.endswith('/'):
        url = url + '/'

    if depth == 0:
        await dst_q.put(url)
        return

    pp = s3.get_paginator('list_objects_v2')

    async def step(bucket, prefix, depth, work_q, dst_q):
        async for o in pp.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/'):
            for d in o.get('CommonPrefixes', []):
                d = d.get('Prefix')
                if pred is not None and not pred(d):
                    continue
                if depth > 1:
                    await work_q.put((d, depth - 1))
                else:
                    d = 's3://{}/{}'.format(bucket, d)
                    await dst_q.put(d)

    bucket, prefix = s3_url_parse(url)

    work_q = asyncio.LifoQueue()
    work_q.put_nowait((prefix, depth))

    while work_q.qsize() > 0:
        _dir, depth = work_q.get_nowait()
        await step(bucket, _dir, depth, work_q, dst_q)

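
# Sketch of driving s3_dir_dir with a predicate (bucket/prefix names are made
# up; `s3` is assumed to be an async S3 client). The predicate receives key
# prefixes such as "some/prefix/b1/" and here skips any directory named "*_old";
# discovered directories are drained from an asyncio.Queue afterwards.
async def _example_dir_dir(s3):
    dst_q = asyncio.Queue()
    await s3_dir_dir(
        "s3://example-bucket/some/prefix/",
        depth=2,
        dst_q=dst_q,
        s3=s3,
        pred=lambda p: not p.rstrip("/").endswith("_old"),
    )
    found = []
    while not dst_q.empty():
        found.append(dst_q.get_nowait())
    return found
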
def s3_find_glob(glob_pattern: str,
                 skip_check: bool = False,
                 s3: Optional[S3Fetcher] = None,
                 **kw) -> Iterator[Any]:
    """
    Build generator from supplied S3 URI glob pattern

    Arguments:
        glob_pattern {str} -- Glob pattern to filter S3 Keys by
        skip_check {bool} -- Skip validity check for S3 Key

    Raises:
        ve: ValueError if the glob pattern cannot be parsed
    """
    if s3 is None:
        s3 = S3Fetcher()

    def do_file_query(qq, pred, dirs_pred=None):
        for d in s3.dir_dir(qq.base, qq.depth, pred=dirs_pred, **kw):
            _, _files = s3.list_dir(d, **kw).result()
            for f in _files:
                if pred(f):
                    yield f

    def do_file_query2(qq, dirs_pred=None):
        fname = qq.file

        stream = s3.dir_dir(qq.base, qq.depth, pred=dirs_pred, **kw)

        if skip_check:
            yield from (SimpleNamespace(url=d + fname) for d in stream)
            return

        stream = (s3.head_object(d + fname, **kw) for d in stream)

        for (f, _), _ in future_results(stream, 32):
            if f is not None:
                yield f

    def do_dir_query(qq, dirs_pred=None):
        return (SimpleNamespace(url=url)
                for url in s3.dir_dir(qq.base, qq.depth, pred=dirs_pred, **kw))

    try:
        qq = parse_query(glob_pattern)
    except ValueError as ve:
        logging.error(f"URI glob-pattern not understood : {ve}")
        raise ve

    glob_or_file = qq.glob or qq.file

    if qq.depth is None and glob_or_file is None:
        stream = s3.find(qq.base, **kw)
    elif qq.depth is None or qq.depth < 0:
        if qq.glob:
            stream = s3.find(qq.base, glob=qq.glob, **kw)
        elif qq.file:
            postfix = "/" + qq.file
            stream = s3.find(qq.base, pred=lambda o: o.url.endswith(postfix), **kw)
    else:
        # fixed depth query
        _, prefix = s3_url_parse(glob_pattern)
        dirs_glob = prefix.split("/")[:-1]

        def dirs_pred(f):
            n = f.count("/")
            _glob = "/".join(dirs_glob[:n]) + "/"
            return fnmatch(f, _glob)

        if qq.glob is not None:
            pred = norm_predicate(glob=qq.glob)
            stream = do_file_query(qq, pred, dirs_pred=dirs_pred)
        elif qq.file is not None:
            stream = do_file_query2(qq, dirs_pred=dirs_pred)
        else:
            stream = do_dir_query(qq, dirs_pred=dirs_pred)

    return stream

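
# Minimal sketch of calling s3_find_glob (the URI pattern is a placeholder):
# the wildcard components in the pattern set the traversal depth and file glob,
# and the generator yields objects that expose at least a .url attribute.
def _example_find_glob():
    for f in s3_find_glob("s3://example-bucket/some/prefix/*/*.yaml"):
        print(f.url)
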
def execute_task(self, task: AlchemistTask, dryrun: bool = False, sns_arn: str = None):
    log = _LOG.bind(task=task.dataset.id)
    log.info("Task commencing", task=task)

    # Make sure our task makes sense and store it
    if task.settings.specification.transform != self.transform_name:
        raise ValueError("Task transform is different to the Alchemist transform")
    transform = self._transform_with_args(task)

    # Ensure output path exists, this should be fine for file or s3 paths
    s3_destination = None
    try:
        s3_bucket, s3_path = s3_url_parse(task.settings.output.location)
        s3_destination = True
    except ValueError:
        fs_destination = Path(task.settings.output.location)

    # Load and process data in a decimated array
    if dryrun:
        res_by_ten = self._native_resolution(task) * 10
        data = self.dc.load(
            product=task.dataset.type.name,
            id=task.dataset.id,
            measurements=task.settings.specification.measurements,
            output_crs=task.dataset.crs,
            resolution=(-1 * res_by_ten, res_by_ten),
        )
    else:
        data = native_load(
            task.dataset,
            measurements=task.settings.specification.measurements,
            dask_chunks=task.settings.processing.dask_chunks,
            basis=task.settings.specification.basis,
        )
    data = data.rename(task.settings.specification.measurement_renames)

    log.info("Data loaded")

    output_data = transform.compute(data)
    if "time" in output_data.dims:
        output_data = output_data.squeeze("time")

    log.info("Prepared lazy transformation", output_data=output_data)

    output_data = output_data.compute()
    crs = data.attrs["crs"]

    del data
    log.info("Loaded and transformed")

    # Because "/env/lib/python3.6/site-packages/eodatasets3/images.py", line 489, in write_from_ndarray
    #   raise TypeError("Datatype not supported: {dt}".format(dt=dtype))
    # TODO: investigate if this is ok
    dtypes = set(str(v.dtype) for v in output_data.data_vars.values())
    if "int8" in dtypes:
        log.info("Found dtype=int8 in output data, converting to uint8 for geotiffs")
        output_data = output_data.astype("uint8", copy=False)

    if "crs" not in output_data.attrs:
        output_data.attrs["crs"] = crs

    uuid, _ = self._deterministic_uuid(task)

    temp_metadata_path = Path(tempfile.gettempdir()) / f"{task.dataset.id}.yaml"

    with DatasetAssembler(
        metadata_path=temp_metadata_path,
        naming_conventions=self.naming_convention,
        dataset_id=uuid,
    ) as dataset_assembler:
        if task.settings.output.reference_source_dataset:
            source_doc = _munge_dataset_to_eo3(task.dataset)
            dataset_assembler.add_source_dataset(
                source_doc,
                auto_inherit_properties=True,
                inherit_geometry=task.settings.output.inherit_geometry,
                classifier=task.settings.specification.override_product_family,
            )

        # Copy in metadata and properties
        for k, v in task.settings.output.metadata.items():
            setattr(dataset_assembler, k, v)
        for k, v in task.settings.output.properties.items():
            dataset_assembler.properties[k] = v

        # Update the GSD
        dataset_assembler.properties["eo:gsd"] = self._native_resolution(task)

        dataset_assembler.processed = datetime.utcnow()

        dataset_assembler.note_software_version(
            "datacube-alchemist",
            "https://github.com/opendatacube/datacube-alchemist",
            __version__,
        )

        # Software Version of Transformer
        version_url = self._get_transform_info()
        dataset_assembler.note_software_version(
            name=task.settings.specification.transform,
            url=version_url["url"],
            version=version_url["version"],
        )

        # Write it all to a tempdir root, and then either shift or s3 sync it into place
        with tempfile.TemporaryDirectory() as temp_dir:
            # Set up a temporary directory
            dataset_assembler.collection_location = Path(temp_dir)
            # Dodgy hack!
            dataset_assembler._metadata_path = None

            # Write out the data
            dataset_assembler.write_measurements_odc_xarray(
                output_data,
                nodata=task.settings.output.nodata,
                **task.settings.output.write_data_settings,
            )
            log.info("Finished writing measurements")

            # Write out the thumbnail
            _write_thumbnail(task, dataset_assembler)
            log.info("Wrote thumbnail")

            # Do all the deferred work from above
            dataset_id, metadata_path = dataset_assembler.done()
            log.info("Assembled dataset", metadata_path=metadata_path)

            # Write STAC, because it depends on this being .done()
            # Conveniently, this also checks that files are there!
            stac = None
            if task.settings.output.write_stac:
                stac = _write_stac(metadata_path, task, dataset_assembler)
                log.info("STAC file written")

            relative_path = dataset_assembler._dataset_location.relative_to(temp_dir)
            if s3_destination:
                s3_location = (
                    f"s3://{s3_bucket}/{s3_path.rstrip('/')}/{relative_path}"
                )
                s3_command = [
                    "aws",
                    "s3",
                    "sync",
                    "--only-show-errors",
                    "--acl bucket-owner-full-control",
                    str(dataset_assembler._dataset_location),
                    s3_location,
                ]

                if not dryrun:
                    log.info(f"Syncing files to {s3_location}")
                else:
                    s3_command.append("--dryrun")
                    log.warning("PRETENDING to sync files to S3", s3_location=s3_location)

                log.info("Writing files to s3", location=s3_location)
                # log.debug("S3 command: ", command=s3_command)
                subprocess.run(" ".join(s3_command), shell=True, check=True)
            else:
                dest_directory = fs_destination / relative_path
                if not dryrun:
                    log.info("Writing files to disk", location=dest_directory)
                    if dest_directory.exists():
                        shutil.rmtree(dest_directory)
                    shutil.copytree(dataset_assembler._dataset_location, dest_directory)
                else:
                    log.warning(f"NOT moving data from {temp_dir} to {dest_directory}")

            log.info("Task complete")
            if stac is not None and sns_arn:
                if not dryrun:
                    _stac_to_sns(sns_arn, stac)
            elif sns_arn:
                _LOG.error("Not posting to SNS because there's no STAC to post")

    return dataset_id, metadata_path