def from_event(cls, event: Dict, **kwargs) -> ProcessPayload:
    """Parse a Cirrus event and return a ProcessPayload instance

    Args:
        event (Dict): An event from SNS, SQS, or containing an s3 URL to payload

    Returns:
        ProcessPayload: A ProcessPayload instance
    """
    if 'Records' in event:
        records = [json.loads(r['body']) for r in event['Records']]
        # there should be only one
        assert len(records) == 1
        if 'Message' in records[0]:
            # SNS
            payload = json.loads(records[0]['Message'])
        else:
            # SQS
            payload = records[0]
    elif 'url' in event:
        payload = s3().read_json(event['url'])
    elif 'Parameters' in event and 'url' in event['Parameters']:
        # this is Batch, get the output payload
        url = event['Parameters']['url'].replace('.json', '_out.json')
        payload = s3().read_json(url)
    else:
        payload = event
    return cls(payload, **kwargs)
def submit_batch_job(payload, arn, queue='basic-ondemand',
                     definition='geolambda-as-batch', name=None):
    # envvars
    STACK_PREFIX = getenv('CIRRUS_STACK')
    CATALOG_BUCKET = getenv('CIRRUS_CATALOG_BUCKET')

    if name is None:
        name = arn.split(':')[-1]

    # upload payload to s3
    url = f"s3://{CATALOG_BUCKET}/batch/{uuid.uuid1()}.json"
    s3().upload_json(payload, url)

    kwargs = {
        'jobName': name,
        'jobQueue': f"{STACK_PREFIX}-{queue}",
        'jobDefinition': f"{STACK_PREFIX}-{definition}",
        'parameters': {
            'lambda_function': arn,
            'url': url
        },
        'containerOverrides': {
            'vcpus': 1,
            'memory': 512,
        }
    }
    logger.debug(f"Submitting batch job with payload {url}")
    response = batch_client.submit_job(**kwargs)
    logger.debug(f"Batch response: {response}")
def process(self) -> str:
    """Add this Catalog to Cirrus and start workflow

    Returns:
        str: Catalog ID
    """
    assert CATALOG_BUCKET

    # start workflow
    try:
        # add input catalog to s3
        url = f"s3://{CATALOG_BUCKET}/{self['id']}/input.json"
        s3().upload_json(self, url)
        logger.debug(f"Uploaded {url}")

        # invoke step function
        arn = os.getenv('BASE_WORKFLOW_ARN') + self['process']['workflow']
        logger.info(f"Running {arn} on {self['id']}")
        exe_response = stepfunctions.start_execution(
            stateMachineArn=arn, input=json.dumps(self.get_payload()))
        logger.debug(f"Start execution response: {exe_response}")

        # create DynamoDB record - this will always overwrite any existing process
        resp = statedb.add_item(self, exe_response['executionArn'])
        logger.debug(f"Add state item response: {resp}")

        return self['id']
    except Exception as err:
        msg = f"process: failed starting {self['id']} ({err})"
        logger.error(msg)
        logger.error(format_exc())
        statedb.add_failed_item(self, msg)
        raise err
def from_payload(cls, payload: Dict, **kwargs) -> Catalogs:
    """Parse a Cirrus payload and return a Catalogs instance

    Args:
        payload (Dict): A payload from SNS, SQS, or containing an s3 URL to payload

    Returns:
        Catalogs: A Catalogs instance
    """
    catalogs = []
    if 'Records' in payload:
        for record in [json.loads(r['body']) for r in payload['Records']]:
            if 'Message' in record:
                # SNS
                cat = Catalog(json.loads(record['Message']))
                catalogs.append(cat)
            else:
                # SQS
                catalogs.append(Catalog(record))
    elif 'url' in payload:
        catalogs = [Catalog(s3().read_json(payload['url']))]
    elif 'Parameters' in payload and 'url' in payload['Parameters']:
        # this is Batch, get the output payload
        url = payload['Parameters']['url'].replace('.json', '_out.json')
        catalogs = [Catalog(s3().read_json(url))]
    else:
        catalogs = [Catalog(payload)]
    return cls(catalogs)
def s3stac_write(uri, txt):
    extra = {
        'ContentType': 'application/json'
    }
    if uri.startswith('s3'):
        s3().upload_json(json.loads(txt), uri, extra=extra, public=PUBLIC_CATALOG)
    else:
        STAC_IO.default_write_text_method(uri, txt)
def test_upload_download(s3mock):
    url = 's3://%s/mytestfile' % BUCKET
    s3().upload(__file__, url, public=True)
    exists = s3().exists(url)
    assert exists
    path = os.path.join(testpath, 'test_s3/test_upload_download')
    fname = s3().download(url, path)
    assert os.path.exists(fname)
    assert os.path.join(path, os.path.basename(url)) == fname
    rmtree(path)
def get_aws_archive(cls, collection, direct_from_s3=False, **kwargs):
    """Generator function returning the archive of Sentinel data on AWS

    Keyword arguments:
        prefix -- Process only file keys beginning with this prefix
        start_date -- Process this date and after
        end_date -- Process this date and earlier

    Returns:
        Iterator of STAC Items using specified Transform object
    """
    # get latest AWS inventory for this collection
    inventory_url = 's3://sentinel-inventory/%s/%s-inventory' % (collection, collection)
    inventory = s3().latest_inventory(inventory_url, **kwargs,
                                      suffix=cls.collections[collection])
    #import pdb; pdb.set_trace()

    # iterate through latest inventory
    from datetime import datetime
    for i, url in enumerate(inventory):
        if (i % 100) == 0:
            logger.info('%s records' % i)
        try:
            if direct_from_s3:
                logger.debug('Fetching initial metadata: %s' % url)
                metadata = s3().read_json(url, requester_pays=True)
            else:
                # use free endpoint to access file
                parts = s3().urlparse(url)
                _url = '%s/%s/%s' % (cls.FREE_URL, collection, parts['key'])
                logger.debug('Fetching initial metadata: %s' % _url)
                r = requests.get(_url, stream=True)
                metadata = json.loads(r.text)
            '''
            fnames = [f"{base_url}/{a}" for a in md['filenameMap'].values()
                      if 'annotation' in a and 'calibration' not in a]
            metadata = {
                'id': md['id'],
                'coordinates': md['footprint']['coordinates'],
                'filenames': fnames
            }
            '''
            # transform to STAC Item
            sentinel_scene = cls(collection, metadata)
            item = sentinel_scene.to_stac(base_url=url)
            yield item
        except Exception as err:
            logger.error('Error creating STAC Item from %s, Error: %s' % (url, err))
            continue
def get_payload(self) -> Dict:
    """Get original payload for this Catalog

    Returns:
        Dict: Cirrus Input Catalog
    """
    payload = json.dumps(self)
    if CATALOG_BUCKET and len(payload.encode('utf-8')) > 30000:
        assert CATALOG_BUCKET
        url = f"s3://{CATALOG_BUCKET}/payloads/{uuid.uuid1()}.json"
        s3().upload_json(self, url)
        return {'url': url}
    else:
        return dict(self)
def get_payload(self) -> Dict:
    """Get original payload for this ProcessPayload

    Returns:
        Dict: Cirrus Input ProcessPayload
    """
    payload = json.dumps(self)
    if PAYLOAD_BUCKET and len(payload.encode('utf-8')) > 30000:
        assert PAYLOAD_BUCKET
        url = f"s3://{PAYLOAD_BUCKET}/payloads/{uuid.uuid1()}.json"
        s3().upload_json(self, url)
        return {'url': url}
    else:
        return dict(self)
def handler(payload, context):
    catalog = Catalog.from_payload(payload)
    logger = get_task_logger(f"{__name__}.pre-batch", catalog=catalog)

    url = f"s3://{CATALOG_BUCKET}/batch/{catalog['id']}/{uuid.uuid1()}.json"
    try:
        # copy payload to s3
        s3().upload_json(catalog, url)
        logger.debug(f"Uploaded catalog to {url}")
        return {'url': url}
    except Exception as err:
        msg = f"pre-batch: failed pre-processing batch job ({err})"
        logger.error(msg, exc_info=True)
        raise Exception(msg) from err
def get_s3_session(bucket: str = None, s3url: str = None, **kwargs) -> s3:
    """Get boto3-utils s3 class for interacting with an s3 bucket. A secret will be
    looked for with the name `cirrus-creds-<bucket-name>`. If no secret is found
    the default session will be used.

    Args:
        bucket (str, optional): Bucket name to access. Defaults to None.
        s3url (str, optional): The s3 URL to access. Defaults to None.

    Returns:
        s3: A boto3-utils s3 class
    """
    if s3url:
        parts = s3.urlparse(s3url)
        bucket = parts['bucket']

    if bucket and bucket in s3_sessions:
        return s3_sessions[bucket]

    # otherwise, create new session for this bucket
    creds = deepcopy(kwargs)
    try:
        # get credentials from AWS secret
        secret_name = f"cirrus-creds-{bucket}"
        _creds = secrets.get_secret(secret_name)
        creds.update(_creds)
        logger.debug(f"Using credentials for bucket {bucket}: {json.dumps(creds)}")
    except ClientError:
        logger.debug(f"Using default credentials for bucket {bucket}")

    requester_pays = creds.pop('requester_pays', False)
    session = boto3.Session(**creds)
    s3_sessions[bucket] = s3(session, requester_pays=requester_pays)
    return s3_sessions[bucket]
def setUp(self):
    client = s3()
    client.s3.create_bucket(Bucket=testbucket)
    client.s3.put_object(Body='test', Bucket=testbucket,
                         Key=os.path.basename(__file__))
    os.makedirs(testpath, exist_ok=True)
def from_statedb(cls, collections, state, since: str = None,
                 index: str = 'input_state', limit=None) -> Catalogs:
    """Create Catalogs object from set of StateDB Items

    Args:
        collections (str): String of collections (input or output depending on `index`)
        state (str): The state (QUEUED, PROCESSING, COMPLETED, FAILED, INVALID) of StateDB Items to get
        since (str, optional): Get Items since this duration ago (e.g., 10m, 8h, 1w). Defaults to None.
        index (str, optional): 'input_state' or 'output_state'. Defaults to 'input_state'.
        limit (int, optional): Max number of Items to return. Defaults to None.

    Returns:
        Catalogs: Catalogs instance
    """
    catalogs = []
    items = statedb.get_items(collections, state, since, index, limit=limit)
    logger.debug(f"Retrieved {len(items)} total items from statedb")
    for item in items:
        cat = Catalog(s3().read_json(item['input_catalog']))
        catalogs.append(cat)
    logger.debug(f"Retrieved {len(catalogs)} input catalogs")
    return cls(catalogs, state_items=items)
def get_root(root_url):
    cat_url = f"s3://{DATA_BUCKET}/catalog.json"
    logger.debug("Root catalog: %s", cat_url)
    cat = s3().read_json(cat_url)

    links = []
    workflows = cat.get('cirrus', {}).get('workflows', {})
    for col in workflows:
        for wf in workflows[col]:
            name = f"{col} - {wf}"
            link = create_link(
                urljoin(root_url, f"{col}/workflow-{wf}"),
                name,
                'child',
            )
            links.append(link)

    links.insert(0, create_link(root_url, "home", "self"))
    links.append(create_link(cat_url, "STAC", "stac"))

    root = {
        "id": f"{cat['id']}-state-api",
        "description": f"{cat['description']} State API",
        "links": links
    }
    return root
def lambda_handler(event, context): payload = ProcessPayload.from_event(event) logger = get_task_logger("task.pre-batch", payload=payload) url = f"s3://{PAYLOAD_BUCKET}/batch/{payload['id']}/{uuid.uuid1()}.json" try: # copy payload to s3 s3().upload_json(payload, url) logger.debug(f"Uploaded payload to {url}") return {'url': url} except Exception as err: msg = f"pre-batch: failed pre processing batch job for ({err})" logger.error(msg, exc_info=True) raise Exception(msg) from err
def setUp(self):
    session = boto3.session.Session(region_name='us-east-1')
    client = s3(session)
    client.s3.create_bucket(Bucket=testbucket)
    client.s3.put_object(Body='test', Bucket=testbucket,
                         Key=os.path.basename(__file__))
    os.makedirs(testpath, exist_ok=True)
def lambda_handler(payload, context):
    logger.debug('Payload: %s' % json.dumps(payload))

    catalog = Catalogs.from_payload(payload)[0]

    url = f"s3://{CATALOG_BUCKET}/batch/{catalog['id']}/{uuid.uuid1()}.json"
    try:
        # copy payload to s3
        s3().upload_json(catalog, url)
        logger.debug(f"Uploaded {catalog['id']} to {url}")
        logger.info(f"Completed pre processing batch job for {catalog['id']}")
        return {'url': url}
    except Exception as err:
        msg = f"pre-batch: failed pre processing batch job for {catalog['id']} ({err})"
        logger.error(msg)
        logger.error(format_exc())
        raise Exception(msg) from err
def read_inventory_file(fname, keys,
                        prefix=None, suffix=None,
                        start_date=None, end_date=None,
                        datetime_regex=None,
                        datetime_key='LastModifiedDate'):
    logger.debug('Reading inventory file %s', fname)
    filename = s3().download(fname, path='/tmp')
    ext = op.splitext(fname)[-1]
    if ext == ".gz":
        records = read_csv_inventory_file(filename, keys)
    elif ext == ".orc":
        records = read_orc_inventory_file(filename, keys)

    if datetime_regex is not None:
        regex = re.compile(datetime_regex)
    else:
        regex = None

    sdate = parse(start_date).date() if start_date else None
    edate = parse(end_date).date() if end_date else None

    def get_datetime(record):
        if regex is not None:
            m = regex.match(record['key']).groupdict()
            dt = datetime(int(m['Y']), int(m['m']), int(m['d']))
        elif isinstance(record[datetime_key], datetime):
            dt = record[datetime_key]
        else:
            dt = datetime.strptime(record[datetime_key], "%Y-%m-%dT%H:%M:%S.%fZ")
        return dt.date()

    for record in records:
        if prefix is not None and not record['key'].startswith(prefix):
            continue
        if suffix is not None and not record['key'].endswith(suffix):
            continue
        if sdate is not None:
            dt = get_datetime(record)
            if dt < sdate:
                continue
        if edate is not None:
            dt = get_datetime(record)
            if dt > edate:
                continue
        # made it here without getting filtered out
        yield 's3://%s/%s' % (record['bucket'], record['key'])
def test_latest_inventory():
    url = 's3://sentinel-inventory/sentinel-s1-l1c/sentinel-s1-l1c-inventory'
    suffix = 'productInfo.json'
    session = boto3.Session()
    _s3 = s3(session)
    for url in _s3.latest_inventory(url, suffix=suffix):
        # dt = datetime.strptime(f['LastModifiedDate'], "%Y-%m-%dT%H:%M:%S.%fZ")
        # hours = (datetime.today() - dt).seconds // 3600
        # assert(hours < 24)
        assert url.endswith(suffix)
        break
def test_upload_item_assets(self):
    item = self.get_test_item()
    path_template = 's3://testbucket/${id}/test'
    assets = ['local']
    new_item = transfer.upload_item_assets(item, assets=assets,
                                           path_template=path_template,
                                           s3_urls=True,
                                           region_name='us-west-2')
    for k in assets:
        assert new_item['assets'][k]['href'].startswith('s3://')
        assert s3().exists(new_item['assets'][k]['href'])
def get_root_catalog() -> Dict:
    """Get Cirrus root catalog from s3

    Returns:
        Dict: STAC root catalog
    """
    if s3().exists(ROOT_URL):
        cat = Catalog.from_file(ROOT_URL)
    else:
        catid = DATA_BUCKET.split('-data-')[0]
        cat = Catalog(id=catid, description=DESCRIPTION)
    logger.debug(f"Fetched {cat.describe()}")
    return cat
def __call__(self) -> str:
    """Add this ProcessPayload to Cirrus and start workflow

    Returns:
        str: ProcessPayload ID
    """
    assert PAYLOAD_BUCKET
    arn = os.getenv('CIRRUS_BASE_WORKFLOW_ARN') + self.process['workflow']

    # start workflow
    try:
        # add input payload to s3
        url = f"s3://{PAYLOAD_BUCKET}/{self['id']}/input.json"
        s3().upload_json(self, url)

        # create DynamoDB record - this overwrites existing states other than PROCESSING
        resp = statedb.claim_processing(self['id'])

        # invoke step function
        self.logger.debug(f"Running Step Function {arn}")
        exe_response = stepfunctions.start_execution(
            stateMachineArn=arn, input=json.dumps(self.get_payload()))

        # add execution to DynamoDB record
        resp = statedb.set_processing(self['id'], exe_response['executionArn'])

        return self['id']
    except statedb.db.meta.client.exceptions.ConditionalCheckFailedException:
        self.logger.warning('Already in PROCESSING state')
        return None
    except Exception as err:
        msg = f"failed starting workflow ({err})"
        self.logger.exception(msg)
        statedb.set_failed(self['id'], msg)
        raise
def get_root_catalog():
    """Get Cirrus root catalog from s3

    Returns:
        Dict: STAC root catalog
    """
    caturl = f"{ROOT_URL}/catalog.json"
    if s3().exists(caturl):
        cat = Catalog.from_file(caturl)
    else:
        catid = DATA_BUCKET.split('-data-')[0]
        cat = Catalog(id=catid, description=DESCRIPTION)
        cat.normalize_and_save(ROOT_URL, CatalogType.ABSOLUTE_PUBLISHED)
    logger.debug(f"Fetched {cat.describe()}")
    return cat
def cli():
    args = parse_args(sys.argv[1:])
    cmd = args.pop('command')

    # logging
    logging.basicConfig(
        stream=sys.stdout,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=args.pop('log') * 10)
    # quiet these loud loggers
    logging.getLogger("botocore").propagate = False
    logging.getLogger("s3transfer").propagate = False
    logging.getLogger("urllib3").propagate = False

    if cmd == 'local':
        with open(args['filename']) as f:
            payload = json.loads(f.read())
        handler(payload, local=args['workdir'])
    if cmd == 'cirrus':
        # fetch input catalog
        catalog = s3().read_json(args['url'])
        catalog = handler(catalog)
        # upload return payload
        s3().upload_json(catalog, args["url"].replace('.json', '_out.json'))
def read_inventory_file(fname, keys,
                        prefix=None, suffix=None,
                        start_date=None, end_date=None,
                        datetime_regex=None,
                        datetime_key='LastModifiedDate'):
    logger.debug('Reading inventory file %s' % (fname))

    if datetime_regex is not None:
        regex = re.compile(datetime_regex)
    else:
        regex = None

    sdate = parse(start_date).date() if start_date else None
    edate = parse(end_date).date() if end_date else None

    filename = s3().download(fname, path='/tmp')

    def get_datetime(record):
        if regex is not None:
            m = regex.match(record['Key']).groupdict()
            dt = datetime(int(m['Y']), int(m['m']), int(m['d']))
        else:
            dt = datetime.strptime(record[datetime_key], "%Y-%m-%dT%H:%M:%S.%fZ")
        return dt.date()

    gz = gzip.open(filename, 'rb')
    for line in io.BufferedReader(gz):
        row = line.decode('utf-8').replace('"', '').replace('\n', '')
        record = {keys[i]: v for i, v in enumerate(row.split(','))}
        if prefix is not None and not record['Key'].startswith(prefix):
            continue
        if suffix is not None and not record['Key'].endswith(suffix):
            continue
        if sdate is not None:
            dt = get_datetime(record)
            if dt < sdate:
                continue
        if edate is not None:
            dt = get_datetime(record)
            if dt > edate:
                continue
        # made it here without getting filtered out
        yield 's3://%s/%s' % (record['Bucket'], record['Key'])
    gz.close()
def lambda_handler(payload, context):
    logger.debug('Payload: %s' % json.dumps(payload))

    # catalog URL
    url = payload['Parameters']['url'].replace('.json', '_out.json')

    try:
        # copy payload from s3
        catalog = s3().read_json(url)
        logger.info(f"Completed post processing batch job for {catalog['id']}")
        return catalog
    except Exception as err:
        msg = f"post-batch: failed post processing batch job for {url} ({err})"
        logger.error(msg)
        logger.error(format_exc())
        raise Exception(msg) from err
def from_catids(cls, catids: List[str], **kwargs) -> Catalogs:
    """Create Catalogs from list of Catalog IDs

    Args:
        catids (List[str]): List of catalog IDs

    Returns:
        Catalogs: A Catalogs instance
    """
    items = [
        statedb.dbitem_to_item(statedb.get_dbitem(catid))
        for catid in catids
    ]
    catalogs = []
    for item in items:
        cat = Catalog(s3().read_json(item['input_catalog']))
        catalogs.append(cat)
    logger.debug(f"Retrieved {len(catalogs)} from state db")
    return cls(catalogs, state_items=items)
def submit_inventory_batch_jobs(inventory_url, lambda_arn,
                                batch_size: int = 10, max_batches: int = -1):
    urls = []
    n = 0
    for url in s3().latest_inventory_files(inventory_url):
        urls.append(url)
        if (len(urls) % batch_size) == 0:
            submit_batch_job({'inventory_files': urls}, lambda_arn)
            urls = []
            n += 1
        if max_batches > 0 and n > max_batches:
            break
    if len(urls) > 0:
        submit_batch_job({'inventory_files': urls}, lambda_arn)
        n += 1
    logger.info(f"Submitted {n} jobs")
    return n
def from_payload_ids(cls, payload_ids: List[str], **kwargs) -> ProcessPayloads:
    """Create ProcessPayloads from list of Payload IDs

    Args:
        payload_ids (List[str]): List of Payload IDs

    Returns:
        ProcessPayloads: A ProcessPayloads instance
    """
    items = [
        statedb.dbitem_to_item(statedb.get_dbitem(payload_id))
        for payload_id in payload_ids
    ]
    payloads = []
    for item in items:
        payload = ProcessPayload(s3().read_json(item['payload']))
        payloads.append(payload)
    logger.debug(f"Retrieved {len(payloads)} from state db")
    return cls(payloads, state_items=items)
def get_s3_session(bucket: str = None, s3url: str = None, **kwargs) -> s3:
    """Get boto3-utils s3 class for interacting with an s3 bucket. A secret will be
    looked for with the name `cirrus-creds-<bucket-name>`. If no secret is found
    the default session will be used.

    Args:
        bucket (str, optional): Bucket name to access. Defaults to None.
        s3url (str, optional): The s3 URL to access. Defaults to None.

    Returns:
        s3: A boto3-utils s3 class
    """
    if s3url:
        parts = s3.urlparse(s3url)
        bucket = parts['bucket']

    if bucket and bucket in s3_sessions:
        return s3_sessions[bucket]

    # otherwise, create new session for this bucket
    creds = deepcopy(kwargs)
    try:
        # get credentials from AWS secret
        secret_name = f"cirrus-creds-{bucket}"
        _creds = secrets.get_secret(secret_name)
        creds.update(_creds)
    except ClientError as e:
        if e.response["Error"]["Code"] != "ResourceNotFoundException":
            # some other client error we cannot handle
            raise e
        logger.info(f"Secret not found, using default credentials: '{secret_name}'")

    requester_pays = creds.pop('requester_pays', False)
    session = boto3.Session(**creds)
    s3_sessions[bucket] = s3(session, requester_pays=requester_pays)
    return s3_sessions[bucket]