Example #1
    def from_event(cls, event: Dict, **kwargs) -> ProcessPayload:
        """Parse a Cirrus event and return a ProcessPayload instance

        Args:
            event (Dict): An event from SNS or SQS, or a dict containing an s3 URL to the payload

        Returns:
            ProcessPayload: A ProcessPayload instance
        """
        if 'Records' in event:
            records = [json.loads(r['body']) for r in event['Records']]
            # there should be only one
            assert (len(records) == 1)
            if 'Message' in records[0]:
                # SNS
                payload = json.loads(records[0]['Message'])
            else:
                # SQS
                payload = records[0]
        elif 'url' in event:
            payload = s3().read_json(event['url'])
        elif 'Parameters' in event and 'url' in event['Parameters']:
            # this is Batch, get the output payload
            url = event['Parameters']['url'].replace('.json', '_out.json')
            payload = s3().read_json(url)
        else:
            payload = event
        return cls(payload, **kwargs)
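As a quick illustration, here is a hedged sketch of calling from_event with an SQS event that wraps an SNS notification; the import path and the payload shape are assumptions for illustration, not taken from the library itself.

import json

# hypothetical import path; adjust to wherever ProcessPayload is defined in your codebase
from cirrus.lib.process_payload import ProcessPayload

# an SQS event whose single record wraps an SNS message containing the payload
event = {
    'Records': [{
        'body': json.dumps({
            'Message': json.dumps({'id': 'example-payload', 'process': {}})
        })
    }]
}

payload = ProcessPayload.from_event(event)  # takes the SNS branch above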
Example #2
def submit_batch_job(payload,
                     arn,
                     queue='basic-ondemand',
                     definition='geolambda-as-batch',
                     name=None):
    # envvars
    STACK_PREFIX = getenv('CIRRUS_STACK')
    CATALOG_BUCKET = getenv('CIRRUS_CATALOG_BUCKET')

    if name is None:
        name = arn.split(':')[-1]

    # upload payload to s3
    url = f"s3://{CATALOG_BUCKET}/batch/{uuid.uuid1()}.json"
    s3().upload_json(payload, url)
    kwargs = {
        'jobName': name,
        'jobQueue': f"{STACK_PREFIX}-{queue}",
        'jobDefinition': f"{STACK_PREFIX}-{definition}",
        'parameters': {
            'lambda_function': arn,
            'url': url
        },
        'containerOverrides': {
            'vcpus': 1,
            'memory': 512,
        }
    }
    logger.debug(f"Submitting batch job with payload {url}")
    response = batch_client.submit_job(**kwargs)
    logger.debug(f"Batch response: {response}")
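For reference, a minimal usage sketch; the import path, ARN, and environment values are placeholders, and submit_batch_job is assumed to be importable from the module shown above.

import os

# hypothetical import path for the helper above
from feeder import submit_batch_job

os.environ['CIRRUS_STACK'] = 'cirrus-dev'
os.environ['CIRRUS_CATALOG_BUCKET'] = 'cirrus-dev-catalog'

payload = {'id': 'example-item', 'features': []}
lambda_arn = 'arn:aws:lambda:us-west-2:123456789012:function:my-task'

# jobName defaults to the last ARN segment ('my-task') when name is None
submit_batch_job(payload, lambda_arn, queue='basic-ondemand')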
Example #3
    def process(self) -> str:
        """Add this Catalog to Cirrus and start workflow

        Returns:
            str: Catalog ID
        """
        assert (CATALOG_BUCKET)

        # start workflow
        try:
            # add input catalog to s3
            url = f"s3://{CATALOG_BUCKET}/{self['id']}/input.json"
            s3().upload_json(self, url)
            logger.debug(f"Uploaded {url}")

            # invoke step function
            arn = os.getenv('BASE_WORKFLOW_ARN') + self['process']['workflow']
            logger.info(f"Running {arn} on {self['id']}")
            exe_response = stepfunctions.start_execution(
                stateMachineArn=arn, input=json.dumps(self.get_payload()))
            logger.debug(f"Start execution response: {exe_response}")

            # create DynamoDB record - this will always overwrite any existing process
            resp = statedb.add_item(self, exe_response['executionArn'])
            logger.debug(f"Add state item response: {resp}")

            return self['id']
        except Exception as err:
            msg = f"process: failed starting {self['id']} ({err})"
            logger.error(msg)
            logger.error(format_exc())
            statedb.add_failed_item(self, msg)
            raise err
Example #4
    def from_payload(cls, payload: Dict, **kwargs) -> Catalogs:
        """Parse a Cirrus payload and return a Catalogs instance

        Args:
            payload (Dict): A payload from SNS or SQS, or a dict containing an s3 URL to the payload

        Returns:
            Catalogs: A Catalogs instance
        """
        catalogs = []
        if 'Records' in payload:
            for record in [json.loads(r['body']) for r in payload['Records']]:
                if 'Message' in record:
                    # SNS
                    cat = Catalog(json.loads(record['Message']))
                    catalogs.append(cat)
                else:
                    # SQS
                    catalogs.append(Catalog(record))
        elif 'url' in payload:
            catalogs = [Catalog(s3().read_json(payload['url']))]
        elif 'Parameters' in payload and 'url' in payload['Parameters']:
            # this is Batch, get the output payload
            url = payload['Parameters']['url'].replace('.json', '_out.json')
            catalogs = [Catalog(s3().read_json(url))]
        else:
            catalogs = [Catalog(payload)]
        return cls(catalogs)
Example #5
def s3stac_write(uri, txt):
    extra = {
        'ContentType': 'application/json'
    }
    if uri.startswith('s3://'):
        s3().upload_json(json.loads(txt), uri, extra=extra, public=PUBLIC_CATALOG)
    else:
        STAC_IO.default_write_text_method(uri, txt)
Example #6
def test_upload_download(s3mock):
    url = 's3://%s/mytestfile' % BUCKET
    s3().upload(__file__, url, public=True)
    exists = s3().exists(url)
    assert (exists)
    path = os.path.join(testpath, 'test_s3/test_upload_download')
    fname = s3().download(url, path)
    assert (os.path.exists(fname))
    assert (os.path.join(path, os.path.basename(url)) == fname)
    rmtree(path)
Example #7
    def get_aws_archive(cls, collection, direct_from_s3=False, **kwargs):
        """ Generator function returning the archive of Sentinel data on AWS
        Keyword arguments:
        prefix -- Process only file keys beginning with this prefix
        start_date -- Process this date and after
        end_date -- Process this date and earlier

        Returns:
        Iterator of STAC Items using specified Transform object
        """

        # get latest AWS inventory for this collection
        inventory_url = 's3://sentinel-inventory/%s/%s-inventory' % (
            collection, collection)
        inventory = s3().latest_inventory(inventory_url,
                                          **kwargs,
                                          suffix=cls.collections[collection])
        # iterate through latest inventory
        for i, url in enumerate(inventory):
            if (i % 100) == 0:
                logger.info('%s records' % i)

            try:
                if direct_from_s3:
                    logger.debug('Fetching initial metadata: %s' % url)
                    metadata = s3().read_json(url, requester_pays=True)
                else:
                    # use free endpoint to access file
                    parts = s3().urlparse(url)
                    _url = '%s/%s/%s' % (cls.FREE_URL, collection,
                                         parts['key'])
                    logger.debug('Fetching initial metadata: %s' % _url)
                    r = requests.get(_url, stream=True)
                    metadata = json.loads(r.text)
                # transform to STAC Item
                sentinel_scene = cls(collection, metadata)
                item = sentinel_scene.to_stac(base_url=url)
                yield item

            except Exception as err:
                logger.error('Error creating STAC Item from %s, Error: %s' %
                             (url, err))
                continue
Example #8
    def get_payload(self) -> Dict:
        """Get original payload for this Catalog

        Returns:
            Dict: Cirrus Input Catalog
        """
        payload = json.dumps(self)
        if CATALOG_BUCKET and len(payload.encode('utf-8')) > 30000:
            url = f"s3://{CATALOG_BUCKET}/payloads/{uuid.uuid1()}.json"
            s3().upload_json(self, url)
            return {'url': url}
        else:
            return dict(self)
Example #9
    def get_payload(self) -> Dict:
        """Get original payload for this ProcessPayload

        Returns:
            Dict: Cirrus Input ProcessPayload
        """
        payload = json.dumps(self)
        if PAYLOAD_BUCKET and len(payload.encode('utf-8')) > 30000:
            url = f"s3://{PAYLOAD_BUCKET}/payloads/{uuid.uuid1()}.json"
            s3().upload_json(self, url)
            return {'url': url}
        else:
            return dict(self)
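The 30000-byte check in both versions above exists because oversized payloads cannot be passed around directly (Step Functions, for instance, caps execution input at 256 KB); anything larger is written to s3 and only a URL is passed along. A small, self-contained sketch of how a caller might interpret the result:

import json

def describe_step_input(step_input: dict) -> str:
    """Illustrative helper: report whether get_payload() inlined or offloaded the payload."""
    if 'url' in step_input:
        # large payload: only an s3 reference is passed to Step Functions;
        # from_event()/from_payload() later resolve it through their 'url' branch
        return f"payload offloaded to {step_input['url']}"
    return f"inline payload of {len(json.dumps(step_input))} bytes"

print(describe_step_input({'url': 's3://my-payload-bucket/payloads/abc.json'}))
print(describe_step_input({'id': 'small-payload'}))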
Example #10
def handler(payload, context):
    catalog = Catalog.from_payload(payload)
    logger = get_task_logger(f"{__name__}.pre-batch", catalog=catalog)

    url = f"s3://{CATALOG_BUCKET}/batch/{catalog['id']}/{uuid.uuid1()}.json"

    try:
        # copy payload to s3
        s3().upload_json(catalog, url)

        logger.debug(f"Uploaded catalog to {url}")
        return {'url': url}
    except Exception as err:
        msg = f"pre-batch: failed pre-processing batch job ({err})"
        logger.error(msg, exc_info=True)
        raise Exception(msg) from err
Example #11
def get_s3_session(bucket: str=None, s3url: str=None, **kwargs) -> s3:
    """Get boto3-utils s3 class for interacting with an s3 bucket. A secret will be looked for with the name
    `cirrus-creds-<bucket-name>`. If no secret is found the default session will be used

    Args:
        bucket (str, optional): Bucket name to access. Defaults to None.
        s3url (str, optional): The s3 URL to access. Defaults to None.

    Returns:
        s3: A boto3-utils s3 class
    """
    if s3url:
        parts = s3.urlparse(s3url)
        bucket = parts['bucket']

    if bucket and bucket in s3_sessions:
        return s3_sessions[bucket]
    # otherwise, create new session for this bucket
    creds = deepcopy(kwargs)
    
    try:
        # get credentials from AWS secret
        secret_name = f"cirrus-creds-{bucket}"
        _creds = secrets.get_secret(secret_name)
        creds.update(_creds)
        logger.debug(f"Using secret credentials for bucket {bucket}")
    except ClientError:
        logger.debug(f"Using default credentials for bucket {bucket}")

    requester_pays = creds.pop('requester_pays', False)
    session = boto3.Session(**creds)
    s3_sessions[bucket] = s3(session, requester_pays=requester_pays)
    return s3_sessions[bucket]
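A brief usage sketch, assuming the function above is importable (the module path is hypothetical) and that a secret named cirrus-creds-<bucket-name> may or may not exist for the bucket:

# hypothetical import path for get_s3_session as defined above
from cirrus.lib.utils import get_s3_session

# sessions are cached per bucket in s3_sessions, so repeated lookups reuse the client
s3_client = get_s3_session(s3url='s3://my-private-bucket/catalog/item.json')
item = s3_client.read_json('s3://my-private-bucket/catalog/item.json')

same_client = get_s3_session(bucket='my-private-bucket')
assert s3_client is same_client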
Example #12
    def setUp(self):
        client = s3()
        client.s3.create_bucket(Bucket=testbucket)
        client.s3.put_object(Body='test',
                             Bucket=testbucket,
                             Key=os.path.basename(__file__))
        os.makedirs(testpath, exist_ok=True)
Example #13
    def from_statedb(cls,
                     collections,
                     state,
                     since: str = None,
                     index: str = 'input_state',
                     limit=None) -> Catalogs:
        """Create Catalogs object from set of StateDB Items

        Args:
            collections (str): String of collections (input or output depending on `index`)
            state (str): The state (QUEUED, PROCESSING, COMPLETED, FAILED, INVALID) of StateDB Items to get
            since (str, optional): Get Items since this duration ago (e.g., 10m, 8h, 1w). Defaults to None.
            index (str, optional): 'input_state' or 'output_state'. Defaults to 'input_state'.
            limit (int, optional): Max number of Items to return. Defaults to None.

        Returns:
            Catalogs: Catalogs instance
        """
        catalogs = []
        items = statedb.get_items(collections,
                                  state,
                                  since,
                                  index,
                                  limit=limit)
        logger.debug(f"Retrieved {len(items)} total items from statedb")
        for item in items:
            cat = Catalog(s3().read_json(item['input_catalog']))
            catalogs.append(cat)
        logger.debug(f"Retrieved {len(catalogs)} input catalogs")
        return cls(catalogs, state_items=items)
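For context, a hedged example of querying the state database for recent failures; the collections string and import path are placeholders:

# hypothetical import path for the Catalogs class shown above
from cirrus.lib.catalogs import Catalogs

# input catalogs for items that FAILED within the last 8 hours,
# queried on the 'input_state' index and capped at 100 items
failed = Catalogs.from_statedb(
    collections='sentinel-s2-l2a',
    state='FAILED',
    since='8h',
    index='input_state',
    limit=100,
)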
Example #14
def get_root(root_url):
    cat_url = f"s3://{DATA_BUCKET}/catalog.json"
    logger.debug("Root catalog: %s", cat_url)
    cat = s3().read_json(cat_url)

    links = []
    workflows = cat.get('cirrus', {}).get('workflows', {})
    for col in workflows:
        for wf in workflows[col]:
            name = f"{col} - {wf}"
            link = create_link(
                urljoin(
                    root_url,
                    f"{col}/workflow-{wf}",
                ),
                name,
                'child',
            )
            links.append(link)

    links.insert(0, create_link(root_url, "home", "self"))
    links.append(create_link(cat_url, "STAC", "stac"))

    root = {
        "id": f"{cat['id']}-state-api",
        "description": f"{cat['description']} State API",
        "links": links
    }

    return root
Example #15
def lambda_handler(event, context):
    payload = ProcessPayload.from_event(event)
    logger = get_task_logger("task.pre-batch", payload=payload)

    url = f"s3://{PAYLOAD_BUCKET}/batch/{payload['id']}/{uuid.uuid1()}.json"

    try:
        # copy payload to s3
        s3().upload_json(payload, url)

        logger.debug(f"Uploaded payload to {url}")
        return {'url': url}
    except Exception as err:
        msg = f"pre-batch: failed pre-processing batch job ({err})"
        logger.error(msg, exc_info=True)
        raise Exception(msg) from err
Example #16
    def setUp(self):
        session = boto3.session.Session(region_name='us-east-1')
        client = s3(session)
        client.s3.create_bucket(Bucket=testbucket)
        client.s3.put_object(Body='test',
                             Bucket=testbucket,
                             Key=os.path.basename(__file__))
        os.makedirs(testpath, exist_ok=True)
Example #17
def lambda_handler(payload, context):
    logger.debug('Payload: %s' % json.dumps(payload))

    catalog = Catalogs.from_payload(payload)[0]

    url = f"s3://{CATALOG_BUCKET}/batch/{catalog['id']}/{uuid.uuid1()}.json"

    try:
        # copy payload to s3
        s3().upload_json(catalog, url)

        logger.debug(f"Uploaded {catalog['id']} to {url}")
        logger.info(f"Completed pre processing batch job for {catalog['id']}")
        return {'url': url}
    except Exception as err:
        msg = f"pre-batch: failed pre processing batch job for {catalog['id']} ({err})"
        logger.error(msg)
        logger.error(format_exc())
        raise Exception(msg) from err
Example #18
def read_inventory_file(fname,
                        keys,
                        prefix=None,
                        suffix=None,
                        start_date=None,
                        end_date=None,
                        datetime_regex=None,
                        datetime_key='LastModifiedDate'):
    logger.debug('Reading inventory file %s', fname)
    filename = s3().download(fname, path='/tmp')
    ext = op.splitext(fname)[-1]
    if ext == ".gz":
        records = read_csv_inventory_file(filename, keys)
    elif ext == ".orc":
        records = read_orc_inventory_file(filename, keys)
    else:
        raise ValueError(f"Unsupported inventory file extension: {ext}")

    if datetime_regex is not None:
        regex = re.compile(datetime_regex)
    else:
        regex = None

    sdate = parse(start_date).date() if start_date else None
    edate = parse(end_date).date() if end_date else None

    def get_datetime(record):
        if regex is not None:
            m = regex.match(record['key']).groupdict()
            dt = datetime(int(m['Y']), int(m['m']), int(m['d']))
        elif isinstance(record[datetime_key], datetime):
            dt = record[datetime_key]
        else:
            dt = datetime.strptime(record[datetime_key],
                                   "%Y-%m-%dT%H:%M:%S.%fZ")
        return dt.date()

    for record in records:
        if prefix is not None and not record['key'].startswith(prefix):
            continue

        if suffix is not None and not record['key'].endswith(suffix):
            continue

        if sdate is not None:
            dt = get_datetime(record)
            if dt < sdate:
                continue

        if edate is not None:
            dt = get_datetime(record)
            if dt > edate:
                continue

        # made it here without getting filtered out
        yield 's3://%s/%s' % (record['bucket'], record['key'])
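A usage sketch, assuming it runs alongside the read_inventory_file definition above; the inventory URL and the keys list (which must match the inventory file's column order) are illustrative:

# column names for this inventory, in the same order as the inventory file's fields
inventory_keys = ['bucket', 'key', 'size', 'LastModifiedDate']

# yield s3 URLs for .json keys under a prefix, last modified on or after 2021-06-01
for url in read_inventory_file('s3://my-inventory-bucket/data/inventory-0001.csv.gz',
                               keys=inventory_keys,
                               prefix='tiles/',
                               suffix='.json',
                               start_date='2021-06-01'):
    print(url)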
Example #19
def test_latest_inventory():
    url = 's3://sentinel-inventory/sentinel-s1-l1c/sentinel-s1-l1c-inventory'
    suffix = 'productInfo.json'
    session = boto3.Session()
    _s3 = s3(session)
    for url in _s3.latest_inventory(url, suffix=suffix):
        # dt = datetime.strptime(f['LastModifiedDate'], "%Y-%m-%dT%H:%M:%S.%fZ")
        # hours = (datetime.today() - dt).seconds // 3600
        # assert(hours < 24)
        assert (url.endswith(suffix))
        break
Example #20
    def test_upload_item_assets(self):
        item = self.get_test_item()
        path_template = 's3://testbucket/${id}/test'
        assets = ['local']
        new_item = transfer.upload_item_assets(item,
                                               assets=assets,
                                               path_template=path_template,
                                               s3_urls=True,
                                               region_name='us-west-2')
        for k in assets:
            assert (new_item['assets'][k]['href'].startswith('s3://'))
            assert (s3().exists(new_item['assets'][k]['href']))
Example #21
def get_root_catalog() -> Dict:
    """Get Cirrus root catalog from s3

    Returns:
        Dict: STAC root catalog
    """
    if s3().exists(ROOT_URL):
        cat = Catalog.from_file(ROOT_URL)
    else:
        catid = DATA_BUCKET.split('-data-')[0]
        cat = Catalog(id=catid, description=DESCRIPTION)
    logger.debug(f"Fetched {cat.describe()}")
    return cat
Example #22
    def __call__(self) -> str:
        """Add this ProcessPayload to Cirrus and start workflow

        Returns:
            str: ProcessPayload ID
        """
        assert (PAYLOAD_BUCKET)

        arn = os.getenv('CIRRUS_BASE_WORKFLOW_ARN') + self.process['workflow']

        # start workflow
        try:
            # add input payload to s3
            url = f"s3://{PAYLOAD_BUCKET}/{self['id']}/input.json"
            s3().upload_json(self, url)

            # create DynamoDB record - this overwrites existing states other than PROCESSING
            resp = statedb.claim_processing(self['id'])

            # invoke step function
            self.logger.debug(f"Running Step Function {arn}")
            exe_response = stepfunctions.start_execution(
                stateMachineArn=arn, input=json.dumps(self.get_payload()))

            # add execution to DynamoDB record
            resp = statedb.set_processing(self['id'],
                                          exe_response['executionArn'])

            return self['id']
        except statedb.db.meta.client.exceptions.ConditionalCheckFailedException:
            self.logger.warning('Already in PROCESSING state')
            return None
        except Exception as err:
            msg = f"failed starting workflow ({err})"
            self.logger.exception(msg)
            statedb.set_failed(self['id'], msg)
            raise
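As a rough sketch, starting a workflow amounts to constructing a ProcessPayload and calling it; the payload shape, import path, and environment configuration here are assumptions, not the library's documented API:

# hypothetical import path; CIRRUS_BASE_WORKFLOW_ARN and the payload bucket
# must already be configured in the environment
from cirrus.lib.process_payload import ProcessPayload

payload = ProcessPayload({
    'id': 'sentinel-s2-l2a/workflow-publish/S2A_12345',
    'process': {'workflow': 'publish'},
    'features': [],
})

# returns the payload ID on success, or None if it is already PROCESSING
payload_id = payload()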
Example #23
def get_root_catalog():
    """Get Cirrus root catalog from s3

    Returns:
        Dict: STAC root catalog
    """
    caturl = f"{ROOT_URL}/catalog.json"
    if s3().exists(caturl):
        cat = Catalog.from_file(caturl)
    else:
        catid = DATA_BUCKET.split('-data-')[0]
        cat = Catalog(id=catid, description=DESCRIPTION)
        cat.normalize_and_save(ROOT_URL, CatalogType.ABSOLUTE_PUBLISHED)
    logger.debug(f"Fetched {cat.describe()}")
    return cat
Example #24
def cli():
    args = parse_args(sys.argv[1:])
    cmd = args.pop('command')

    # logging
    logging.basicConfig(
        stream=sys.stdout,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=args.pop('log') * 10)
    # quiet these loud loggers
    logging.getLogger("botocore").propagate = False
    logging.getLogger("s3transfer").propagate = False
    logging.getLogger("urllib3").propagate = False

    if cmd == 'local':
        with open(args['filename']) as f:
            payload = json.loads(f.read())
        handler(payload, local=args['workdir'])
    elif cmd == 'cirrus':
        # fetch input catalog
        catalog = s3().read_json(args['url'])
        catalog = handler(catalog)
        # upload return payload
        s3().upload_json(catalog, args["url"].replace('.json', '_out.json'))
Example #25
def read_inventory_file(fname, keys, prefix=None, suffix=None,
                        start_date=None, end_date=None,
                        datetime_regex=None, datetime_key='LastModifiedDate'):
    logger.debug('Reading inventory file %s' % (fname))
    
    if datetime_regex is not None:
        regex = re.compile(datetime_regex)
    else:
        regex = None

    sdate = parse(start_date).date() if start_date else None
    edate = parse(end_date).date() if end_date else None

    filename = s3().download(fname, path='/tmp')

    def get_datetime(record):
        if regex is not None:
            m = regex.match(record['Key']).groupdict()
            dt = datetime(int(m['Y']), int(m['m']), int(m['d']))
        else:
            dt = datetime.strptime(record[datetime_key], "%Y-%m-%dT%H:%M:%S.%fZ")
        return dt.date()

    gz = gzip.open(filename, 'rb')
    for line in io.BufferedReader(gz):
        row = line.decode('utf-8').replace('"', '').replace('\n', '')
        record = {keys[i]: v for i, v in enumerate(row.split(','))}

        if prefix is not None and not record['Key'].startswith(prefix):
            continue

        if suffix is not None and not record['Key'].endswith(suffix):
            continue

        if sdate is not None:
            dt = get_datetime(record)
            if dt < sdate:
                continue

        if edate is not None:
            dt = get_datetime(record)
            if dt > edate:
                continue        

        # made it here without getting filtered out
        yield 's3://%s/%s' % (record['Bucket'], record['Key'])

    gz.close()
Example #26
def lambda_handler(payload, context):
    logger.debug('Payload: %s' % json.dumps(payload))

    # catalog URL
    url = payload['Parameters']['url'].replace('.json', '_out.json')

    try:
        # copy payload from s3
        catalog = s3().read_json(url)
        logger.info(f"Completed post processing batch job for {catalog['id']}")
        return catalog
    except Exception as err:
        msg = f"post-batch: failed post processing batch job for {url} ({err})"
        logger.error(msg)
        logger.error(format_exc())
        raise Exception(msg) from err
Example #27
    def from_catids(cls, catids: List[str], **kwargs) -> Catalogs:
        """Create Catalogs from list of Catalog IDs

        Args:
            catids (List[str]): List of catalog IDs

        Returns:
            Catalogs: A Catalogs instance
        """
        items = [
            statedb.dbitem_to_item(statedb.get_dbitem(catid))
            for catid in catids
        ]
        catalogs = []
        for item in items:
            cat = Catalog(s3().read_json(item['input_catalog']))
            catalogs.append(cat)
        logger.debug(f"Retrieved {len(catalogs)} from state db")
        return cls(catalogs, state_items=items)
Example #28
def submit_inventory_batch_jobs(inventory_url,
                                lambda_arn,
                                batch_size: int = 10,
                                max_batches: int = -1):
    urls = []
    n = 0
    for url in s3().latest_inventory_files(inventory_url):
        urls.append(url)
        if (len(urls) % batch_size) == 0:
            submit_batch_job({'inventory_files': urls}, lambda_arn)
            urls = []
            n += 1
            if max_batches > 0 and n >= max_batches:
                break
    if len(urls) > 0:
        submit_batch_job({'inventory_files': urls}, lambda_arn)
        n += 1
    logger.info(f"Submitted {n} jobs")
    return n
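A short usage sketch; the inventory URL and Lambda ARN are placeholders:

inventory_url = 's3://sentinel-inventory/sentinel-s2-l2a/sentinel-s2-l2a-inventory'
lambda_arn = 'arn:aws:lambda:us-west-2:123456789012:function:feed-sentinel'

# submit up to 5 Batch jobs, each covering 20 inventory files
num_jobs = submit_inventory_batch_jobs(inventory_url, lambda_arn,
                                       batch_size=20, max_batches=5)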
Example #29
    def from_payload_ids(cls, payload_ids: List[str],
                         **kwargs) -> ProcessPayloads:
        """Create ProcessPayloads from list of Payload IDs

        Args:
            payload_ids (List[str]): List of Payload IDs

        Returns:
            ProcessPayloads: A ProcessPayloads instance
        """
        items = [
            statedb.dbitem_to_item(statedb.get_dbitem(payload_id))
            for payload_id in payload_ids
        ]
        payloads = []
        for item in items:
            payload = ProcessPayload(s3().read_json(item['payload']))
            payloads.append(payload)
        logger.debug(f"Retrieved {len(payloads)} from state db")
        return cls(payloads, state_items=items)
Example #30
def get_s3_session(bucket: str=None, s3url: str=None, **kwargs) -> s3:
    """Get boto3-utils s3 class for interacting with an s3 bucket. A secret will be looked for with the name
    `cirrus-creds-<bucket-name>`. If no secret is found the default session will be used

    Args:
        bucket (str, optional): Bucket name to access. Defaults to None.
        s3url (str, optional): The s3 URL to access. Defaults to None.

    Returns:
        s3: A boto3-utils s3 class
    """
    if s3url:
        parts = s3.urlparse(s3url)
        bucket = parts['bucket']

    if bucket and bucket in s3_sessions:
        return s3_sessions[bucket]
    # otherwise, create new session for this bucket
    creds = deepcopy(kwargs)

    try:
        # get credentials from AWS secret
        secret_name = f"cirrus-creds-{bucket}"
        _creds = secrets.get_secret(secret_name)
        creds.update(_creds)
    except ClientError as e:
        if e.response["Error"]["Code"] != "ResourceNotFoundException":
            # some other client error we cannot handle
            raise e
        logger.info(f"Secret not found, using default credentials: '{secret_name}'")


    requester_pays = creds.pop('requester_pays', False)
    session = boto3.Session(**creds)
    s3_sessions[bucket] = s3(session, requester_pays=requester_pays)
    return s3_sessions[bucket]