def lambda_handler(payload, context={}):
    logger.debug('Payload: %s' % json.dumps(payload))

    # if this is batch, output to stdout
    if not hasattr(context, "invoked_function_arn"):
        logger.addHandler(logging.StreamHandler())

    collections = payload.get('collections')
    index = payload.get('index', 'input_state')
    state = payload.get('state', 'FAILED')
    since = payload.get('since', None)
    limit = payload.get('limit', None)
    batch = payload.get('batch', False)
    catids = payload.get('catids', [])

    # if this is a lambda and batch is set
    if batch and hasattr(context, "invoked_function_arn"):
        submit_batch_job(payload, context.invoked_function_arn, name='rerun')
        return

    if len(catids) > 0:
        catalogs = Catalogs.from_catids(catids)
        logger.debug(f"Rerunning {len(catalogs)} catalogs")
        catids = catalogs.process(replace=True)
        logger.info(f"{len(catids)} catalogs rerun")
        return catids

    catalogs = Catalogs.from_statedb(collections, state, since, index, limit=limit)
    logger.info(f"Fetched {len(catalogs.catalogs)} catalogs")
    catids = catalogs.process(replace=True)
    logger.info(f"{len(catids)} catalogs rerun")
    return catids
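
# Illustrative example (not from the original source; all IDs and names are
# hypothetical): a payload that reruns two explicitly listed catalogs. If
# 'catids' is omitted, the handler instead queries the state database using
# 'collections', 'state', 'since', 'index', and 'limit'.
example_catids_payload = {
    "catids": [
        "collection-a/workflow-example/item-0001",  # hypothetical catalog ID
        "collection-a/workflow-example/item-0002",  # hypothetical catalog ID
    ],
}
example_statedb_payload = {
    "collections": "collection-a",  # hypothetical collection name
    "state": "FAILED",
    "limit": 100,
}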
def handler(event, context={}):
    # if this is batch, output to stdout
    if not hasattr(context, "invoked_function_arn"):
        logger.addHandler(logging.StreamHandler())

    logger.debug('Event: %s' % json.dumps(event))

    # parse input
    url = event.get('url')
    batch = event.get('batch', False)
    process = event['process']

    if batch and hasattr(context, "invoked_function_arn"):
        submit_batch_job(event, context.invoked_function_arn,
                         definition='lambda-as-batch', name='feed-stac-crawl')
        return

    cat = Catalog.from_file(url)

    for item in cat.get_all_items():
        payload = {
            'type': 'FeatureCollection',
            'features': [item.to_dict()],
            'process': process
        }
        SNS_CLIENT.publish(TopicArn=SNS_TOPIC, Message=json.dumps(payload))
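
# Illustrative example event (hypothetical URL and workflow name): crawl a
# static STAC catalog and publish every item to the process SNS topic. Setting
# 'batch': True while running as a Lambda would instead re-submit this event
# as an AWS Batch job ('feed-stac-crawl').
example_crawl_event = {
    "url": "https://example.com/catalog.json",  # hypothetical root catalog URL
    "batch": False,
    "process": {
        "workflow": "publish-only",  # hypothetical Cirrus process definition, passed through untouched
    },
}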
def handler(payload, context={}):
    logger.debug('Payload: %s' % json.dumps(payload))

    collections = payload.get('collections')
    index = payload.get('index', 'input_state')
    state = payload.get('state', 'FAILED')
    since = payload.get('since', None)
    limit = payload.get('limit', None)
    batch = payload.get('batch', False)
    process_update = payload.get('process_update', None)

    catid_batch = 5

    # if this is a lambda and batch is set
    if batch and hasattr(context, "invoked_function_arn"):
        submit_batch_job(payload, context.invoked_function_arn, name='rerun')
        return

    items = statedb.get_items(collections, state, since, index, limit=limit)

    nitems = len(items)
    logger.debug(f"Rerunning {nitems} catalogs")

    catids = []
    for i, item in enumerate(items):
        catids.append(item['catid'])
        if (i % catid_batch) == 0:
            submit(catids, process_update=process_update)
            catids = []
        if (i % 1000) == 0:
            logger.debug(f"Queued {i} catalogs")
    if len(catids) > 0:
        submit(catids, process_update=process_update)

    return {"found": nitems}
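
# Illustrative example payload (hypothetical values): rerun up to 100 FAILED
# catalogs from one collection. 'since' is assumed to use the state database's
# relative-time format, and 'process_update' is an optional partial process
# definition that submit() merges into each rerun catalog.
example_rerun_payload = {
    "collections": "sentinel-s2-l2a",  # hypothetical collection name
    "state": "FAILED",
    "since": "1d",                     # assumed relative-time format
    "limit": 100,
    "process_update": {"replace": True},  # hypothetical partial process update
}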
def handler(event, context={}):
    logger.debug('Event: %s' % json.dumps(event))

    url = event.get('url')
    params = event.get('search', {})
    max_items_batch = event.get('max_items_batch', 15000)
    sleep = event.get('sleep', None)
    process = event.get('process', None)

    # search API
    search = Search(url=url, **params)
    logger.debug(f"Searching {url}")

    found = search.found()
    logger.debug(f"Total items found: {found}")

    if found <= MAX_ITEMS_REQUEST:
        return run(params, url, sleep=sleep, process=process)
    elif hasattr(context, "invoked_function_arn"):
        nbatches = int(found / max_items_batch) + 1
        if nbatches == 1:
            submit_batch_job(event, context.invoked_function_arn, definition='lambda-as-batch')
        else:
            for request in split_request(params, nbatches):
                event['search'] = request
                submit_batch_job(event, context.invoked_function_arn, definition='lambda-as-batch')
        logger.info(f"Submitted {nbatches} batches")
        return
    else:
        run(params, url, sleep=sleep, process=process)
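
# Illustrative example event (hypothetical endpoint, collection, and workflow):
# search a STAC API for one month over a small bounding box. If more than
# MAX_ITEMS_REQUEST items match, the handler splits the search into chunks of
# 'max_items_batch' and fans out 'lambda-as-batch' Batch jobs.
example_api_event = {
    "url": "https://example.com/stac/v0",  # hypothetical STAC API endpoint
    "search": {
        "collections": ["sentinel-s2-l2a"],     # hypothetical collection
        "datetime": "2020-06-01/2020-06-30",
        "bbox": [12.0, 41.5, 13.0, 42.5],
    },
    "max_items_batch": 15000,
    "sleep": 0.1,                               # optional delay between publishes
    "process": {"workflow": "publish-only"},    # hypothetical process definition
}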
def lambda_handler(event, context={}):
    logger.debug('Event: %s' % json.dumps(event))

    # if this is batch, output to stdout
    if not hasattr(context, "invoked_function_arn"):
        logger.addHandler(logging.StreamHandler())

    # parse input
    url = event.get('url')
    params = event.get('search', {})
    max_items_batch = event.get('max_items_batch', 15000)
    sleep = event.get('sleep', None)

    # search API
    search = Search(api_url=url, **params)
    logger.debug(f"Searching {url}")

    found = search.found()
    logger.debug(f"Total items found: {found}")

    if found <= MAX_ITEMS_REQUEST:
        return run(params, url, sleep=sleep)
    elif hasattr(context, "invoked_function_arn"):
        nbatches = int(found / max_items_batch) + 1
        if nbatches == 1:
            submit_batch_job(event, context.invoked_function_arn, definition='lambda-as-batch')
        else:
            for request in split_request(params, nbatches):
                event['search'] = request
                submit_batch_job(event, context.invoked_function_arn, definition='lambda-as-batch')
        logger.info(f"Submitted {nbatches} batches")
        return
    else:
        run(params, url, sleep=sleep)
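
# Illustrative example event for this older variant (hypothetical values): the
# shape matches example_api_event above, except that Search is constructed with
# 'api_url' here and the event carries no 'process' field.
example_legacy_api_event = {
    "url": "https://example.com/stac/v0",            # hypothetical API endpoint
    "search": {"collections": ["example-collection"]},
    "sleep": None,
}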
def submit_inventory_batch_jobs(inventory_url, lambda_arn,
                                batch_size: int = 10, max_batches: int = -1):
    urls = []
    n = 0
    for url in s3().latest_inventory_files(inventory_url):
        urls.append(url)
        if (len(urls) % batch_size) == 0:
            submit_batch_job({'inventory_files': urls}, lambda_arn)
            urls = []
            n += 1
            if max_batches > 0 and n > max_batches:
                break
    if len(urls) > 0:
        submit_batch_job({'inventory_files': urls}, lambda_arn)
        n += 1
    logger.info(f"Submitted {n} jobs")
    return n
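
# Illustrative usage (hypothetical bucket and ARN; requires AWS credentials and
# a deployed Batch job definition, so shown commented out): chunk the latest S3
# inventory listing into batches of 10 files and submit one Batch job per chunk.
#
#   submit_inventory_batch_jobs(
#       "s3://example-inventory-bucket/source-bucket/daily-inventory",
#       "arn:aws:lambda:us-west-2:123456789012:function:feed-s3-inventory",
#       batch_size=10,
#       max_batches=2,   # cap for a quick test; -1 means no cap
#   )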
def handler(payload, context={}):
    logger.info('Payload: %s' % json.dumps(payload))

    # get payload variables
    inventory_url = payload.pop('inventory_url', None)
    batch_size = payload.pop('batch_size', 10)
    max_batches = payload.pop('max_batches', -1)
    # required payload variable
    process = payload.pop('process')

    s3session = s3()

    # get latest inventory manifest and spawn batches (this currently assumes being run as Lambda!)
    if inventory_url is not None:
        inventory_bucket = s3session.urlparse(inventory_url)['bucket']

        # get manifest and schema
        manifest = s3session.latest_inventory_manifest(inventory_url)
        schema = manifest['fileSchema']
        if schema.startswith('struct'):
            keys = [str(key).strip().split(':')[0] for key in schema[7:-1].split(',')]
        else:
            keys = [str(key).strip() for key in schema.split(',')]

        # get list of inventory files
        files = manifest.get('files')
        logger.info('Getting latest inventory (%s files) from %s' % (len(files), inventory_url))

        submitted_urls = []
        njobs = 0
        for f in files:
            url = f"s3://{inventory_bucket}/{f['key']}"
            submitted_urls.append(url)
            if (len(submitted_urls) % batch_size) == 0:
                batch_payload = {
                    'inventory_files': submitted_urls,
                    'keys': keys,
                    'process': process
                }
                batch_payload.update(payload)
                submit_batch_job(batch_payload, context.invoked_function_arn,
                                 definition='lambda-as-batch', name='feed-s3-inventory')
                submitted_urls = []
                njobs += 1
                # stop if max batches reached (used for testing)
                if max_batches > 0 and njobs >= max_batches:
                    break
        if len(submitted_urls) > 0:
            batch_payload = {
                'inventory_files': submitted_urls,
                'keys': keys,
                'process': process
            }
            batch_payload.update(payload)
            submit_batch_job(batch_payload, context.invoked_function_arn,
                             definition='lambda-as-batch', name='feed-s3-inventory')
            njobs += 1
        logger.info(f"Submitted {njobs} batch jobs")
        return njobs

    # process inventory files (assumes this is batch!)
    inventory_files = payload.pop('inventory_files', None)
    keys = payload.pop('keys', None)
    base_url = payload.pop('base_url', None)

    # these are all required
    catids = []
    if inventory_files and keys and process:
        # filter filenames
        logger.info(f"Parsing {len(inventory_files)} inventory files")
        for f in inventory_files:
            for url in read_inventory_file(f, keys, **payload):
                parts = s3session.urlparse(url)
                id = '-'.join(op.dirname(parts['key']).split('/'))
                # use extension without . for asset key
                ext = op.splitext(parts['key'])[-1].lstrip('.')
                if base_url is not None and url.startswith('s3://'):
                    url = f"{base_url}/{parts['bucket']}/{parts['key']}"
                # TODO - determine input collection from url
                item = {
                    'type': 'Feature',
                    'id': id,
                    'collection': process['input_collections'][0],
                    'properties': {},
                    'assets': {
                        ext: {
                            'href': url
                        }
                    }
                }
                catalog = {
                    'type': 'FeatureCollection',
                    'features': [item],
                    'process': process
                }
                # feed to cirrus through SNS topic
                SNS_CLIENT.publish(TopicArn=SNS_TOPIC, Message=json.dumps(catalog))
                if (len(catids) % 1000) == 0:
                    logger.debug(f"Published {len(catids)} catalogs to {SNS_TOPIC}: {json.dumps(catalog)}")
                catids.append(item['id'])
        logger.info(f"Published {len(catids)} catalogs from {len(inventory_files)} inventory files")
    return catids
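
# Illustrative example payloads (hypothetical buckets, keys, and workflow
# names). The first form points at an S3 inventory and fans out Batch jobs; the
# second is the shape each Batch job then receives (normally built by the
# handler itself, not by hand).
example_spawn_payload = {
    "inventory_url": "s3://example-inventory-bucket/source-bucket/daily-inventory",
    "batch_size": 10,
    "max_batches": -1,
    "process": {
        "workflow": "publish-only",                  # hypothetical process definition
        "input_collections": ["example-collection"],
    },
}
example_batch_payload = {
    "inventory_files": [
        "s3://example-inventory-bucket/data/abc123.csv.gz",  # hypothetical inventory file
    ],
    "keys": ["bucket", "key", "size", "last_modified_date"],  # illustrative schema fields
    "process": {
        "workflow": "publish-only",
        "input_collections": ["example-collection"],
    },
    # any remaining keys are passed through to read_inventory_file() as filters
}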