def hrefs_from_catalog(catalog: Catalog, N: int = None) -> Tuple[list, list]:
    """Collect label and imagery HREFs from the label collection of a catalog.

    The first child of ``catalog`` whose description contains "label"
    (case-insensitive) is used as the label collection. For every item in it
    the HREF of the ``data`` asset is recorded (after passing through
    ``pystac_workaround``), together with the HREF of the ``cog`` asset of
    the object behind the item's first ``source`` link.

    Args:
        catalog: Catalog to read. Its asset HREFs are made absolute in place.
        N: Optional index. When given, only the N-th label/imagery pair is
            returned, each wrapped in a one-element list.

    Returns:
        A ``(label_hrefs, imagery_hrefs)`` pair of equally long lists. An
        imagery entry is ``None`` when the source imagery of the matching
        label item could not be resolved.

    Raises:
        StopIteration: If no child of the catalog has "label" in its
            description.
        IndexError: If ``N`` is out of range.
    """
    def find_label_collection(c):
        return 'label' in str.lower(c.description)

    catalog.make_all_asset_hrefs_absolute()
    labels = next(filter(find_label_collection, catalog.get_children()))
    label_items = list(labels.get_items())

    label_hrefs = []
    imagery_hrefs = []
    for item in label_items:
        try:
            imagery = item.get_links('source')[0]
            imagery.resolve_stac_object()
            imagery_href = imagery.target.assets.get('cog').href
        except Exception:
            # Best effort: a label item without resolvable source imagery
            # still contributes its label HREF; the imagery slot stays None.
            imagery_href = None

        label_href = pystac_workaround(item.assets.get('data').href)
        label_hrefs.append(label_href)

        # Only post-process an HREF that was actually found; previously a
        # None value fell through to .startswith() and raised AttributeError.
        if imagery_href is not None:
            if imagery_href.startswith('./'):
                # Drop the leading "<digits>-" prefix from relative names.
                imagery_href = re.sub(r'^\./[0-9]+-', './', imagery_href)
            imagery_href = imagery_href.replace('14060at01p00r17',
                                                '140603t01p00r17')
        imagery_hrefs.append(imagery_href)

    if N is not None:
        N = int(N)
        label_hrefs = [label_hrefs[N]]
        imagery_hrefs = [imagery_hrefs[N]]
    return (label_hrefs, imagery_hrefs)
def test_case_3():
    """Build an in-memory catalog with one imagery item and one label item.

    The label item names the imagery item's 'ortho' asset as its source.

    Returns:
        The root catalog containing both items.
    """
    root_cat = Catalog(id='test3',
                       description='test case 3 catalog',
                       title='test case 3 title')

    # Imagery item carrying a single GeoTIFF asset.
    image_item = Item(id='imagery-item',
                      geometry=RANDOM_GEOM,
                      bbox=RANDOM_BBOX,
                      datetime=datetime.utcnow(),
                      properties={})
    ortho_asset = Asset(href='some/geotiff.tiff', media_type=MediaType.GEOTIFF)
    image_item.add_asset('ortho', ortho_asset)

    # Label item describing a two-class vector classification.
    class_counts = [LabelCount('one', 1), LabelCount('two', 2)]
    overviews = [LabelOverview('label', counts=class_counts)]
    label_classes = [LabelClasses(classes=['one', 'two'], name='label')]
    label_item = LabelItem(id='label-items',
                           geometry=RANDOM_GEOM,
                           bbox=RANDOM_BBOX,
                           datetime=datetime.utcnow(),
                           properties={},
                           label_description='ML Labels',
                           label_type='vector',
                           label_properties=['label'],
                           label_classes=label_classes,
                           label_tasks=['classification'],
                           label_methods=['manual'],
                           label_overviews=overviews)
    label_item.add_source(image_item, assets=['ortho'])

    for member in (image_item, label_item):
        root_cat.add_item(member)
    return root_cat
def test_items_with_no_input_source_raise_exceptions(self):
    """Invoking the adapter on a catalog whose only item has no assets raises RuntimeError."""
    bare_item = Item('1', None, [0, 0, 1, 1], '2020-01-01T00:00:00.000Z', {})
    catalog = Catalog('0', 'Catalog 0')
    catalog.add_item(bare_item)

    adapter = AdapterTester(Message(full_message), catalog, config=self.config)

    with self.assertRaises(RuntimeError):
        adapter.invoke()
def setUp(self):
    """Create scratch directories, a saved empty input catalog and a config fixture."""
    # Two independent temporary directories: one for work, one for input.
    self.workdir = mkdtemp()
    self.inputdir = mkdtemp()

    # Persist an empty self-contained catalog into the input directory.
    empty_catalog = Catalog('test-id', 'test catalog')
    self.catalog = empty_catalog
    empty_catalog.normalize_and_save(self.inputdir, CatalogType.SELF_CONTAINED)

    fixture = config_fixture()
    self.config = fixture
    print(fixture)
def test_full_copy_2(self):
    """Save a catalog and its full copy to separate roots and check both."""
    with TemporaryDirectory() as tmp_dir:
        source_root = os.path.join(tmp_dir, 'catalog-full-copy-2-source')
        dest_root = os.path.join(tmp_dir, 'catalog-full-copy-2-dest')

        cat = Catalog(id='test', description='test catalog')

        # Imagery item with two GeoTIFF assets.
        image_item = Item(id='Imagery',
                          geometry=RANDOM_GEOM,
                          bbox=RANDOM_BBOX,
                          datetime=datetime.utcnow(),
                          properties={})
        for key in ['ortho', 'dsm']:
            tif = Asset(href='some/{}.tif'.format(key),
                        media_type=MediaType.GEOTIFF)
            image_item.add_asset(key, tif)

        # Label item that points back at the imagery's 'ortho' asset.
        # NOTE(review): label_properties is a bare string here rather than a
        # list of strings -- confirm this is intended.
        classes = [LabelClasses(classes=['one', 'two'], name='label')]
        label_item = LabelItem(id='Labels',
                               geometry=RANDOM_GEOM,
                               bbox=RANDOM_BBOX,
                               datetime=datetime.utcnow(),
                               properties={},
                               label_description='labels',
                               label_type='vector',
                               label_properties='label',
                               label_classes=classes,
                               label_tasks=['classification'])
        label_item.add_source(image_item, assets=['ortho'])

        cat.add_items([image_item, label_item])

        # Save the original under the source root.
        cat.normalize_hrefs(source_root)
        cat.save(catalog_type=CatalogType.ABSOLUTE_PUBLISHED)

        # Save the copy under the destination root.
        cat2 = cat.full_copy()
        cat2.normalize_hrefs(dest_root)
        cat2.save(catalog_type=CatalogType.ABSOLUTE_PUBLISHED)

        self.check_catalog(cat, 'source')
        self.check_catalog(cat2, 'dest')
def test_case_3() -> Catalog:
    """Build an in-memory catalog with one imagery item and one label item.

    The label item is a plain ``Item`` extended through ``LabelExtension``;
    it names the imagery item's "ortho" asset as its source.

    Returns:
        The root catalog containing both items.
    """
    root_cat = Catalog(
        id="test3",
        description="test case 3 catalog",
        title="test case 3 title",
    )

    # Imagery item carrying a single GeoTIFF asset.
    image_item = Item(
        id="imagery-item",
        geometry=ARBITRARY_GEOM,
        bbox=ARBITRARY_BBOX,
        datetime=datetime.utcnow(),
        properties={},
    )
    ortho_asset = Asset(href="some/geotiff.tiff", media_type=MediaType.GEOTIFF)
    image_item.add_asset("ortho", ortho_asset)

    class_counts = [LabelCount.create("one", 1), LabelCount.create("two", 2)]
    overviews = [LabelOverview.create("label", counts=class_counts)]

    # Label item: start from a bare Item, then apply the label extension.
    label_item = Item(
        id="label-items",
        geometry=ARBITRARY_GEOM,
        bbox=ARBITRARY_BBOX,
        datetime=datetime.utcnow(),
        properties={},
    )
    LabelExtension.add_to(label_item)
    label_ext = LabelExtension.ext(label_item)
    label_classes = [LabelClasses.create(classes=["one", "two"], name="label")]
    label_ext.apply(
        label_description="ML Labels",
        label_type=LabelType.VECTOR,
        label_properties=["label"],
        label_classes=label_classes,
        label_tasks=["classification"],
        label_methods=["manual"],
        label_overviews=overviews,
    )
    label_ext.add_source(image_item, assets=["ortho"])

    for member in (image_item, label_item):
        root_cat.add_item(member)
    return root_cat
def validate_catalog(self, catalog: pystac.Catalog) -> int:
    """Validate a catalog, its child catalogs (recursively) and its items.

    Args:
        catalog: The catalog to validate.

    Returns:
        The number of objects validated: this catalog, every descendant
        catalog and every item directly under any of them.
    """
    catalog.validate()

    # One for this catalog plus whatever each child subtree reports.
    total = 1 + sum(
        self.validate_catalog(sub_catalog)
        for sub_catalog in catalog.get_children())

    for member in catalog.get_items():
        member.validate()
        total += 1
    return total
def get_root_catalog() -> Catalog:
    """Get the Cirrus root catalog from s3, or build a new one.

    Returns:
        Catalog: The STAC root catalog read from ``ROOT_URL`` when that
            object exists; otherwise a new, unsaved catalog whose id is the
            part of ``DATA_BUCKET`` before ``'-data-'``.
    """
    if s3().exists(ROOT_URL):
        cat = Catalog.from_file(ROOT_URL)
    else:
        catid = DATA_BUCKET.split('-data-')[0]
        cat = Catalog(id=catid, description=DESCRIPTION)

    # Log the catalog itself: Catalog.describe() prints to stdout and returns
    # None, so interpolating it always produced the message "Fetched None".
    logger.debug("Fetched %s", cat)
    return cat
def main(data_dir, input_references, store_username, store_apikey):
    """Gather the referenced STAC items into one catalog saved under ``data_dir``.

    Args:
        data_dir: Root href the new catalog is normalized to and saved under.
        input_references: Iterable of references readable by ``pystac.read_file``.
            An Item is taken as-is; a Catalog contributes its direct items;
            anything else is ignored.
        store_username: If not None, exported (with ``store_apikey``) through
            the STAGEIN_USERNAME / STAGEIN_PASSWORD environment variables.
        store_apikey: Value exported as STAGEIN_PASSWORD.
    """
    if store_username is not None:
        os.environ['STAGEIN_USERNAME'] = store_username
        os.environ['STAGEIN_PASSWORD'] = store_apikey

    # Route PySTAC reads through the custom reader.
    STAC_IO.read_text_method = my_read_method

    items = []
    for input_reference in input_references:
        stac_object = pystac.read_file(input_reference)
        if isinstance(stac_object, pystac.item.Item):
            items.append(stac_object)
        elif isinstance(stac_object, pystac.catalog.Catalog):
            items.extend(stac_object.get_items())

    # create catalog
    catalog = Catalog(id='catalog', description='staged STAC catalog')
    catalog.add_items(items)
    catalog.normalize_and_save(root_href=data_dir,
                               catalog_type=CatalogType.RELATIVE_PUBLISHED)
    catalog.describe()
def test_determine_type_for_unknown(self):
    """determine_type yields None for a catalog dict serialized without its self link."""
    root = Catalog(id='test', description='test desc')
    root.add_child(Catalog(id='subcat', description='subcat desc'))
    root.normalize_hrefs('http://example.com')

    serialized = root.to_dict(include_self_link=False)

    self.assertIsNone(CatalogType.determine_type(serialized))
def get_root_catalog():
    """Get the Cirrus root catalog from s3, creating and saving it if absent.

    Returns:
        Catalog: The STAC root catalog read from ``{ROOT_URL}/catalog.json``
            when that object exists; otherwise a new catalog (id taken from
            the part of ``DATA_BUCKET`` before ``'-data-'``) that has been
            saved under ``ROOT_URL`` as an absolute published catalog.
    """
    caturl = f"{ROOT_URL}/catalog.json"
    if s3().exists(caturl):
        cat = Catalog.from_file(caturl)
    else:
        catid = DATA_BUCKET.split('-data-')[0]
        cat = Catalog(id=catid, description=DESCRIPTION)
        cat.normalize_and_save(ROOT_URL, CatalogType.ABSOLUTE_PUBLISHED)

    # Log the catalog itself: Catalog.describe() prints to stdout and returns
    # None, so interpolating it always produced the message "Fetched None".
    logger.debug("Fetched %s", cat)
    return cat
def test_map_assets_tup(self):
    """map_assets accepts a mapper that returns a (new_key, asset) tuple.

    GeoTIFF assets are retitled and re-keyed with a '-modified' suffix; all
    other assets are returned untouched. After a save/reload round trip both
    kinds must be present with the expected titles.
    """
    changed_assets = []

    def asset_mapper(key, asset):
        if 'geotiff' not in asset.media_type:
            return asset
        asset.title = 'NEW TITLE'
        changed_assets.append(key)
        return ('{}-modified'.format(key), asset)

    with TemporaryDirectory() as tmp_dir:
        cat_dir = os.path.join(tmp_dir, 'cat')

        mapped = TestCases.test_case_2().map_assets(asset_mapper)
        mapped.normalize_hrefs(cat_dir)
        mapped.save(catalog_type=CatalogType.ABSOLUTE_PUBLISHED)

        result_cat = Catalog.from_file(os.path.join(cat_dir, 'catalog.json'))

        saw_changed = False
        saw_unchanged = False
        for item in result_cat.get_all_items():
            for key, asset in item.assets.items():
                original_key = key.replace('-modified', '')
                if original_key in changed_assets:
                    saw_changed = True
                    self.assertEqual(asset.title, 'NEW TITLE')
                else:
                    saw_unchanged = True
                    self.assertNotEqual(asset.title, 'NEW TITLE')

        self.assertTrue(saw_changed)
        self.assertTrue(saw_unchanged)
def test_case_4():
    """Load the catalog stored under data-files/catalogs/test-case-4.

    The fixture is a local copy of the Tier 1 dataset from DrivenData's
    OpenCities AI Challenge.

    See: https://www.drivendata.org/competitions/60/building-segmentation-disaster-resilience
    """
    catalog_path = TestCases.get_path(
        'data-files/catalogs/test-case-4/catalog.json')
    return Catalog.from_file(catalog_path)
def move_all_assets(catalog: Catalog,
                    asset_subdirectory: Optional[str] = None,
                    make_hrefs_relative: bool = True,
                    copy: bool = False,
                    ignore_conflicts: bool = False) -> Catalog:
    """Moves assets in a catalog to be alongside the items that own them.

    Every item reachable from ``catalog`` is passed to ``move_assets`` with
    the same options.

    Args:
        catalog (Catalog or Collection): The PySTAC Catalog or Collection
            to perform the asset transformation on.
        asset_subdirectory (str or None): A subdirectory that will be used
            to store the assets. If not supplied, the assets will be moved
            or copied to the same directory as their item.
        make_hrefs_relative (bool): If True, the asset HREFs are made
            relative; if False, the asset HREFs are absolute.
        copy (bool): If False the asset files are moved; if True, the asset
            files are copied.
        ignore_conflicts (bool): If the asset destination file already
            exists, an error is raised unless ignore_conflicts is True.

    Returns:
        [Catalog or Collection]: The same catalog that was passed in. This
            operation mutates the catalog.
    """
    for owned_item in catalog.get_all_items():
        move_assets(owned_item,
                    asset_subdirectory,
                    make_hrefs_relative,
                    copy,
                    ignore_conflicts)

    return catalog
def lambda_handler(event, context):
    """Add collections to the Cirrus root catalog and publish them to SNS.

    Two kinds of payload are handled (one event may carry both):

    * a STAC Collection, recognised by an 'extent' key: it is added to the
      root catalog and the event is published unchanged;
    * a 'catalog_url': the catalog at that URL is read and each of its
      direct children that is a Collection is added to the root catalog
      and published.

    The root catalog is saved back to ``ROOT_URL`` in every case.

    Args:
        event (dict): Lambda event payload.
        context: Lambda context object (not used).
    """
    logger.debug('Event: %s', json.dumps(event))

    root_cat = get_root_catalog()

    # check if collection and if so, add to Cirrus
    if 'extent' in event:
        # Serialize before parsing so the published message is exactly the
        # payload that was received.
        message = json.dumps(event)

        # add to static catalog -- add_child needs a STAC object, not the
        # raw dict, so parse the payload into a Collection first.
        root_cat.add_child(Collection.from_dict(event))

        # send to Cirrus Publish SNS
        response = snsclient.publish(TopicArn=PUBLISH_TOPIC, Message=message)
        logger.debug(f"SNS Publish response: {json.dumps(response)}")

    # check if URL to catalog
    if 'catalog_url' in event:
        cat = Catalog.from_file(event['catalog_url'])

        for child in cat.get_children():
            if not isinstance(child, Collection):
                continue
            child.remove_links('child')
            # NOTE(review): the link targets the child itself and add_link
            # receives a second positional argument -- confirm both against
            # the installed PySTAC version.
            link = Link('copied_from', child)
            child.add_link(link, child.get_self_href())
            root_cat.add_child(child)

            child_json = json.dumps(child.to_dict())
            logger.debug(f"Publishing {child.id}: {child_json}")
            response = snsclient.publish(TopicArn=PUBLISH_TOPIC,
                                         Message=child_json)
            logger.debug(f"SNS Publish response: {json.dumps(response)}")

    root_cat.normalize_and_save(ROOT_URL, CatalogType.ABSOLUTE_PUBLISHED)
def mirror_collections(url, path='', **kwargs):
    """Read a catalog and strip API links from its providers and collections.

    Children of the catalog ("providers") that have no child links are
    removed from it. For the rest, the 'search', 'collections' and 'next'
    links are dropped, and for each of their collections the item count is
    obtained with ``hits`` before the 'child', 'next' and 'items' links are
    dropped.

    Args:
        url: Location of the catalog to read.
        path: Not used by this function.
        **kwargs: Not used by this function.

    Returns:
        The modified catalog, with its catalog type set to RELATIVE_PUBLISHED.
    """
    API_RELS = ['search', 'collections', 'next']
    cat = Catalog.from_file(url)

    empty_children = []
    total_items = 0
    for provider in cat.get_children():
        if len(provider.get_child_links()) == 0:
            empty_children.append(provider)
            continue

        # remove API specific links
        for rel in API_RELS:
            provider.remove_links(rel)

        # remove links from the collections
        for collection in provider.get_children():
            items_href = collection.get_single_link('items').get_href()
            found = hits(items_href, {'limit': 0})
            for rel in ['child', 'next', 'items']:
                collection.remove_links(rel)
            logger.info(
                f"{provider.id} - {collection.id}: {found} Items found")
            total_items += found

    for childless in empty_children:
        cat.remove_child(childless.id)

    logger.info(f"{total_items} total Items found")
    cat.catalog_type = CatalogType.RELATIVE_PUBLISHED
    return cat
def parse_stac(stac_uri: str) -> List[dict]:
    """Parse a STAC catalog JSON file to extract label URIs, images URIs,
    and AOIs.

    Note:
        This has been tested to be compatible with STAC version 1.0.0 but
        not any other versions.

    Args:
        stac_uri (str): Path to the STAC catalog JSON file.

    Returns:
        List[dict]: A list of dicts with keys: "label_uri", "image_uris",
            "label_bbox", "image_bbox", "bboxes_intersect", and
            "aoi_geometry". Each dict corresponds to one label item and its
            associated image assets in the STAC catalog.

    Raises:
        ValueError: If the catalog contains no label items.
    """
    setup_stac_io()
    cat = Catalog.from_file(stac_uri)
    version: str = cat.to_dict()['stac_version']

    if not version.startswith('1.0'):
        log.warning(f'Parsing is not guaranteed to work correctly for '
                    f'STAC version != 1.0.*. Found version: {version}.')

    cat.make_all_asset_hrefs_absolute()

    label_items = [item for item in cat.get_all_items() if is_label_item(item)]
    # Fail before looking up linked imagery: there is nothing to link from.
    if not label_items:
        raise ValueError('Unable to find any label items in STAC catalog.')
    image_items = [get_linked_image_item(item) for item in label_items]

    out = []
    for label_item, image_item in zip(label_items, image_items):
        # The first asset of the label item is taken as the label file.
        label_uri: str = list(label_item.assets.values())[0].href
        label_bbox = box(*label_item.bbox)
        aoi_geometry: Optional[dict] = label_item.geometry

        if image_item is not None:
            # Skip assets without a media type: they cannot be identified as
            # images, and testing membership on None would raise TypeError.
            image_uris = [
                asset.href for asset in image_item.get_assets().values()
                if asset.media_type is not None
                and 'image' in asset.media_type
            ]
            image_bbox = box(*image_item.bbox)
            bboxes_intersect = label_bbox.intersects(image_bbox)
        else:
            image_uris = []
            image_bbox = None
            bboxes_intersect = False

        out.append({
            'label_uri': label_uri,
            'image_uris': image_uris,
            'label_bbox': label_bbox,
            'image_bbox': image_bbox,
            'bboxes_intersect': bboxes_intersect,
            'aoi_geometry': aoi_geometry
        })
    return out
async def cli():
    """Command-line entry point: dispatch to the 'create' or 'update' command.

    'create' mirrors the remote catalog down to its collections and saves it;
    'update' mirrors the items of one collection of an existing catalog.
    Any other command does nothing.
    """
    args = parse_args(sys.argv[1:])
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    # quiet loggers
    noisy_loggers = [
        'httpx', 'urllib3', 'botocore', 'boto3', 'aioboto3', 'aiobotocore'
    ]
    for logger_name in noisy_loggers:
        logging.getLogger(logger_name).propagate = False

    cmd = args.pop('command')

    if cmd == 'create':
        # create initial catalog through to collections
        cat = mirror_collections(args['url'], args['path'])
        cat.normalize_hrefs(args['path'])
        await cat.save()
    elif cmd == 'update':
        cat = Catalog.from_file(args['cat'])
        provider = cat.get_child(args['provider'])
        collection = provider.get_child(args['collection'])
        url = f"{args['url']}/{args['provider']}/collections/{args['collection']}/items"

        params = {'limit': args['limit']}
        if args['datetime'] is not None:
            params['datetime'] = args['datetime']

        await mirror_items(collection,
                           url,
                           params,
                           max_sync_queries=args['max_sync_queries'],
                           item_template=args['item_template'])
def __init__(self, old_catalog_path: str = None, new_path: str = None) -> None:
    """Start from a fresh root catalog or load an existing one.

    Args:
        old_catalog_path: Path to an existing STAC catalog. When None, a new
            empty root catalog is created instead.
        new_path: Value stored in ``self.path``. When None it falls back to
            "" for a new catalog, or to the loaded catalog's self href.

    Raises:
        TypeError: If the object read from ``old_catalog_path`` is not
            exactly of type ``Catalog``.
    """
    if old_catalog_path is None:
        # No existing catalog: build an empty root.
        self.path: str = "" if new_path is None else new_path
        self.root_catalog: Catalog = Catalog(
            id="GisSpot-root-catalog",
            title="GisSpot-root-catalog",
            description="Root catalog on GisSpot server")
        return

    old_catalog_path = normalize_stac_path(old_catalog_path)
    print(old_catalog_path)
    stac_obj = read_file(old_catalog_path)

    # Exact type match only: subclasses of Catalog are rejected as well.
    if type(stac_obj) is not Catalog:
        raise TypeError("old_catalog_path must be path to STAC catalog")
    self.root_catalog = stac_obj

    if new_path is not None:
        self.path = new_path
    else:
        self.path = self.root_catalog.get_self_href()
def handler(event, context=None):
    """Crawl a STAC catalog and publish every item to SNS.

    When the context carries ``invoked_function_arn`` and the event asks
    for ``batch``, the work is handed to a batch job and nothing else is
    done. Otherwise the catalog at ``event['url']`` is read and each item is
    published as a one-feature FeatureCollection.

    Args:
        event (dict): Must contain 'process'; may contain 'url' and 'batch'.
        context: Context object, or None when no context is supplied. Only
            its ``invoked_function_arn`` attribute is looked at.

    Raises:
        KeyError: If the event has no 'process' key.
    """
    # The default is None rather than a shared mutable dict; neither has an
    # ``invoked_function_arn`` attribute, so the check behaves the same.
    has_function_arn = hasattr(context, "invoked_function_arn")

    # if this is batch, output to stdout
    if not has_function_arn:
        logger.addHandler(logging.StreamHandler())

    logger.debug('Event: %s', json.dumps(event))

    # parse input
    url = event.get('url')
    batch = event.get('batch', False)
    process = event['process']

    if batch and has_function_arn:
        submit_batch_job(event,
                         context.invoked_function_arn,
                         definition='lambda-as-batch',
                         name='feed-stac-crawl')
        return

    cat = Catalog.from_file(url)

    for item in cat.get_all_items():
        payload = {
            'type': 'FeatureCollection',
            'features': [item.to_dict()],
            'process': process
        }
        SNS_CLIENT.publish(TopicArn=SNS_TOPIC, Message=json.dumps(payload))
def parse_stac(stac_uri):
    """Extract the label URI, label bbox and intersecting GeoTIFF URIs.

    Args:
        stac_uri: Location of the STAC catalog to read.

    Returns:
        A ``(labels_uri, labels_box, geotiff_uris)`` tuple: the HREF of the
        first asset of the label item (the last label item wins if there are
        several), the box built from its bbox, and the first-asset HREFs of
        all non-label items whose bbox intersects the label box.

    Raises:
        ValueError: If no label URI is found, or if no non-label item
            intersects the label bbox.
    """
    setup_stac_s3()
    cat = Catalog.from_file(stac_uri)
    cat.make_all_asset_hrefs_absolute()

    labels_uri = None
    labels_box = None
    imagery_items = []
    for item in cat.get_all_items():
        if isinstance(item, LabelItem):
            labels_uri = list(item.assets.values())[0].href
            labels_box = box(*item.bbox)
        else:
            imagery_items.append(item)

    # Check before using labels_box: without a label item it was never
    # assigned, and the intersection test below used to fail with a
    # NameError instead of this ValueError.
    if not labels_uri:
        raise ValueError('Unable to read labels URI from STAC.')

    # only use geotiffs that intersect with bbox of labels
    geotiff_uris = []
    for item in imagery_items:
        geotiff_uri = list(item.assets.values())[0].href
        geotiff_box = box(*item.bbox)
        if labels_box.intersects(geotiff_box):
            # Turn the percent-encoded pipe back into a literal '|'.
            geotiff_uris.append(geotiff_uri.replace('%7C', '|'))

    if not geotiff_uris:
        raise ValueError('Unable to read GeoTIFF URIs from STAC.')

    return labels_uri, labels_box, geotiff_uris
def test_map_assets_single(self):
    """map_assets with a mapper that retitles exactly one asset key.

    After a save/reload round trip only the asset with the chosen key may
    carry the new title, and that asset must be present.
    """
    changed_asset = 'd43bead8-e3f8-4c51-95d6-e24e750a402b'

    def asset_mapper(key, asset):
        if key == changed_asset:
            asset.title = 'NEW TITLE'
        return asset

    with TemporaryDirectory() as tmp_dir:
        cat_dir = os.path.join(tmp_dir, 'cat')

        mapped = TestCases.test_case_2().map_assets(asset_mapper)
        mapped.normalize_hrefs(cat_dir)
        mapped.save(catalog_type=CatalogType.ABSOLUTE_PUBLISHED)

        result_cat = Catalog.from_file(os.path.join(cat_dir, 'catalog.json'))

        found = False
        for item in result_cat.get_all_items():
            for key, asset in item.assets.items():
                if key != changed_asset:
                    self.assertNotEqual(asset.title, 'NEW TITLE')
                    continue
                found = True
                self.assertEqual(asset.title, 'NEW TITLE')

        self.assertTrue(found)
def test_read_remote(self):
    """Read the label-extension example catalog over HTTP and count items.

    The URL is pinned to the stac-spec tag matching STAC_VERSION.
    """
    url_template = ('https://raw.githubusercontent.com/radiantearth/stac-spec/'
                    'v{}'
                    '/extensions/label/examples/multidataset/catalog.json')
    catalog_url = url_template.format(STAC_VERSION)

    cat = Catalog.from_file(catalog_url)

    zanzibar = cat.get_child('zanzibar-collection')
    zanzibar_items = list(zanzibar.get_items())
    self.assertEqual(len(zanzibar_items), 2)
def test_getattribute_overload(self):
    """Attribute lookup on ``catalog.ext`` only works for enabled extensions.

    Looking up 'foo' and 'eo' must raise ExtensionError; once
    'single-file-stac' is enabled, looking it up must return the
    single-file-stac catalog extension object.
    """
    catalog = Catalog(id='test', description='test')
    self.assertEqual(ExtensionIndex.__name__, 'ExtensionIndex')
    self.assertRaises(ExtensionError, catalog.ext.__getattr__, 'foo')
    self.assertRaises(ExtensionError, catalog.ext.__getattr__, 'eo')

    catalog.ext.enable('single-file-stac')
    # assertTrue(obj, cls) treats the class as the failure message and only
    # checks truthiness; assertIsInstance performs the intended type check.
    self.assertIsInstance(
        catalog.ext.__getattr__('single-file-stac'),
        pystac.extensions.single_file_stac.SingleFileSTACCatalogExt)
def test_read_remote(self):
    """Read the label-extension example catalog over HTTP and count items."""
    # TODO: Move this URL to the main stac-spec repo once the example JSON is fixed.
    base_url = 'https://raw.githubusercontent.com/lossyrob/stac-spec/0.9.0/pystac-upgrade-fixes'
    catalog_url = base_url + '/extensions/label/examples/multidataset/catalog.json'

    cat = Catalog.from_file(catalog_url)

    zanzibar = cat.get_child('zanzibar-collection')
    zanzibar_items = list(zanzibar.get_items())
    self.assertEqual(len(zanzibar_items), 2)
def get_scenes(json_file: str,
               class_config: ClassConfig,
               class_id_filter_dict: dict,
               catalog_dir: str,
               imagery_dir: str,
               train_crops: List[CropOffsets] = None,
               val_crops: List[CropOffsets] = None,
               N: int = None) -> Tuple[List[SceneConfig], List[SceneConfig]]:
    """Build training and validation scene configs from a JSON listing.

    ``json_file`` holds a list of objects, each with a 'catalog' entry and an
    optional 'imagery' entry. For every object the label/imagery HREFs are
    read from the catalog with ``hrefs_from_catalog``; one scene is then made
    per (label, imagery) pair and per crop.

    Args:
        json_file: Path of the JSON listing.
        class_config: Not used by this function.
        class_id_filter_dict: Forwarded to ``hrefs_to_sceneconfig``.
        catalog_dir: Prefix joined (with '/') in front of each catalog name.
        imagery_dir: Prefix for imagery. Joined in front of an explicit
            'imagery' entry, or substituted for the leading './' of HREFs
            taken from the catalog.
        train_crops: Crops for training scenes; defaults to no crops.
        val_crops: Crops for validation scenes; defaults to no crops.
        N: Forwarded to ``hrefs_from_catalog`` to select a single pair.

    Returns:
        ``(train_scenes, val_scenes)``.
    """
    # None replaces the former mutable list defaults; both mean "no crops".
    train_crops = [] if train_crops is None else train_crops
    val_crops = [] if val_crops is None else val_crops

    train_scenes = []
    val_scenes = []
    with open(json_file, 'r') as f:
        for catalog_imagery in json.load(f):
            catalog = catalog_imagery.get('catalog')
            catalog = catalog.strip()
            catalog = f'{catalog_dir}/{catalog}'
            # presumably a GDAL virtual-file-system path -- TODO confirm
            catalog = catalog.replace('s3://', '/vsizip/vsis3/')
            (labelss, imagerys) = hrefs_from_catalog(
                Catalog.from_file(root_of_tarball(catalog)), N)

            imagery_name = catalog_imagery.get('imagery')
            if imagery_name is not None:
                # An explicit imagery file overrides whatever the catalog
                # listed and is shared by every label of this catalog.
                explicit = imagery_name.strip()
                explicit = f'{imagery_dir}/{explicit}'
                if '.zip' in explicit:
                    explicit = explicit.replace('s3://', '/vsizip/vsis3/')
                else:
                    explicit = explicit.replace('s3://', '/vsis3/')
                imagerys = [explicit] * len(labelss)
            else:
                # Derive imagery locations from the catalog's own HREFs.
                if not imagery_dir.endswith('/'):
                    imagery_dir = imagery_dir + '/'
                imagerys = [
                    href.replace('_rgb', '').replace('./', imagery_dir)
                    for href in imagerys
                ]

            # Scene names are keyed on a hash of the rewritten catalog path.
            h = hashlib.sha256(catalog.encode()).hexdigest()
            print('imagery', imagerys)
            print('labels', labelss)

            make_scene = partial(hrefs_to_sceneconfig,
                                 class_id_filter_dict=class_id_filter_dict)
            for j, (labels, imagery) in enumerate(zip(labelss, imagerys)):
                for i, crop in enumerate(train_crops):
                    scene = make_scene(name=f'{h}-train-{i}-{j}',
                                       extent_crop=crop,
                                       imagery=imagery,
                                       labels=labels)
                    train_scenes.append(scene)
                for i, crop in enumerate(val_crops):
                    scene = make_scene(name=f'{h}-val-{i}-{j}',
                                       extent_crop=crop,
                                       imagery=imagery,
                                       labels=labels)
                    val_scenes.append(scene)
    return train_scenes, val_scenes
def test_when_a_service_completes_it_writes_a_output_catalog_to_the_output_dir(
        self):
    """After a CLI invocation a readable catalog.json exists in the metadata dir."""
    cli_args = ('--harmony-action', 'invoke',
                '--harmony-input', '{"test": "input"}',
                '--harmony-sources', 'example/source/catalog.json',
                '--harmony-metadata-dir', self.workdir)
    with cli_parser(*cli_args) as parser:
        args = parser.parse_args()
        cli.run_cli(parser, args, MockAdapter, cfg=self.config)

        output_path = os.path.join(self.workdir, 'catalog.json')
        output = Catalog.from_file(output_path)
        # NOTE(review): this asserts on the bound method object, which is
        # always truthy; validate() is never called. Confirm whether an
        # actual validation run is wanted here.
        self.assertTrue(output.validate)
def do_test(
    self, catalog: pystac.Catalog, catalog_type: pystac.CatalogType
) -> None:
    """Save ``catalog`` into a temporary directory and validate what was written.

    The in-memory catalog is validated first; after saving, the link types
    of the root and every catalog, collection and item file are checked.

    Args:
        catalog: Catalog to save and validate.
        catalog_type: Catalog type to save it as.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        catalog.normalize_hrefs(tmp_dir)
        self.validate_catalog(catalog)

        catalog.save(catalog_type=catalog_type)

        self.validate_link_types(catalog.self_href, catalog_type)

        for parent, _, items in catalog.walk():
            stac_object_type = (
                pystac.STACObjectType.COLLECTION
                if isinstance(parent, Collection)
                else pystac.STACObjectType.CATALOG
            )
            self.validate_file(parent.self_href, stac_object_type)

            for member in items:
                self.validate_file(member.self_href,
                                   pystac.STACObjectType.ITEM)
def test_reading_iterating_and_writing_works_as_expected(self):
    """ Test case to cover issue #88 """
    cat = Catalog.from_file('tests/data-files/catalogs/test-case-6/catalog.json')

    # Walk every item once before saving. Per issue #88 this first pass was
    # what made later iterations fail.
    for _ in cat.get_all_items():
        pass

    with TemporaryDirectory() as tmp_dir:
        new_stac_uri = os.path.join(tmp_dir, 'test-case-6')
        cat.normalize_hrefs(new_stac_uri)
        cat.save(catalog_type=CatalogType.SELF_CONTAINED)

        # Re-open the saved copy and walk its items again; this second pass
        # is the one that would fail in #88.
        saved_catalog_path = os.path.join(new_stac_uri, 'catalog.json')
        cat2 = Catalog.from_file(saved_catalog_path)
        for _ in cat2.get_all_items():
            pass
def main(ctx, input_path):
    """Calibrate the item found under ``input_path`` and save it in a new catalog.

    The result catalog is written to the current working directory, which is
    switched to ``$TMPDIR`` first when that variable is set. The process then
    exits with status 0.

    Args:
        ctx: Context passed to ``dump``.
        input_path: Directory containing the input ``catalog.json``.
    """
    # dump the CWL and params (if requested)
    dump(ctx)

    tmp_dir = os.environ.get('TMPDIR')
    if tmp_dir is not None:
        os.chdir(tmp_dir)

    logging.info(os.path.join(input_path, 'catalog.json'))
    item = get_item(os.path.join(input_path, 'catalog.json'))

    # Not used further in this function.
    output_dir = f'{item.id}'

    item_out = Calibrator().calibrate(item)

    logging.info('STAC')
    cat = Catalog(id='catalog', description="Calibrated sar product")
    cat.add_items([item_out])
    cat.normalize_and_save(root_href='./',
                           catalog_type=CatalogType.SELF_CONTAINED)

    logging.info('Done!')
    sys.exit(0)