def populate_index(dataset_loader, module_dea_index):
    """
    Index populated with example datasets. Assumes our tests won't modify the data!

    It's module-scoped as it's expensive to populate.
    """
    path, s2_product_doc = list(
        read_documents(TEST_DATA_DIR / "esa_s2_l2a.product.yaml"))[0]
    dataset_count = 0
    product_ = module_dea_index.products.from_doc(s2_product_doc)
    module_dea_index.products.add(product_)
    create_dataset = Doc2Dataset(module_dea_index)
    for _, s2_dataset_doc in read_documents(TEST_DATA_DIR / "s2_l2a-sample.yaml"):
        try:
            dataset, err = create_dataset(s2_dataset_doc, "file://example.com/test_dataset/")
            assert dataset is not None, err
            created = module_dea_index.datasets.add(dataset)
            assert created.type.name == "s2_l2a"
            dataset_count += 1
        except AttributeError as ae:
            assert dataset_count == 5
            print(ae)
    assert dataset_count == 5
    return module_dea_index

def test_read_documents(sample_document_files):
    for filename, ndocs in sample_document_files:
        all_docs = list(read_documents(filename))
        assert len(all_docs) == ndocs

        for path, doc in all_docs:
            assert isinstance(doc, dict)
            assert isinstance(path, pathlib.Path)

        assert set(str(f) for f, _ in all_docs) == set([filename])

    for filename, ndocs in sample_document_files:
        all_docs = list(read_documents(filename, uri=True))
        assert len(all_docs) == ndocs

        for uri, doc in all_docs:
            assert isinstance(doc, dict)
            assert isinstance(uri, str)

        p = pathlib.Path(filename)
        if ndocs > 1:
            expect_uris = [p.as_uri() + '#part={}'.format(i) for i in range(ndocs)]
        else:
            expect_uris = [p.as_uri()]

        assert [f for f, _ in all_docs] == expect_uris

def _read_documents_impl(sample_document_files):
    # Test case for returning native paths to documents; may be pathlib.Path or URI
    for filepath, num_docs in sample_document_files:
        all_docs = list(read_documents(filepath))
        assert len(all_docs) == num_docs

        for path, doc in all_docs:
            assert isinstance(doc, dict)

        assert set(str(f) for f, _ in all_docs) == set([filepath])

    # Test case for returning URIs pointing to documents
    for filepath, num_docs in sample_document_files:
        all_docs = list(read_documents(filepath, uri=True))
        assert len(all_docs) == num_docs

        for uri, doc in all_docs:
            assert isinstance(doc, dict)
            assert isinstance(uri, str)

        url = as_url(filepath)
        if num_docs > 1:
            expect_uris = [as_url(url) + '#part={}'.format(i) for i in range(num_docs)]
        else:
            expect_uris = [as_url(url)]

        assert [f for f, _ in all_docs] == expect_uris

def init_dea(index: Index, with_permissions: bool, log_header=print_header, log=print_):
    """
    Create or update a DEA configured ODC instance.
    """
    log_header(f"ODC init of {index.url}")
    was_created = index.init_db(with_default_types=False,
                                with_permissions=with_permissions)
    if was_created:
        log('Created.')
    else:
        log('Updated.')

    log('Checking indexes/views.')
    index.metadata_types.check_field_indexes(
        allow_table_lock=True,
        rebuild_indexes=False,
        rebuild_views=True,
    )

    log_header('Checking DEA metadata types')
    # Add DEA metadata types, products.
    for _, md_type_def in read_documents(DEA_MD_TYPES):
        md = index.metadata_types.add(index.metadata_types.from_doc(md_type_def))
        log(f"{md.name}")

    log_header('Checking DEA products')
    for _, product_def in read_documents(*DEA_PRODUCTS_DIR.glob('*.yaml')):
        product = index.products.add_document(product_def)
        log(f"{product.name}")

    log_header('Checking DEA ingested definitions')
    for path in DEA_INGESTION_DIR.glob('*.yaml'):
        ingest_config = ingest.load_config_from_file(path)

        driver_name = ingest_config['storage']['driver']
        driver = storage_writer_by_name(driver_name)
        if driver is None:
            raise ValueError("No driver found for {}".format(driver_name))

        source_type, output_type = ingest.ensure_output_type(
            index, ingest_config, driver.format, allow_product_changes=True)
        log(f"{output_type.name:<20}\t\t← {source_type.name}")

def index_cmd(index, match_rules, dtype, auto_match, dry_run, datasets):
    if not (match_rules or dtype or auto_match):
        _LOG.error('Must specify one of [--match-rules, --type, --auto-match]')
        return

    if match_rules:
        rules = load_rules_from_file(match_rules, index)
    else:
        assert dtype or auto_match
        rules = load_rules_from_types(index, dtype)

    if rules is None:
        return

    for dataset_path in datasets:
        metadata_path = get_metadata_path(Path(dataset_path))
        if not metadata_path or not metadata_path.exists():
            raise ValueError('No supported metadata docs found for dataset {}'.format(dataset_path))

        for metadata_path, metadata_doc in read_documents(metadata_path):
            uri = metadata_path.absolute().as_uri()

            try:
                dataset = match_dataset(metadata_doc, uri, rules)
            except RuntimeError as e:
                _LOG.error('Unable to create Dataset for %s: %s', uri, e)
                continue

            if not check_dataset_consistent(dataset):
                _LOG.error("Dataset measurements don't match its type specification %s", dataset.id)
                continue

            _LOG.info('Matched %s', dataset)
            if not dry_run:
                index.datasets.add(dataset)

def load_datasets(datasets, rules):
    for dataset_path in datasets:
        metadata_path = get_metadata_path(Path(dataset_path))
        if not metadata_path or not metadata_path.exists():
            _LOG.error('No supported metadata docs found for dataset %s', dataset_path)
            continue

        try:
            for metadata_path, metadata_doc in read_documents(metadata_path):
                uri = metadata_path.absolute().as_uri()

                try:
                    dataset = create_dataset(metadata_doc, uri, rules)
                except BadMatch as e:
                    _LOG.error('Unable to create Dataset for %s: %s', uri, e)
                    continue

                is_consistent, reason = check_dataset_consistent(dataset)
                if not is_consistent:
                    _LOG.error("Dataset %s inconsistency: %s", dataset.id, reason)
                    continue

                yield dataset
        except InvalidDocException:
            _LOG.error("Failed reading documents from %s", metadata_path)
            continue

def load_dataset_definition(path):
    if not isinstance(path, pathlib.Path):
        path = pathlib.Path(path)

    fname = get_metadata_path(path)
    for _, doc in read_documents(fname):
        return SimpleDocNav(doc)

def update_metadata_types(index: Index, allow_unsafe: bool, allow_exclusive_lock: bool, dry_run: bool,
                          files: List):
    """
    Update existing metadata types.

    An error will be thrown if a change is potentially unsafe.

    (An unsafe change is anything that may potentially make the metadata type
    incompatible with existing types of the same name)
    """
    for descriptor_path, parsed_doc in read_documents(*files):
        try:
            type_ = index.metadata_types.from_doc(parsed_doc)
        except InvalidDocException as e:
            _LOG.exception(e)
            _LOG.error('Invalid metadata type definition: %s', descriptor_path)
            continue

        if not dry_run:
            index.metadata_types.update(
                type_,
                allow_unsafe_updates=allow_unsafe,
                allow_table_lock=allow_exclusive_lock,
            )
            echo('Updated "%s"' % type_.name)
        else:
            can_update, safe_changes, unsafe_changes = index.metadata_types.can_update(
                type_, allow_unsafe_updates=allow_unsafe)
            if can_update:
                echo('Can update "%s": %s unsafe changes, %s safe changes' % (
                    type_.name, len(unsafe_changes), len(safe_changes)))
            else:
                echo('Cannot update "%s": %s unsafe changes, %s safe changes' % (
                    type_.name, len(unsafe_changes), len(safe_changes)))

def load_config_from_file(path):
    config_file = Path(path)
    _, config = next(read_documents(config_file))
    IngestorConfig.validate(config)
    config['filename'] = str(normalise_path(config_file))

    return config

def doc_path_stream(files, on_error, uri=True):
    for fname in files:
        try:
            for p, doc in read_documents(fname, uri=uri):
                yield p, SimpleDocNav(doc)
        except InvalidDocException as e:
            on_error(fname, e)

def add_cop_dem_product(dc: Datacube, product):
    if product in PRODUCTS.keys():
        product_uri = PRODUCTS[product]
    else:
        raise ValueError(f"Unknown product {product}")

    for _, doc in read_documents(product_uri):
        dc.index.products.add_document(doc)

    print(f"Product definition added for {product}")

def _path_dataset_ids(path: Path) -> Iterable[uuid.UUID]:
    for _, metadata_doc in read_documents(path):
        if metadata_doc is None:
            raise InvalidDocException("Empty document from path {}".format(path))

        if 'id' not in metadata_doc:
            raise InvalidDocException("No id in path metadata: {}".format(path))

        yield uuid.UUID(metadata_doc['id'])

def main(index, stats_config_file, qsub, runner, save_tasks, load_tasks,
         tile_index, tile_index_file, output_location, year, task_slice, batch):
    if qsub is not None and batch is not None:
        for i in range(batch):
            child = qsub.clone()
            child.reset_internal_args()
            child.add_internal_args('--task-slice', '{}::{}'.format(i, batch))
            click.echo(repr(child))
            exit_code, _ = child(auto=True, auto_clean=[('--batch', 1)])
            if exit_code != 0:
                return exit_code
        return 0
    elif qsub is not None:
        # TODO: verify config before calling qsub submit
        click.echo(repr(qsub))
        exit_code, _ = qsub(auto=True)
        return exit_code

    _log_setup()

    timer = MultiTimer().start('main')

    if len(tile_index) == 0:
        tile_index = None

    _, config = next(read_documents(stats_config_file))
    stats_schema(config)

    app = StatsApp.from_configuration_file(
        config, index, gather_tile_indexes(tile_index, tile_index_file),
        output_location, year)
    app.validate()

    if save_tasks:
        app.save_tasks_to_file(save_tasks)
        failed = 0
    elif load_tasks:
        successful, failed = app.run(runner, task_file=load_tasks, task_slice=task_slice)
    else:
        successful, failed = app.run(runner, task_slice=task_slice)

    timer.pause('main')
    _LOG.info('Stats processing completed in %s seconds.', timer.run_times['main'])

    if failed > 0:
        raise click.ClickException('%s of %s tasks were not completed successfully.' %
                                   (failed, successful + failed))

    return 0

def load_config(index, app_config_file, make_config, make_tasks, *args, **kwargs):
    app_config_path = Path(app_config_file)
    _, config = next(read_documents(app_config_path))
    config['app_config_file'] = app_config_path.name

    config = make_config(index, config, **kwargs)
    tasks = make_tasks(index, config, **kwargs)

    return config, iter(tasks)

def path_doc_stream(files, on_error, uri=True, raw=False):
    maybe_wrap = {True: lambda x: x, False: SimpleDocNav}[raw]

    for fname in files:
        try:
            for p, doc in read_documents(fname, uri=uri):
                yield p, maybe_wrap(doc)
        except InvalidDocException as e:
            on_error(fname, e)

def main(index, app_config, year, executor):
    _, config = next(read_documents(app_config))

    tasks = make_tasks(index, config)
    futures = [executor.submit(do_stats, task, config) for task in tasks]

    for future in executor.as_completed(futures):
        result = executor.result(future)
        print(result)

def read_document(path: Path) -> dict:
    """
    Read and parse exactly one document.
    """
    ds = list(read_documents(path))
    if len(ds) != 1:
        raise NotImplementedError("Expected one document to be in path %s" % path)

    _, doc = ds[0]
    return doc

def read_document(path: Path) -> dict:
    """
    Read and parse exactly one document.
    """
    ds = list(read_documents(path))
    if len(ds) != 1:
        raise ValueError(f"Expected only one document to be in path {path}")

    _, doc = ds[0]
    return doc

def _path_doc_stream(files, on_error, uri=True, raw=False):
    """See :func:`ui_path_doc_stream` for documentation"""
    maybe_wrap = identity if raw else SimpleDocNav

    for fname in files:
        try:
            for p, doc in read_documents(fname, uri=uri):
                yield p, maybe_wrap(doc)
        except InvalidDocException as e:
            on_error(fname, e)

def check_dataset_metadata_in_storage_unit(nco, dataset_dir):
    assert len(nco.variables['dataset']) == 1  # 1 time slice
    stored_metadata = nco.variables['dataset'][0]
    if not isinstance(stored_metadata, str):
        stored_metadata = netCDF4.chartostring(stored_metadata)
        stored_metadata = str(np.char.decode(stored_metadata))
    ds_filename = dataset_dir / 'agdc-metadata.yaml'

    stored = yaml.safe_load(stored_metadata)
    [(_, original)] = read_documents(ds_filename)
    assert len(stored['lineage']['source_datasets']) == 1
    assert next(iter(stored['lineage']['source_datasets'].values())) == original

def ingest_cmd(index, config, dry_run, executor):
    _, config = next(read_documents(Path(config)))
    source_type = index.datasets.types.get_by_name(config['source_type'])
    if not source_type:
        _LOG.error("Source DatasetType %s does not exist", config['source_type'])
    # print (source_type)
    # print ("abcdefghijklmnopqrstuvwxyz")
    output_type = morph_dataset_type(source_type, config)
    # print (output_type)
    _LOG.info('Created DatasetType %s', output_type.name)
    output_type = index.datasets.types.add(output_type)

    datacube = Datacube(index=index)

    grid_spec = output_type.grid_spec
    namemap = get_namemap(config)
    measurements = get_measurements(source_type, config)
    variable_params = get_variable_params(config)
    file_path_template = str(Path(config['location'], config['file_path_template']))

    bbox = BoundingBox(**config['ingestion_bounds'])
    tasks = find_diff(source_type, output_type, bbox, datacube)

    def ingest_work(tile_index, sources):
        geobox = GeoBox.from_grid_spec(grid_spec, tile_index)
        # print ("in ingest.py in ingest_work")
        data = Datacube.product_data(sources, geobox, measurements)

        nudata = data.rename(namemap)

        file_path = file_path_template.format(
            tile_index=tile_index,
            start_time=to_datetime(sources.time.values[0]).strftime('%Y%m%d%H%M%S%f'),
            end_time=to_datetime(sources.time.values[-1]).strftime('%Y%m%d%H%M%S%f'))
        # TODO: algorithm params
        print("Writing product")
        nudatasets = write_product(nudata, sources, output_type,
                                   config['global_attributes'], variable_params,
                                   Path(file_path))
        return nudatasets

    do_work(tasks, ingest_work, index, executor)

    temp = str(Path(config['location']))
    files_path = temp + "/cache"
    if not os.path.isfile(temp + "/archive"):
        os.system("mkdir " + temp + "/archive")
    print("Compressing files")
    compress(files_path)

def add_metadata_types(index, files):
    """
    Add or update metadata types in the index
    """
    for descriptor_path, parsed_doc in read_documents(*(Path(f) for f in files)):
        try:
            type_ = index.metadata_types.from_doc(parsed_doc)
            index.metadata_types.add(type_)
        except InvalidDocException as e:
            _LOG.exception(e)
            _LOG.error('Invalid metadata type definition: %s', descriptor_path)
            continue

def add_dataset_types(index, files):
    """
    Add product types to the index
    """
    for descriptor_path, parsed_doc in read_documents(*(Path(f) for f in files)):
        try:
            type_ = index.datasets.types.from_doc(parsed_doc)
            index.datasets.types.add(type_)
            echo('Added "%s"' % type_.name)
        except InvalidDocException as e:
            _LOG.exception(e)
            _LOG.error('Invalid product definition: %s', descriptor_path)
            continue

def add_metadata_types(index, allow_exclusive_lock, files):
    # type: (Index, bool, list) -> None
    """
    Add or update metadata types in the index
    """
    for descriptor_path, parsed_doc in read_documents(*(Path(f) for f in files)):
        try:
            type_ = index.metadata_types.from_doc(parsed_doc)
            index.metadata_types.add(type_, allow_table_lock=allow_exclusive_lock)
        except InvalidDocException as e:
            _LOG.exception(e)
            _LOG.error('Invalid metadata type definition: %s', descriptor_path)
            continue

def add_products(index, allow_exclusive_lock, files):
    # type: (Index, bool, list) -> None
    """
    Add or update products in the generic index.
    """
    for descriptor_path, parsed_doc in read_documents(*files):
        try:
            type_ = index.products.from_doc(parsed_doc)
            index.products.add(type_, allow_table_lock=allow_exclusive_lock)
            echo('Added "%s"' % type_.name)
        except InvalidDocException as e:
            _LOG.exception(e)
            _LOG.error('Invalid product definition: %s', descriptor_path)
            sys.exit(1)

def add_dataset_types(index, files):
    """
    Add product types to the index
    """
    for descriptor_path, parsed_doc in read_documents(*(Path(f) for f in files)):
        try:
            type_ = index.products.from_doc(parsed_doc)
            index.products.add(type_)
            echo('Added "%s"' % type_.name)
        except InvalidDocException as e:
            _LOG.exception(e)
            _LOG.error('Invalid product definition: %s', descriptor_path)
            continue

def update_dataset_types(index, allow_unsafe, allow_exclusive_lock, dry_run, files):
    # type: (Index, bool, bool, bool, list) -> None
    """
    Update existing products.

    An error will be thrown if a change is potentially unsafe.

    (An unsafe change is anything that may potentially make the product
    incompatible with existing datasets of that type)
    """
    failures = 0
    for descriptor_path, parsed_doc in read_documents(*(Path(f) for f in files)):
        try:
            type_ = index.products.from_doc(parsed_doc)
        except InvalidDocException as e:
            _LOG.exception(e)
            _LOG.error('Invalid product definition: %s', descriptor_path)
            failures += 1
            continue

        if not dry_run:
            try:
                index.products.update(
                    type_,
                    allow_unsafe_updates=allow_unsafe,
                    allow_table_lock=allow_exclusive_lock,
                )
                echo('Updated "%s"' % type_.name)
            except ValueError as e:
                echo('Failed to update "%s": %s' % (type_.name, e))
                failures += 1
        else:
            can_update, safe_changes, unsafe_changes = index.products.can_update(
                type_, allow_unsafe_updates=allow_unsafe)

            for offset, old_val, new_val in safe_changes:
                echo('Safe change in %r %s from %r to %r' % (
                    type_.name, _readable_offset(offset), old_val, new_val))

            for offset, old_val, new_val in unsafe_changes:
                echo('Unsafe change in %r %s from %r to %r' % (
                    type_.name, _readable_offset(offset), old_val, new_val))

            if can_update:
                echo('Can update "%s": %s unsafe changes, %s safe changes' % (
                    type_.name, len(unsafe_changes), len(safe_changes)))
            else:
                echo('Cannot update "%s": %s unsafe changes, %s safe changes' % (
                    type_.name, len(unsafe_changes), len(safe_changes)))

    sys.exit(failures)

def load_rules_from_file(filename, index):
    rules = next(read_documents(Path(filename)))[1]
    # TODO: verify schema

    for rule in rules:
        type_ = index.products.get_by_name(rule['type'])
        if not type_:
            _LOG.error('DatasetType %s does not exist', rule['type'])
            return
        if not changes.contains(type_.metadata_doc, rule['metadata']):
            _LOG.error("DatasetType %s can't be matched by its own rule", rule['type'])
            return
        rule['type'] = type_

    return rules

def load_rules_from_file(filename, index):
    rules = next(read_documents(Path(filename)))[1]
    # TODO: verify schema

    for rule in rules:
        type_ = index.products.get_by_name(rule['type'])
        if not type_:
            _LOG.error('DatasetType %s does not exist', rule['type'])
            return
        if not contains(type_.metadata_doc, rule['metadata']):
            _LOG.error("DatasetType %s can't be matched by its own rule", rule['type'])
            return
        rule['type'] = type_

    return rules

def _test_read_docs_impl(sample_documents: Iterable[Tuple[str, int]]):
    # Test case for returning URIs pointing to documents
    for doc_url, num_docs in sample_documents:
        all_docs = list(read_documents(doc_url, uri=True))
        assert len(all_docs) == num_docs

        for uri, doc in all_docs:
            assert isinstance(doc, dict)
            assert isinstance(uri, str)

        url = as_url(doc_url)
        if num_docs > 1:
            expect_uris = [as_url(url) + '#part={}'.format(i) for i in range(num_docs)]
        else:
            expect_uris = [as_url(url)]

        assert [f for f, _ in all_docs] == expect_uris

def check_dataset_metadata_in_storage_unit(nco, dataset_dirs):
    """Check one of the NetCDF files' metadata against the original metadata."""
    assert len(nco.variables['dataset']) == 1  # 1 time slice
    stored_metadata = netcdf_extract_string(nco.variables['dataset'][0])
    stored = yaml.safe_load(stored_metadata)

    assert 'lineage' in stored
    assert 'source_datasets' in stored['lineage']
    assert '0' in stored['lineage']['source_datasets']
    assert 'id' in stored['lineage']['source_datasets']['0']
    source_uuid = UUID(stored['lineage']['source_datasets']['0']['id'])
    assert source_uuid in dataset_dirs
    ds_filename = dataset_dirs[source_uuid] / 'agdc-metadata.yaml'
    [(_, original)] = read_documents(ds_filename)
    assert len(stored['lineage']['source_datasets']) == 1
    assert next(iter(stored['lineage']['source_datasets'].values())) == original

def _populate_from_dump(session_dea_index, expected_type: str, dump_path: Path):
    ls8_nbar_scene = session_dea_index.products.get_by_name(expected_type)
    dataset_count = 0

    create_dataset = Doc2Dataset(session_dea_index)
    for _, doc in read_documents(dump_path):
        label = doc["ga_label"] if ("ga_label" in doc) else doc["id"]
        dataset, err = create_dataset(doc, f"file://example.com/test_dataset/{label}")
        assert dataset is not None, err
        created = session_dea_index.datasets.add(dataset)

        assert created.type.name == ls8_nbar_scene.name
        dataset_count += 1

    print(f"Populated {dataset_count} of {expected_type}")
    return dataset_count

def load_config_from_file(index, config):
    config_name = Path(config).name
    _, config = next(read_documents(Path(config)))
    config['filename'] = config_name

    return config

def decorate(cls):
    cls.schema = next(iter(read_documents(SCHEMA_PATH / schema)))[1]
    cls.validate = classmethod(validate)
    return cls

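All of the examples above share one pattern: read_documents takes one or more paths (or URLs) to YAML/JSON files and yields (path, document) pairs, with multi-document YAML files yielding one pair per document and, when uri=True, a '#part=N' suffix on each URI. A minimal sketch of that pattern, assuming a local single-document file named example.product.yaml and the usual datacube.utils import location:

from pathlib import Path

from datacube.utils import read_documents

# read_documents yields (path_or_uri, parsed_doc) pairs for each document
# found in the given files. With uri=True the key is a URI string; by
# default it is the path object that was passed in.
for path, doc in read_documents(Path("example.product.yaml")):
    print(path, type(doc))  # doc is a plain dict parsed from the YAML/JSON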