Example 1
def load_config_from_file(path):
    config_file = Path(path)
    _, config = next(read_documents(config_file))
    IngestorConfig.validate(config)
    config['filename'] = str(normalise_path(config_file))

    return config
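A minimal way to exercise Example 1 is sketched below. The import locations and the config filename are assumptions for illustration; the example itself does not show where read_documents, IngestorConfig, or normalise_path come from.

# Assumed imports for the helpers the loader uses; the example does not show them,
# and IngestorConfig (the schema validator) is also left undefined here.
from pathlib import Path
from datacube.utils import read_documents            # assumed location of the document reader
from datacube.utils.uris import normalise_path       # assumed location of the path normaliser

# 'ingest_config.yaml' is a hypothetical ingestion definition on disk.
config = load_config_from_file('ingest_config.yaml')
print(config['filename'])   # the loader stores the config file's absolute, normalised path back into the document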
Example 2
def resolve_location(path: Location) -> str:
    """
    Make sure a dataset location is a URL, suitable to be
    the dataset_location in datacube indexing.

    Users may specify a pathlib.Path(), and we'll convert it as needed.
    """
    if isinstance(path, str):
        if not dc_uris.is_url(path) and not dc_uris.is_vsipath(path):
            raise ValueError(
                "A string location is expected to be a URL or VSI path. "
                "Perhaps you want to give it as a local pathlib.Path()?")
        return path

    path = dc_uris.normalise_path(path)
    if ".tar" in path.suffixes:
        return f"tar:{path}!/"
    elif ".zip" in path.suffixes:
        return f"zip:{path}!/"
    else:
        uri = path.as_uri()
        # Base paths specified as directories must end in a slash,
        # so they will be url joined as subfolders. (pathlib strips them)
        if path.is_dir():
            return f"{uri}/"
        return uri
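A quick illustration of what Example 2 returns for different inputs, assuming dc_uris is datacube.utils.uris and Location covers both str and pathlib.Path; the paths themselves are hypothetical.

from pathlib import Path

resolve_location("https://example.com/scenes/metadata.yaml")
# -> returned unchanged: it is already a URL

resolve_location(Path("/data/l1c/S2A_package.zip"))
# -> "zip:/data/l1c/S2A_package.zip!/" (contents are addressed inside the archive)

resolve_location(Path("/data/l1c"))
# -> "file:///data/l1c/" if the directory exists; the trailing slash keeps URL joins working

resolve_location("relative/local/path")
# -> raises ValueError: plain strings must be URLs or VSI paths; pass a pathlib.Path instead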
Example 3
def test_normalise_path():
    cwd = Path('.').resolve()
    assert normalise_path('.').resolve() == cwd

    p = Path('/a/b/c/d.txt')
    assert normalise_path(p) == Path(p)
    assert normalise_path(str(p)) == Path(p)

    base = Path('/a/b/')
    p = Path('c/d.txt')
    assert normalise_path(p, base) == (base / p)
    assert normalise_path(str(p), str(base)) == (base / p)
    assert normalise_path(p) == (cwd / p)

    with pytest.raises(ValueError):
        normalise_path(p, 'not/absolute/path')
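Example 3 pins down the contract the other examples rely on: relative paths are resolved against base (or the current working directory when no base is given), absolute paths pass through, and a non-absolute base is rejected. A rough restatement of those semantics, purely for illustration and not the library's implementation, could be:

from pathlib import Path

def normalise_path_sketch(p, base=None):
    """Simplified restatement of the behaviour the test above exercises (illustration only)."""
    p = Path(p)
    base = Path(base) if base is not None else Path.cwd()
    if not base.is_absolute():
        raise ValueError("base must be an absolute path")
    # Absolute inputs are kept as-is; relative inputs are joined onto the base.
    return p if p.is_absolute() else base / p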
Example 4
    def mk_uri(self, file_path, storage_config):
        """
        Constructs a URI from the file_path and storage config.

        A typical implementation should return f'{scheme}://{file_path}'

        Example:
            file_path = '/path/to/my_file.nc'
            storage_config = {'driver': 'NetCDF CF'}

            mk_uri(file_path, storage_config) should return 'file:///path/to/my_file.nc'

        :param Path file_path: The file path of the file to be converted into a URI.
        :param dict storage_config: The dict holding the storage config found in the ingest definition.
        :return: file_path as a URI that the Driver understands.
        :rtype: str
        """
        return normalise_path(file_path).as_uri()
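Calling the method as its docstring describes, with driver standing in for an instance of the class that defines mk_uri (a hypothetical name used only here):

from pathlib import Path

uri = driver.mk_uri(Path('/path/to/my_file.nc'), {'driver': 'NetCDF CF'})
assert uri == 'file:///path/to/my_file.nc'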
Example 5
def mk_uri(file_path):
    if driver.uri_scheme == "file":
        return normalise_path(file_path).as_uri()
    return '{}://{}'.format(driver.uri_scheme, file_path)
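Example 5 closes over a driver object that is not shown in the snippet. A stand-in with a uri_scheme attribute (an assumption about its shape) is enough to see what each branch produces:

from types import SimpleNamespace

driver = SimpleNamespace(uri_scheme="s3")   # stand-in for the real driver (illustration only)
mk_uri("bucket/prefix/my_file.nc")          # -> "s3://bucket/prefix/my_file.nc"

# With uri_scheme == "file", the path is instead normalised and returned
# as a file:// URI via normalise_path(...).as_uri().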
Example 6
def main(
    local_config: LocalConfig,
    output_base: Optional[Path],
    input_relative_to: Optional[Path],
    datasets: Tuple[Path],
    datasets_path: Optional[Path],
    provider: Optional[str],
    overwrite_existing: bool,
    verbose: bool,
    workers: int,
    thoroughly_check_existing: bool,
    embed_location: Optional[bool],
    only_regions_in_file: Optional[Path],
    before_month: Optional[Tuple[int, int]],
    after_month: Optional[Tuple[int, int]],
    dry_run: bool,
    always_granule_id: Optional[bool],
    index_to_odc: bool,
):
    if sys.argv[1] == "sentinel-l1c":
        warnings.warn(
            "Command name 'sentinel-l1c-prepare' is deprecated: remove the 'c', and use `sentinel-l1-prepare`"
        )

    included_regions = None
    if only_regions_in_file:
        included_regions = set(only_regions_in_file.read_text().splitlines())

    if datasets_path:
        datasets = [
            *datasets,
            *(normalise_path(p.strip())
              for p in (datasets_path.read_text().splitlines())),
        ]

    _LOG.info("kickoff", path_count=len(datasets), worker_count=workers)

    # Are we indexing on success?
    index = None
    if index_to_odc:
        _LOG.info("Indexing new datasets", local_config=local_config)
        index = index_connect(local_config, application_name="s2-prepare")
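        # Cache of product name -> Product, so each product is looked up in the ODC index only once.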
        products = {}

        def on_success(dataset: DatasetDoc, dataset_path: Path):
            """
            Index the dataset
            """
            product_name = dataset.product.name
            product = products.get(product_name)
            if not product:
                product = index.products.get_by_name(product_name)
                if not product:
                    raise ValueError(
                        f"Product {product_name} not found in ODC index")
                products[product_name] = product

            index.datasets.add(
                Dataset(product,
                        serialise.to_doc(dataset),
                        uris=dataset.locations))
            _LOG.debug("Indexed dataset",
                       dataset_id=dataset.id,
                       dataset_path=dataset_path)

    else:

        def on_success(dataset: DatasetDoc, dataset_path: Path):
            """Nothing extra"""
    def find_inputs_in_path(input_path: Path) -> Iterable[InputDataset]:
        """
        Scan the input path for our key identifying files of a package.
        """
        found_something = False
        if provider == "sinergise.com" or not provider:
            for p in _rglob_with_self(input_path, "tileInfo.json"):
                found_something = True
                yield InputDataset(
                    producer="sinergise.com",
                    # Dataset location is the metadata file itself.
                    path=p,
                    # Output is a sibling metadata file, with the same name as the folder (usually S2A....).
                    base_folder=p.parent.parent,
                    name=p.parent.stem,
                )
        if provider == "esa.int" or not provider:
            for p in _rglob_with_self(input_path, "*.zip"):
                found_something = True
                yield InputDataset(
                    producer="esa.int",
                    # Dataset location is the zip file
                    path=p,
                    # Metadata is a sibling file with a metadata suffix.
                    base_folder=p.parent,
                    name=p.stem,
                )
        if not found_something:
            raise ValueError(
                f"No S2 datasets found in given path {input_path}. "
                f"Expected either Sinergise (productInfo.json) files or ESA zip files to be contained in it."
            )

    def find_jobs() -> Iterable[Job]:

        region_lookup = RegionLookup()

        nonlocal input_relative_to, embed_location
        for input_path in datasets:

            first = True
            for found_dataset in find_inputs_in_path(input_path):
                _LOG.debug("found_dataset", name=found_dataset.name)
                # Make sure we tick progress on extra datasets that were found.
                if not first:
                    first = False

                # Filter based on metadata
                info = found_dataset.metadata

                # Skip regions that are not in the limit?
                if included_regions or before_month or after_month:
                    if info is None:
                        raise ValueError(
                            f"Cannot filter from non-standard folder layout: {found_dataset.path} "
                            f" expected of form L1C/yyyy/yyyy-mm/area/S2_..")

                    if included_regions:
                        # If it's an older dataset without a region, try to map its area to a known region.
                        if info.region_code is None:
                            for region in region_lookup.get(info.area):
                                if region in included_regions:
                                    _LOG.debug(
                                        "mapped_area_match",
                                        input_area=info.area,
                                        region_match=region,
                                    )
                                    break
                            else:
                                _LOG.debug(
                                    "skipping.mapped_area_not_in_regions",
                                    input_area=info.area,
                                )
                                continue
                        elif info.region_code not in included_regions:
                            _LOG.debug(
                                "skipping.region_not_in_region_list",
                                region_code=info.region_code,
                            )
                            continue

                    if after_month is not None:
                        year, month = after_month

                        if info.year < year or (info.year == year
                                                and info.month < month):
                            _LOG.debug(
                                "skipping.too_old",
                                dataset_year_month=(info.year, info.month),
                                min_year_month=(year, month),
                            )
                            continue
                    if before_month is not None:
                        year, month = before_month

                        if info.year > year or (info.year == year
                                                and info.month > month):
                            _LOG.debug(
                                "skipping.too_young",
                                dataset_year_month=(info.year, info.month),
                                max_year_month=(year, month),
                            )
                            continue

                # Put outputs in a different folder?
                if output_base:
                    # What base folder should we choose for creating subfolders in the output?
                    if input_relative_to is None:
                        input_relative_to = _get_default_relative_folder_base(
                            found_dataset.base_folder)

                    output_folder = output_base / found_dataset.base_folder.relative_to(
                        input_relative_to)
                    # Default to true.
                    if embed_location is None:
                        embed_location = True
                else:
                    output_folder = found_dataset.base_folder
                    # Default to false
                    if embed_location is None:
                        embed_location = False

                # It's very slow to read the list of inner granules.
                #
                # So, if we're not thoroughly checking for missing outputs.
                if ((not thoroughly_check_existing)
                        # ... and any outputs exist at all
                        and list(
                            output_folder.glob(
                                f"{found_dataset.name}*.odc-metadata.yaml"))
                        # ... and we're not overwriting our outputs
                        and not overwrite_existing):
                    # Skip it!
                    _LOG.debug(
                        "At least one output exists: skipping.",
                        dataset_name=found_dataset.name,
                    )
                    continue

                # This has to read the files, so can be slow. That's why we try to skip above if possible.
                granule_ids = found_dataset.granule_ids

                # When granule_id is None, it means process all without filtering.
                if not granule_ids:
                    granule_ids = [None]
                else:
                    _LOG.debug("found_granules",
                               granule_count=len(granule_ids))

                for granule_id in granule_ids:
                    if always_granule_id or (
                            # None means 'auto': i.e. automatically include the granule id when there are multiple granules
                            always_granule_id is None
                            and len(granule_ids) > 1):
                        yaml_filename = (
                            f"{found_dataset.name}.{granule_id}.odc-metadata.yaml"
                        )
                    else:
                        yaml_filename = f"{found_dataset.name}.odc-metadata.yaml"

                    output_yaml = output_folder / yaml_filename
                    if output_yaml.exists():
                        if not overwrite_existing:
                            _LOG.debug("Output exists: skipping.",
                                       output_yaml=output_yaml)
                            continue

                        _LOG.debug("Output exists: overwriting.",
                                   output_yaml=output_yaml)

                    _LOG.info(
                        "queued",
                        dataset_name=found_dataset.name,
                        granule=granule_id or "any",
                    )
                    yield Job(
                        dataset_path=found_dataset.path,
                        output_yaml_path=output_yaml,
                        producer=found_dataset.producer,
                        granule_id=granule_id,
                        embed_location=embed_location,
                    )

    errors = 0

    if dry_run:
        _LOG.info("Dry run: not writing any files.")

    # If only one process, call it directly.
    # (Multiprocessing makes debugging harder, so we prefer to make it optional)
    successes = 0

    try:
        if workers == 1 or dry_run:
            for job in find_jobs():
                try:
                    if dry_run:
                        _LOG.info(
                            "Would write dataset",
                            dataset_path=job.dataset_path,
                            output_yaml_path=job.output_yaml_path,
                        )
                    else:
                        dataset, path = prepare_and_write(
                            job.dataset_path,
                            job.output_yaml_path,
                            job.producer,
                            granule_id=job.granule_id,
                            embed_location=job.embed_location,
                        )
                        _LOG.info("Wrote dataset",
                                  dataset_id=dataset.id,
                                  dataset_path=path)
                        on_success(dataset, path)
                    successes += 1
                except Exception:
                    _LOG.exception("failed_job", job=job)
                    errors += 1
        else:
            with Pool(processes=workers) as pool:
                for res in pool.imap_unordered(_write_dataset_safe,
                                               find_jobs()):
                    if isinstance(res, str):
                        _LOG.error(res)
                        errors += 1
                    else:
                        dataset, path = res
                        _LOG.info("Wrote dataset",
                                  dataset_id=dataset.id,
                                  dataset_path=path)
                        on_success(dataset, path)
                        successes += 1
                pool.close()
                pool.join()
    finally:
        if index is not None:
            index.close()

    _LOG.info("completed", success_count=successes, failure_count=errors)
    sys.exit(errors)
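The multiprocessing branch above relies on _write_dataset_safe, which is not shown in this example. Judging only by how its results are consumed (a str is logged as an error, anything else is unpacked as a (dataset, path) pair), a wrapper of roughly this shape would fit; this is an assumption about the helper, not its actual definition:

def _write_dataset_safe(job):
    """
    Sketch of a per-job worker for Pool.imap_unordered() (assumed shape).
    Exceptions are turned into strings so they can be pickled back to the
    parent process and logged there instead of killing the pool.
    """
    import traceback
    try:
        return prepare_and_write(
            job.dataset_path,
            job.output_yaml_path,
            job.producer,
            granule_id=job.granule_id,
            embed_location=job.embed_location,
        )
    except Exception:
        return f"Failed to process {job.dataset_path}:\n{traceback.format_exc()}"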