Example #1
def _paths_to_tasks(input_paths: List[Path]) -> List[Task]:
    # Remove duplicates
    normalised_input_paths = set(p.absolute() for p in input_paths)

    def dataset_folder_path(dataset_path: Path) -> Path:
        # Get the dataset's parent folder: typically the "x_y" folder for tiles,
        # or the month folder for scenes.

        # Get the base path for the dataset,
        # eg. "LS8_SOME_SCENE_1/ga-metadata.yaml" to "LS8_SOME_SCENE_1"
        #  or "LS7_SOME_TILE.nc" to itself
        base_path, _ = get_dataset_paths(dataset_path)

        return base_path.parent

    parent_folder_counts = uniq_counts(dataset_folder_path(dataset_path)
                                       for input_path in normalised_input_paths
                                       for collection in collections.get_collections_in_path(input_path)
                                       for dataset_path in collection.iter_fs_paths_within(input_path))

    # Sanity check: Each of these parent folders should still be within an input path
    for path, count in parent_folder_counts:
        if not any(str(path).startswith(str(input_path))
                   for input_path in normalised_input_paths):
            raise NotImplementedError(
                "Input looks like a specific dataset rather than a folder of datasets: {}".format(path))

    return [Task([p], c) for p, c in parent_folder_counts]
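
Note: uniq_counts is defined elsewhere in the codebase. A minimal sketch of what it presumably does, assuming it returns (item, count) pairs for each distinct item in an iterable (the real helper may differ):

from collections import Counter
from typing import Iterable, List, Tuple, TypeVar

T = TypeVar('T')

def uniq_counts(items: Iterable[T]) -> List[Tuple[T, int]]:
    # Hypothetical sketch: tally each distinct item and return (item, count)
    # pairs, matching how the result is unpacked above ("for path, count in ...").
    return list(Counter(items).items())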
Example #2
def get_collection(tile_path: Path) -> collections.Collection:
    """
    Get the collection that covers the given path
    """
    cs = list(collections.get_collections_in_path(tile_path))
    if not cs:
        raise click.UsageError("No collections found for path {}".format(tile_path))
    if len(cs) > 1:
        raise click.UsageError("Multiple collections found for path (is it too broad?): {}".format(tile_path))
    return cs[0]
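
A hypothetical usage, with a made-up tile path (Collection is assumed to expose a .name attribute, as used in Example #3):

from pathlib import Path

collection = get_collection(Path('/g/data/v10/tiles/LS8_SOME_TILE.nc'))  # illustrative path only
print(collection.name)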
Example #3
def resolve_collections(
        collection_specifiers: Iterable[str]
) -> List[Tuple[cs.Collection, str]]:
    """
    >>> cs.init_nci_collections(None)
    >>> [(c.name, p) for c, p in resolve_collections(['ls8_level1_scene'])]
    [('ls8_level1_scene', 'file:///')]
    >>> [(c.name, p) for c, p in resolve_collections(['/g/data/v10/repackaged/rawdata/0/2015'])]
    [('telemetry', 'file:///g/data/v10/repackaged/rawdata/0/2015')]
    >>> [(c.name, p) for c, p in resolve_collections(['/g/data/v10/reprocess/ls7/level1'])]
    [('ls7_level1_scene', 'file:///g/data/v10/reprocess/ls7/level1')]
    >>> level1_folder_match = resolve_collections(['/g/data/v10/reprocess'])
    >>> sorted(c.name for c, p in level1_folder_match)
    ['ls5_level1_scene', 'ls7_level1_scene', 'ls8_level1_scene']
    >>> set(p for c, p in level1_folder_match)
    {'file:///g/data/v10/reprocess'}
    >>> resolve_collections(['/some/fake/path'])
    Traceback (most recent call last):
    ...
    ValueError: Matches no collections: '/some/fake/path'
    >>> # Just the prefix, not the complete folder name
    >>> [(c.name, p) for c, p in resolve_collections(['/g/data/v10/repackaged/rawdata/0/20'])]
    Traceback (most recent call last):
    ...
    ValueError: Matches no collections: '/g/data/v10/repackaged/rawdata/0/20'
    """
    out = []
    for spec in collection_specifiers:
        # Either a collection name or a path on the filesystem

        possible_path = Path(spec).absolute()

        collection = cs.get_collection(spec)
        collections_in_path = list(cs.get_collections_in_path(possible_path))

        # If the spec matches both, it's ambiguous: raise an error
        if collections_in_path and collection is not None:
            raise ValueError("Ambiguous input: %r is both a "
                             "collection name and a path on the filesystem" %
                             (spec, ))

        if collection:
            out.append((collection, 'file:///'))
        elif collections_in_path:
            for match in collections_in_path:
                out.append((match, possible_path.as_uri()))
        else:
            raise ValueError("Matches no collections: %r" % spec)

    return out
Example #4
def cli(index, dry_run, paths, destination, checksum):
    """
    Move the given folder of datasets into the given destination folder.

    This will checksum the data, copy it to the destination, and mark the original as archived in the DEA index.


    Notes:

    * An operator can later run dea-clean to trash the archived original locations.

    * Source datasets with failing checksums will be left as-is, with a warning logged.

    * Both the source(s) and destination paths are expected to be paths containing existing DEA collections.
    (See collections.py and paths.py)
    """
    init_logging()
    init_nci_collections(index)

    if not is_base_directory(destination):
        raise click.BadArgumentUsage(
            'Not a known DEA base directory: {}\nExpected one of:\n\t{}'.
            format(destination, '\n\t'.join(BASE_DIRECTORIES)))

    # We want to iterate all datasets in the given input folders, so we find the collections
    # that exist in each folder and then iterate through each collection's datasets within it. Simple :)

    # We do this eagerly to surface errors in arguments immediately
    # (at the cost of holding all of `paths` in memory).
    resulting_paths = []
    for input_path in map(Path, paths):
        collections = list(get_collections_in_path(input_path))
        if not collections:
            raise click.BadArgumentUsage(
                f"Directory doesn't match any known collections: {input_path}")

        for collection in collections:
            resulting_paths.extend(
                list(collection.iter_fs_paths_within(input_path)))

    _LOG.info("dataset.count",
              input_count=len(paths),
              dataset_count=len(resulting_paths))

    # TODO: @ui.executor_cli_options
    move_all(index,
             resulting_paths,
             Path(destination),
             dry_run=dry_run,
             checksum=checksum)
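
Note: is_base_directory and BASE_DIRECTORIES are imported from elsewhere. A minimal sketch of the check, assuming a destination is valid when it lies under one of the known base paths (the example entries below are made up):

from pathlib import Path

BASE_DIRECTORIES = ('/g/data/v10', '/g/data/rs0')  # illustrative only; the real list lives elsewhere

def is_base_directory(path) -> bool:
    # Assumed behaviour: accept the destination if it sits under a known base path.
    return any(str(Path(path).absolute()).startswith(base)
               for base in BASE_DIRECTORIES)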