Code Example #1
File: fixes.py  Project: CEKrause/digitalearthau
def _trash_missing_dataset(mismatch: DatasetNotIndexed, index: Index):
    # If any (other) indexed datasets exist at the same location we can't trash it.
    datasets_at_location = list(get_datasets_for_uri(index, mismatch.uri))
    if datasets_at_location:
        _LOG.warning("do_trash_missing.indexed_siblings_exist",
                     uri=mismatch.uri)
        return

    trash_uri(mismatch.uri)
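
Note: get_datasets_for_uri and trash_uri are helpers that this excerpt doesn't show. Judging from how the cleanup.py examples below call index.datasets.get_datasets_for_location and paths.trash_uri, a minimal, non-authoritative sketch of the first helper might be a thin wrapper over the Index API (the import path for Index is an assumption):

# Non-authoritative sketch: the real digitalearthau helper may differ.
from datacube.index import Index

def get_datasets_for_uri(index: Index, uri: str):
    # Yield every indexed dataset that records this URI as one of its locations.
    return index.datasets.get_datasets_for_location(uri)
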
Code Example #2
File: cleanup.py  Project: CEKrause/digitalearthau
def trash_individual_files(index, dry_run, min_trash_age_hours, files):
    """
    Trash the given datasets if they're archived.

    This expects exact dataset paths: *.nc files, or ga-metadata.yaml for scenes.

    But only if they were archived more than --min-trash-age-hours ago (default: 3 days).
    """
    glog = structlog.getLogger('cleanup-paths')
    glog.info('input_paths', input_paths=files)

    latest_time_to_archive = _as_utc(
        datetime.utcnow()) - timedelta(hours=min_trash_age_hours)

    count = 0
    trash_count = 0
    for file in files:
        count += 1
        log = glog.bind(path=(Path(file).resolve()))

        uri = Path(file).resolve().as_uri()
        datasets = list(index.datasets.get_datasets_for_location(uri))

        if datasets:
            # Can't trash file if any linked datasets are still active. They should be archived first.
            active_dataset_ids = [
                dataset.id for dataset in datasets if dataset.is_active
            ]
            if active_dataset_ids:
                log.warning('dataset.is_active',
                            dataset_ids=active_dataset_ids)
                continue

            assert all(d.is_archived for d in datasets)

            # Otherwise they're indexed and archived. Were they archived long enough ago?

            # It's rare that you'd have two archived datasets with the same location, but we're handling it anyway...
            archived_times = [dataset.archived_time for dataset in datasets]
            archived_times.sort()
            oldest_archived_time = archived_times[0]
            if _as_utc(oldest_archived_time) > latest_time_to_archive:
                log.info('dataset.too_recent',
                         archived_time=oldest_archived_time)
                continue

            if not dry_run:
                for d in datasets:
                    log.info('dataset.remove_location', dataset_id=d.id)
                    index.datasets.remove_location(d.id, uri)
        else:
            log.info('path.not_indexed')
        paths.trash_uri(uri, dry_run=dry_run, log=log)
        trash_count += 1

    glog.info("cleanup.finish", count=count, trash_count=trash_count)
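
_as_utc appears in several of these excerpts without its definition. Example #5 below builds its cutoff with datetime.utcnow().replace(tzinfo=tz.tzutc()), so a plausible sketch, assuming the helper simply normalises timestamps to UTC so comparisons between naive and aware datetimes don't fail, is:

# Sketch of the assumed _as_utc helper; the real implementation may differ.
from datetime import datetime
from dateutil import tz

def _as_utc(dt: datetime) -> datetime:
    # Treat naive datetimes as UTC; convert aware ones to UTC.
    if dt.tzinfo is None:
        return dt.replace(tzinfo=tz.tzutc())
    return dt.astimezone(tz.tzutc())
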
Code Example #3
File: cleanup.py  Project: CEKrause/digitalearthau
def _cleanup_datasets(index: Index, product: DatasetType,
                      datasets: Iterable[Dataset], expected_count: int,
                      dry_run: bool, latest_time_to_archive: datetime,
                      only_redundant: bool, log):
    count = 0
    trash_count = 0

    log.info("cleanup.product.start",
             expected_count=expected_count,
             product=product.name)

    with click.progressbar(
            datasets,
            length=expected_count,
            # stderr should be used for runtime information, not stdout
            file=sys.stderr) as dataset_iter:
        for dataset in dataset_iter:
            count += 1

            log = log.bind(dataset_id=str(dataset.id))

            archived_uri_times = index.datasets.get_archived_location_times(
                dataset.id)
            if not archived_uri_times:
                log.debug('dataset.nothing_archived')
                continue

            if only_redundant:
                if dataset.uris is not None and len(dataset.uris) == 0:
                    # This active dataset has no active locations to replace the ones we're archiving.
                    # Probably a mistake? Don't trash the archived ones yet.
                    log.warning("dataset.noactive",
                                archived_paths=archived_uri_times)
                    continue

            for uri, archived_time in archived_uri_times:
                if _as_utc(archived_time) > latest_time_to_archive:
                    log.info('dataset.too_recent')
                    continue

                paths.trash_uri(uri, dry_run=dry_run, log=log)
                if not dry_run:
                    index.datasets.remove_location(dataset.id, uri)
                trash_count += 1

            log = log.unbind('dataset_id')

    log.info("cleanup.product.finish", count=count, trash_count=trash_count)
    return count, trash_count
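
_cleanup_datasets leaves the dataset query and the cutoff computation to its caller; examples #2 and #4 show the same timedelta-based cutoff. A hedged usage sketch, assuming an open datacube index and the helpers above are in scope, with the product name and 72-hour threshold as placeholders:

# Illustrative call only; product name and threshold are placeholders.
from datetime import datetime, timedelta
import structlog
from datacube import Datacube

index = Datacube().index
product = index.products.get_by_name('example_product')  # placeholder product name
log = structlog.getLogger('cleanup')

latest_time_to_archive = _as_utc(datetime.utcnow()) - timedelta(hours=72)
count, trash_count = _cleanup_datasets(
    index,
    product,
    datasets=index.datasets.search(product=product.name),
    expected_count=index.datasets.count(product=product.name),
    dry_run=True,
    latest_time_to_archive=latest_time_to_archive,
    only_redundant=False,
    log=log,
)
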
Code Example #4
File: cleanup.py  Project: sixy6e/digitalearthau
def _cleanup_uri(dry_run: bool, index: Index, input_uri: str,
                 min_trash_age_hours: int, log):
    trash_count = 0

    latest_time_to_archive = _as_utc(
        datetime.utcnow()) - timedelta(hours=min_trash_age_hours)

    echo(
        f"Cleaning {'(dry run) ' if dry_run else ''}{style(input_uri, bold=True)}",
        err=True)

    locations = _get_archived_locations_within(index, latest_time_to_archive,
                                               input_uri)
    echo(
        f"  {len(locations)} locations archived more than {min_trash_age_hours}hr ago",
        err=True)
    with click.progressbar(
            locations,
            # stderr should be used for runtime information, not stdout
            file=sys.stderr) as location_iter:
        for uri in location_iter:
            log = log.bind(uri=uri)
            local_path = uri_to_local_path(uri)
            if not local_path.exists():
                # An index record exists, but the file isn't on the disk.
                # We won't remove the record from the index: maybe the filesystem is temporarily unmounted?
                log.warning('location.not_exist')
                continue

            # Multiple datasets can point to the same location (e.g. a stacked file).
            indexed_datasets = set(
                index.datasets.get_datasets_for_location(uri))

            # Check that there's no other active locations for this dataset.
            active_dataset = _get_dataset_where_active(uri, indexed_datasets)
            if active_dataset:
                log.info("location.has_active",
                         active_dataset_id=active_dataset.id)
                continue

            # Are there any dataset ids in the file that we haven't indexed? Skip it.
            unindexed_ids = get_unknown_dataset_ids(index, uri)
            if unindexed_ids:
                log.info('location.has_unknown',
                         unknown_dataset_ids=unindexed_ids)
                continue

            was_trashed = paths.trash_uri(uri, dry_run=dry_run, log=log)
            if not dry_run:
                for dataset in indexed_datasets:
                    index.datasets.remove_location(dataset.id, uri)

            if was_trashed:
                trash_count += 1

            log = log.unbind('uri')
    return len(locations), trash_count
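
For completeness, this is roughly how _cleanup_uri might be driven. The URI, the 72-hour threshold, and the default Datacube() connection are placeholders, not taken from the project:

# Illustrative call only; the URI and threshold are placeholders.
import structlog
from datacube import Datacube

index = Datacube().index
log = structlog.getLogger('cleanup-uri')

total_locations, trash_count = _cleanup_uri(
    dry_run=True,
    index=index,
    input_uri='file:///g/data/example/archived/path',  # placeholder path
    min_trash_age_hours=72,
    log=log,
)
print(f'{trash_count} of {total_locations} archived locations would be trashed')
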
Code Example #5
File: fixes.py  Project: CEKrause/digitalearthau
def _trash_archived_dataset(mismatch: ArchivedDatasetOnDisk, index: Index,
                            min_age_hours: int):
    latest_archived_time = datetime.utcnow().replace(
        tzinfo=tz.tzutc()) - timedelta(hours=min_age_hours)

    # All datasets at this location must already be archived before we trash it.
    for dataset in index.datasets.get_datasets_for_location(mismatch.uri):
        # Must be archived
        if dataset.archived_time is None:
            _LOG.warning("do_trash_archived.active_siblings",
                         dataset_id=mismatch.dataset.id)
            return
        # Archived more than min_age_hours ago
        if _as_utc(dataset.archived_time) > latest_archived_time:
            _LOG.info("do_trash_archived.too_young",
                      dataset_id=mismatch.dataset.id)
            return

    trash_uri(mismatch.uri)
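
trash_uri and _LOG are used at module level in the fixes.py excerpts (#1 and #5) but their setup isn't shown. A minimal sketch of what it presumably looks like, with the import path for trash_uri guessed from the paths.trash_uri calls in cleanup.py:

# Assumed module-level setup for fixes.py; the import path is a guess.
import structlog
from digitalearthau.paths import trash_uri  # assumption: same helper cleanup.py calls as paths.trash_uri

_LOG = structlog.getLogger(__name__)
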