Example #1
0
def garbage_collect_dataset__delayed(
    dataset_uuid: Optional[str] = None,
    store: StoreInput = None,
    chunk_size: int = 100,
    factory=None,
) -> List[Delayed]:
    """
    Build delayed tasks that remove auxiliary files no longer tracked by the
    dataset.

    Untracked files are indices that the metadata no longer references, plus
    — for static datasets only — orphaned files inside the table directories.

    Parameters
    ----------
    chunk_size
        Number of files that should be deleted in a single job.

    """
    # Resolve uuid/store/factory into a single dataset factory handle.
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid, store=store, factory=factory,
    )

    # The factory already carries uuid and store, so the explicit arguments
    # are passed as None here.
    file_chunks = dispatch_files_to_gc(
        dataset_uuid=None,
        store_factory=None,
        chunk_size=chunk_size,
        factory=dataset_factory,
    )

    delete_tasks = map_delayed(
        delete_files, file_chunks, store_factory=dataset_factory.store_factory
    )
    return list(delete_tasks)
Example #2
0
def garbage_collect_dataset(dataset_uuid=None, store=None, factory=None):
    """
    Eagerly remove auxiliary files that are no longer tracked by the dataset.

    Untracked files are indices that the metadata no longer references, plus
    — for static datasets only — orphaned files inside the table directories.

    Parameters
    ----------
    """
    # Resolve uuid/store/factory into a single dataset factory handle;
    # metadata itself is not needed for garbage collection.
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    # chunk_size=None means all files are dispatched as one single chunk,
    # so the generator below yields exactly one element.
    file_chunks = dispatch_files_to_gc(
        dataset_uuid=None,
        store_factory=None,
        chunk_size=None,
        factory=dataset_factory,
    )

    # Consume that single chunk and delete it eagerly.
    return delete_files(
        next(file_chunks), store_factory=dataset_factory.store_factory
    )
Example #3
0
def garbage_collect_dataset__delayed(dataset_uuid=None,
                                     store=None,
                                     chunk_size=100,
                                     factory=None):
    """
    Build delayed tasks that remove auxiliary files no longer tracked by the
    dataset.

    Untracked files are indices that the metadata no longer references, plus
    — for static datasets only — orphaned files inside the table directories.

    Parameters
    ----------
    dataset_uuid: basestring
        The UUID of the dataset to be deleted
    store: callable
        A function returning a KeyValueStore.
    chunk_size: int
        Number of files that should be deleted in a single job.

    Returns
    -------
    tasks: list of dask.delayed
    """
    # Chunks of garbage files; each chunk becomes one delete task below.
    file_chunks = dispatch_files_to_gc(
        dataset_uuid=dataset_uuid,
        store_factory=store,
        chunk_size=chunk_size,
        factory=factory,
    )
    # One dask.delayed deletion job per chunk of files.
    tasks = []
    for file_chunk in file_chunks:
        tasks.append(delayed(delete_files)(file_chunk, store_factory=store))
    return tasks
Example #4
0
def garbage_collect_dataset__delayed(dataset_uuid=None,
                                     store=None,
                                     chunk_size=100,
                                     factory=None):
    """
    Build delayed tasks that remove auxiliary files no longer tracked by the
    dataset.

    Untracked files are indices that the metadata no longer references, plus
    — for static datasets only — orphaned files inside the table directories.

    Parameters
    ----------
    chunk_size: int
        Number of files that should be deleted in a single job.

    Returns
    -------
    tasks: list of dask.delayed
    """
    # Resolve uuid/store/factory into a single dataset factory handle;
    # metadata itself is not needed for garbage collection.
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    # The factory already carries uuid and store, so the explicit arguments
    # are passed as None here.
    file_chunks = dispatch_files_to_gc(
        dataset_uuid=None,
        store_factory=None,
        chunk_size=chunk_size,
        factory=dataset_factory,
    )

    delete_tasks = map_delayed(
        delete_files, file_chunks, store_factory=dataset_factory.store_factory
    )
    return list(delete_tasks)