Ejemplo n.º 1
0
def process_cellenone_dataset(dataset,
                              storage_name,
                              tag_name=None,
                              update=False,
                              remote_storage_name=None):

    assert len(dataset['libraries']) == 1
    library_id = dataset['libraries'][0]['library_id']

    tantalus_api = TantalusApi()

    if not tantalus_api.is_dataset_on_storage(dataset['id'], 'resultsdataset',
                                              storage_name):
        raise ValueError(
            f"dataset {dataset['id']} not on storage {storage_name}")

    # Assume all files in the raw dataset are under the directory:
    #  single_cell_indexing/Cellenone/Cellenone_images/{date}_{library_id}

    filename_prefix = 'single_cell_indexing/Cellenone/Cellenone_images/'

    source_dir = None
    for file_resource in tantalus_api.get_dataset_file_resources(
            dataset['id'], 'resultsdataset'):
        if source_dir is None:
            if not file_resource['filename'].startswith(filename_prefix):
                raise ValueError(
                    f"file {file_resource['filename']} is not in directory {filename_prefix}"
                )

            library_subdir = file_resource['filename'].split('/')[3]

            if not library_subdir.endswith(library_id):
                raise ValueError(
                    f"file {file_resource['filename']} is not in a directory ending with {library_id}"
                )

            source_dir = '/'.join(file_resource['filename'].split('/')[:4])

        elif not file_resource['filename'].startswith(source_dir):
            raise ValueError(
                f"file {file_resource['filename']} is not in directory {source_dir}"
            )

    assert source_dir is not None

    source_dir = tantalus_api.get_filepath(storage_name, source_dir)

    process_cellenone_images(
        library_id,
        source_dir,
        storage_name,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )
Ejemplo n.º 2
0
def main(
    storage_name,
    dataset_type,
    dataset_id=None,
    tag_name=None,
    check_remote=None,
    dry_run=False,
):
    logging.info('cleanup up storage {}'.format(storage_name))

    if check_remote:
        logging.info('checking remote {}'.format(check_remote))
    else:
        logging.warning('not checking remote')

    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    remote_client = None
    if check_remote is not None:
        remote_client = tantalus_api.get_storage_client(check_remote)

    if dataset_id is None and tag_name is None:
        raise ValueError('require either dataset id or tag name')

    if dataset_id is not None and tag_name is not None:
        raise ValueError('require exactly one of dataset id or tag name')

    if dataset_id is not None:
        logging.info('cleanup up dataset {}, {}'.format(
            dataset_id, dataset_type))
        datasets = tantalus_api.list(dataset_type, id=dataset_id)

    if tag_name is not None:
        logging.info('cleanup up tag {}'.format(tag_name))
        datasets = tantalus_api.list(dataset_type, tags__name=tag_name)

    total_data_size = 0
    file_num_count = 0

    for dataset in datasets:
        logging.info('checking dataset with id {}, name {}'.format(
            dataset['id'], dataset['name']))

        # Optionally skip datasets not present and intact on the remote storage
        if check_remote is not None:
            if not tantalus_api.is_dataset_on_storage(
                    dataset['id'], 'sequencedataset', check_remote):
                logging.warning(
                    'not deleting dataset with id {}, not on remote storage '.
                    format(dataset['id'], check_remote))
                continue

            # For each file instance on the remote, check if it exists and has the correct size in tantalus
            remote_file_size_check = True
            for file_instance in tantalus_api.get_dataset_file_instances(
                    dataset['id'], dataset_type, check_remote):
                try:
                    tantalus_api.check_file(file_instance)
                except DataError:
                    logging.exception('check file failed')
                    remote_file_size_check = False

            # Skip this dataset if any files failed
            if not remote_file_size_check:
                logging.warning(
                    "skipping dataset {} that failed check on {}".format(
                        dataset['id'], check_remote))
                continue

        # Check consistency with the removal storage
        file_size_check = True
        for file_instance in tantalus_api.get_dataset_file_instances(
                dataset['id'], dataset_type, storage_name):
            try:
                tantalus_api.check_file(file_instance)
            except DataError:
                logging.exception('check file failed')
                file_size_check = False

        # Skip this dataset if any files failed
        if not file_size_check:
            logging.warning(
                "skipping dataset {} that failed check on {}".format(
                    dataset['id'], storage_name))
            continue

        # Delete all files for this dataset
        for file_instance in tantalus_api.get_dataset_file_instances(
                dataset['id'], dataset_type, storage_name):
            if dry_run:
                logging.info(
                    "would delete file instance with id {}, filepath {}".
                    format(file_instance['id'], file_instance['filepath']))
            else:
                logging.info(
                    "deleting file instance with id {}, filepath {}".format(
                        file_instance['id'], file_instance['filepath']))
                tantalus_api.update("file_instance",
                                    id=file_instance['id'],
                                    is_deleted=True)
            total_data_size += file_instance['file_resource']['size']
            file_num_count += 1

    logging.info("deleted a total of {} files with size {} bytes".format(
        file_num_count, total_data_size))