def main( storage_name, dataset_type=None, dataset_id=None, tag_name=None, all_file_instances=False, dry_run=False, fix_corrupt=False, remove_missing=False, ): logging.info('checking integrity of storage {}'.format(storage_name)) tantalus_api = TantalusApi() if all_file_instances: file_instances = tantalus_api.list('file_instance', storage__name=storage_name) else: file_instances = get_dataset_file_instances( tantalus_api, storage_name, dataset_type, dataset_id=dataset_id, tag_name=tag_name) for file_instance in file_instances: logging.info('checking file instance {} with path {}'.format( file_instance['id'], file_instance['filepath'])) if file_instance['is_deleted']: logging.info('file instance {} marked as deleted'.format( file_instance['id'])) continue file_corrupt = False file_missing = False try: tantalus_api.check_file(file_instance) except DataCorruptionError: file_corrupt = True logging.exception('check file failed') except DataMissingError: file_missing = True logging.exception('missing file') if file_corrupt and fix_corrupt: logging.info('updating file instance {} with path {}'.format( file_instance['id'], file_instance['filepath'])) if not dry_run: tantalus_api.update_file(file_instance) if file_missing and remove_missing: logging.info('deleting file instance {} with path {}'.format( file_instance['id'], file_instance['filepath'])) if not dry_run: file_instance = tantalus_api.update( 'file_instance', id=file_instance['id'], is_deleted=True, )
def main( storage_name, dataset_type, dataset_id=None, tag_name=None, check_remote=None, dry_run=False, ): logging.info('cleanup up storage {}'.format(storage_name)) if check_remote: logging.info('checking remote {}'.format(check_remote)) else: logging.warning('not checking remote') tantalus_api = TantalusApi() storage_client = tantalus_api.get_storage_client(storage_name) remote_client = None if check_remote is not None: remote_client = tantalus_api.get_storage_client(check_remote) if dataset_id is None and tag_name is None: raise ValueError('require either dataset id or tag name') if dataset_id is not None and tag_name is not None: raise ValueError('require exactly one of dataset id or tag name') if dataset_id is not None: logging.info('cleanup up dataset {}, {}'.format( dataset_id, dataset_type)) datasets = tantalus_api.list(dataset_type, id=dataset_id) if tag_name is not None: logging.info('cleanup up tag {}'.format(tag_name)) datasets = tantalus_api.list(dataset_type, tags__name=tag_name) total_data_size = 0 file_num_count = 0 for dataset in datasets: logging.info('checking dataset with id {}, name {}'.format( dataset['id'], dataset['name'])) # Optionally skip datasets not present and intact on the remote storage if check_remote is not None: if not tantalus_api.is_dataset_on_storage( dataset['id'], 'sequencedataset', check_remote): logging.warning( 'not deleting dataset with id {}, not on remote storage '. format(dataset['id'], check_remote)) continue # For each file instance on the remote, check if it exists and has the correct size in tantalus remote_file_size_check = True for file_instance in tantalus_api.get_dataset_file_instances( dataset['id'], dataset_type, check_remote): try: tantalus_api.check_file(file_instance) except DataError: logging.exception('check file failed') remote_file_size_check = False # Skip this dataset if any files failed if not remote_file_size_check: logging.warning( "skipping dataset {} that failed check on {}".format( dataset['id'], check_remote)) continue # Check consistency with the removal storage file_size_check = True for file_instance in tantalus_api.get_dataset_file_instances( dataset['id'], dataset_type, storage_name): try: tantalus_api.check_file(file_instance) except DataError: logging.exception('check file failed') file_size_check = False # Skip this dataset if any files failed if not file_size_check: logging.warning( "skipping dataset {} that failed check on {}".format( dataset['id'], storage_name)) continue # Delete all files for this dataset for file_instance in tantalus_api.get_dataset_file_instances( dataset['id'], dataset_type, storage_name): if dry_run: logging.info( "would delete file instance with id {}, filepath {}". format(file_instance['id'], file_instance['filepath'])) else: logging.info( "deleting file instance with id {}, filepath {}".format( file_instance['id'], file_instance['filepath'])) tantalus_api.update("file_instance", id=file_instance['id'], is_deleted=True) total_data_size += file_instance['file_resource']['size'] file_num_count += 1 logging.info("deleted a total of {} files with size {} bytes".format( file_num_count, total_data_size))