def main(
        storage_name,
        dry_run=False,
        check_remote=None,
):
    """Purge file instances flagged as deleted on a tantalus storage.

    Iterates over every ``file_instance`` on ``storage_name`` with
    ``is_deleted=True``, removes the underlying file from the storage,
    and deletes the tantalus ``file_instance`` record.  When the
    instance being removed is the only one for its ``file_resource``,
    the file resource record is deleted as well.

    Args:
        storage_name: name of the tantalus storage to clean up.
        dry_run: when True, log every action but change nothing.
        check_remote: optional name of another storage; deletion is
            skipped unless a live (non-deleted, actually present) copy
            of the file exists on that storage.
    """
    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    remote_storage_client = None
    if check_remote is not None:
        remote_storage_client = tantalus_api.get_storage_client(check_remote)

    file_instances = tantalus_api.list(
        'file_instance', storage__name=storage_name, is_deleted=True)

    # DEBUG: check whether we are getting back
    # consistent ordered results from tantalus
    file_instances = list(file_instances)
    file_instance_ids = {f['id'] for f in file_instances}
    if len(file_instances) != len(file_instance_ids):
        raise Exception('received duplicate results from tantalus')

    logging.info('processing {} file instances'.format(len(file_instance_ids)))
    logging.info('processing the following file instances: {}'.format(
        str(file_instance_ids)))

    for file_instance in file_instances:
        file_resource = tantalus_api.get(
            'file_resource', id=file_instance['file_resource']['id'])

        # Fetched up front so we can tell below whether this is the
        # last remaining instance of the file resource.
        all_file_instances = list(
            tantalus_api.list(
                'file_instance', file_resource=file_resource['id']))

        logging.info(
            'checking file instance {}, file resource {}, filepath {}'.format(
                file_instance['id'], file_resource['id'],
                file_instance['filepath']))

        sequencedatasets = tantalus_api.list(
            'sequencedataset', file_resources__id=file_resource['id'])
        resultsdatasets = tantalus_api.list(
            'resultsdataset', file_resources__id=file_resource['id'])

        sequencedataset_ids = list({a['id'] for a in sequencedatasets})
        resultsdataset_ids = list({a['id'] for a in resultsdatasets})

        logging.info(
            'file resource {} belongs to sequencedataset {} and resultsdatasets {}'
            .format(file_resource['id'], sequencedataset_ids,
                    resultsdataset_ids))

        # Optionally require a live copy on the remote storage before
        # deleting the local instance.
        if remote_storage_client:
            remote_instance = None
            for other_instance in file_resource['file_instances']:
                if other_instance['storage']['name'] == check_remote:
                    remote_instance = other_instance
                    break

            if not remote_instance:
                logging.info(
                    'not deleting file instance {}, no other instance'.format(
                        file_instance['id']))
                continue

            # BUGFIX: the messages below previously reported
            # other_instance['id'], which is whatever instance the
            # search loop last visited, not the matched remote copy.
            if remote_instance['is_deleted']:
                logging.info(
                    'not deleting file instance {}, other instance {} deleted'.
                    format(file_instance['id'], remote_instance['id']))
                continue

            if not remote_storage_client.exists(file_resource['filename']):
                logging.info(
                    'not deleting file instance {}, other instance {} doesnt exist'
                    .format(file_instance['id'], remote_instance['id']))
                continue

            logging.info(
                'deletion ok for file instance {}, found other instance {}'.
                format(file_instance['id'], remote_instance['id']))

        # Delete the file from the filesystem
        logging.info('deleting file {}'.format(file_instance['filepath']))
        if not dry_run:
            try:
                storage_client.delete(file_resource['filename'])
            except FileNotFoundError:
                # Best effort: the file may already be gone; log and
                # continue so the tantalus records still get cleaned up.
                logging.exception('file already deleted')

        # Delete the instance model from tantalus
        logging.info('deleting file instance {}'.format(file_instance['id']))
        if not dry_run:
            tantalus_api.delete('file_instance', id=file_instance['id'])

        # If this is the only file instance for this file resource,
        # delete the file resource as well.
        if len(all_file_instances) == 1:
            assert all_file_instances[0]['id'] == file_instance['id']
            logging.info('deleting file resource {}'.format(
                file_resource['id']))
            if not dry_run:
                tantalus_api.delete('file_resource', id=file_resource['id'])
'sequencing_library_id'): if correct_lane[field] != incorrect_lane[field]: logging.warning('updating {} from {} to {}'.format( field, correct_lane[field], incorrect_lane[field])) correct_lane = tantalus_api.update( 'sequencing_lane', id=correct_lane['id'], **dict(field=incorrect_lane[field])) datasets = list( tantalus_api.list( 'sequence_dataset', library__library_id=row['library_id'], )) for dataset in datasets: lane_pks = [l['id'] for l in dataset['sequence_lanes']] if incorrect_lane['id'] not in lane_pks: continue num_lanes = len(lane_pks) lane_pks.remove(incorrect_lane['id']) lane_pks.append(correct_lane['id']) assert num_lanes == len(lane_pks) tantalus_api.update( 'sequence_dataset', id=dataset['id'], sequence_lanes=lane_pks, ) tantalus_api.delete('sequencing_lane', id=incorrect_lane['id'])