def main(
        storage_name,
        dataset_type=None,
        dataset_id=None,
        tag_name=None,
        all_file_instances=False,
        dry_run=False,
        fix_corrupt=False,
        remove_missing=False,
):
    logging.info('checking integrity of storage {}'.format(storage_name))

    tantalus_api = TantalusApi()

    if all_file_instances:
        file_instances = tantalus_api.list('file_instance', storage__name=storage_name)

    else:
        file_instances = get_dataset_file_instances(
            tantalus_api, storage_name, dataset_type,
            dataset_id=dataset_id, tag_name=tag_name)

    for file_instance in file_instances:
        logging.info('checking file instance {} with path {}'.format(
            file_instance['id'], file_instance['filepath']))

        if file_instance['is_deleted']:
            logging.info('file instance {} marked as deleted'.format(
                file_instance['id']))
            continue

        file_corrupt = False
        file_missing = False
        try:
            tantalus_api.check_file(file_instance)
        except DataCorruptionError:
            file_corrupt = True
            logging.exception('check file failed')
        except DataMissingError:
            file_missing = True
            logging.exception('missing file')

        if file_corrupt and fix_corrupt:
            logging.info('updating file instance {} with path {}'.format(
                file_instance['id'], file_instance['filepath']))

            if not dry_run:
                tantalus_api.update_file(file_instance)

        if file_missing and remove_missing:
            logging.info('deleting file instance {} with path {}'.format(
                file_instance['id'], file_instance['filepath']))

            if not dry_run:
                file_instance = tantalus_api.update(
                    'file_instance',
                    id=file_instance['id'],
                    is_deleted=True,
                )
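# Illustrative usage sketch (not part of the original script; the storage and
# tag names below are assumptions): a dry-run integrity check over every file
# instance on a storage, or over a tagged set of sequence datasets.
#
#   main('singlecellblob', all_file_instances=True, dry_run=True)
#   main('singlecellblob', 'sequencedataset', tag_name='qc_check',
#        dry_run=True, fix_corrupt=True)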
def fix_bams(jira_ticket=None, dry_run=False):
    tantalus_api = TantalusApi()

    analyses_list = []
    storage_name = "singlecellresults"

    if jira_ticket is not None:
        analyses_list.append(tantalus_api.get(
            'analysis',
            jira_ticket=jira_ticket,
            analysis_type__name="align",
            status="complete",
        ))

    else:
        # Get all completed align analyses run with specific versions;
        # the bams associated with these analyses are in the wrong storage account
        for version in ('v0.5.2', 'v0.5.3'):
            analyses = tantalus_api.list(
                'analysis',
                analysis_type__name="align",
                status="complete",
                version=version,
            )
            analyses_list += [a for a in analyses]

    for analysis in analyses_list:
        jira_ticket = analysis["jira_ticket"]

        filename = f'{jira_ticket}/results/bams/metadata.yaml'

        logging.info(f'adding file {filename}')
        if not dry_run:
            file_resource, file_instance = tantalus_api.add_file(storage_name, filename)

        # get all bam datasets associated with the jira ticket
        bam_datasets = tantalus_api.list(
            "sequencedataset",
            dataset_type="BAM",
            analysis__jira_ticket=jira_ticket,
        )

        for dataset in bam_datasets:
            dataset_id = dataset['id']

            logging.info(f'adding file to dataset {dataset_id}')
            if not dry_run:
                file_resource_ids = dataset['file_resources']
                file_resource_ids.append(file_resource['id'])
                tantalus_api.update(
                    'sequencedataset',
                    id=dataset['id'],
                    file_resources=file_resource_ids)
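# Illustrative usage sketch (the ticket id is an assumption, not from the
# original script): preview the metadata.yaml additions for a single align
# analysis before re-running without dry_run.
#
#   fix_bams(jira_ticket='SC-1234', dry_run=True)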
def create_fastq_metadata_yaml(library_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for all FQ datasets for a library id.
    """
    tantalus_api = TantalusApi()

    storage = tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    for dataset_info, metadata in create_lane_fastq_metadata(tantalus_api, library_id):
        metadata_filename = os.path.join(dataset_info['base_dir'], 'metadata.yaml')
        metadata_filepath = tantalus_api.get_filepath(storage_name, metadata_filename)

        metadata_io = io.BytesIO()
        metadata_io.write(yaml.dump(metadata, default_flow_style=False).encode())

        logging.info(f'writing metadata to file {metadata_filepath}')
        client.write_data(metadata_filename, metadata_io)

        logging.info(f'adding {metadata_filepath} to tantalus')

        if not dry_run:
            file_resource, file_instance = tantalus_api.add_file(
                storage_name, metadata_filepath, update=True)

            for dataset_id in dataset_info['dataset_ids']:
                dataset = tantalus_api.get('sequencedataset', id=dataset_id)

                new_file_resources = set(dataset['file_resources'])
                new_file_resources.add(file_resource['id'])

                tantalus_api.update(
                    'sequencedataset',
                    id=dataset_id,
                    file_resources=list(new_file_resources))
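# Illustrative usage sketch (the library id and storage name are assumptions):
# generate and register metadata.yaml files for all FQ datasets of one library.
#
#   create_fastq_metadata_yaml('A12345A', 'singlecellblob', dry_run=True)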
def add_fastq_metadata_yaml(dataset_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for a dataset and add it to tantalus.
    """
    tantalus_api = TantalusApi()

    storage = tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    metadata, base_dir = create_lane_fastq_metadata(tantalus_api, dataset_id)

    metadata_filename = os.path.join(base_dir, 'metadata.yaml')
    metadata_filepath = tantalus_api.get_filepath(storage_name, metadata_filename)

    metadata_io = io.BytesIO()
    metadata_io.write(yaml.dump(metadata, default_flow_style=False).encode())

    print(f'writing metadata to file {metadata_filepath}')
    client.write_data(metadata_filename, metadata_io)

    print(f'adding {metadata_filepath} to tantalus')

    if not dry_run:
        file_resource, file_instance = tantalus_api.add_file(
            storage_name, metadata_filepath, update=True)

        dataset = tantalus_api.get('sequencedataset', id=dataset_id)

        new_file_resources = set(dataset['file_resources'])
        new_file_resources.add(file_resource['id'])

        tantalus_api.update(
            'sequencedataset',
            id=dataset_id,
            file_resources=list(new_file_resources))
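# Illustrative usage sketch (the dataset id and storage name are assumptions):
#
#   add_fastq_metadata_yaml(1234, 'singlecellblob', dry_run=True)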
def fix_bams(jira_ticket=None, dry_run=False):
    logging.info(f'dry run: {dry_run}')

    tantalus_api = TantalusApi()

    SC_WGS_BAM_DIR_TEMPLATE = os.path.join(
        'single_cell_indexing',
        'bam',
        '{library_id}',
        '{ref_genome}',
        '{aligner_name}',
        'numlanes_{number_lanes}',
        '{jira_ticket}',
    )

    reference_genome_map = {
        'HG19': 'grch37',
        'MM10': 'mm10',
    }

    analyses_list = []
    from_storage_name = "singlecellresults"
    to_storage_name = "singlecellblob"

    from_storage_client = tantalus_api.get_storage_client(from_storage_name)
    to_storage_client = tantalus_api.get_storage_client(to_storage_name)
    to_storage_id = tantalus_api.get('storage', name=to_storage_name)['id']

    if jira_ticket is not None:
        analyses_list.append(
            tantalus_api.get(
                'analysis',
                jira_ticket=jira_ticket,
                analysis_type__name="align",
                status="complete",
            ))

    else:
        # Get all completed align analyses run with specific versions;
        # the bams associated with these analyses are in the wrong storage account
        for version in ('v0.5.2', 'v0.5.3', 'v0.5.4'):
            analyses = tantalus_api.list(
                'analysis',
                analysis_type__name="align",
                status="complete",
                version=version,
            )
            analyses_list += [a for a in analyses]

    for analysis in analyses_list:
        jira_ticket = analysis["jira_ticket"]

        print(f"moving bams for {jira_ticket}")

        # get all bam datasets associated with the jira ticket
        bam_datasets = tantalus_api.list(
            "sequencedataset",
            dataset_type="BAM",
            analysis__jira_ticket=jira_ticket,
        )

        for dataset in bam_datasets:
            # Get number of lanes from dataset for use with filepath
            lanes = set()
            for sequence_lane in dataset['sequence_lanes']:
                lane = "{}_{}".format(sequence_lane['flowcell_id'], sequence_lane['lane_number'])
                lanes.add(lane)
            number_lanes = len(lanes)

            try:
                file_instances = tantalus_api.get_dataset_file_instances(
                    dataset["id"],
                    "sequencedataset",
                    from_storage_name,
                )
            except dbclients.tantalus.DataNotOnStorageError:
                logging.info(f'dataset {dataset["id"]} not on {from_storage_name}, skipping')
                continue

            for file_instance in file_instances:
                blobname = file_instance["file_resource"]["filename"]

                # get url of source blob
                blob_url = from_storage_client.get_url(blobname)

                bam_filename = blobname.split("/bams/")[1]
                new_blobname = os.path.join(
                    SC_WGS_BAM_DIR_TEMPLATE.format(
                        library_id=dataset["library"]["library_id"],
                        ref_genome=reference_genome_map[dataset["reference_genome"]],
                        aligner_name=dataset["aligner"],
                        number_lanes=number_lanes,
                        jira_ticket=jira_ticket,
                    ),
                    bam_filename,
                )

                # copy blob to desired storage account with new blobname
                blob_filepath = f"{to_storage_client.prefix}/{new_blobname}"

                logging.info(
                    f'copying {new_blobname} to storage {to_storage_name} '
                    f'from {blob_url} to {blob_filepath}')

                if not dry_run:
                    to_storage_client.blob_service.copy_blob(
                        container_name="data",
                        blob_name=new_blobname,
                        copy_source=blob_url,
                    )

                file_resource_id = file_instance['file_resource']['id']
                file_instance_id = file_instance['id']

                logging.info(
                    f'updating file resource {file_resource_id} to have filename {new_blobname}')
                if not dry_run:
                    tantalus_api.update('file_resource', id=file_resource_id, filename=new_blobname)

                logging.info(
                    f'updating file instance {file_instance_id} to have storage with id {to_storage_id}')
                if not dry_run:
                    tantalus_api.update('file_instance', id=file_instance_id, storage=to_storage_id)
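# Illustrative usage sketch (the ticket id is an assumption): preview the blob
# moves for every hard-coded pipeline version, or fix a single analysis.
#
#   fix_bams(dry_run=True)
#   fix_bams(jira_ticket='SC-1234')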
def run_h5_convert(cache_dir, dataset_id=None, results_type=None, redo=False, dry_run=False, check_done=False):
    tantalus_api = TantalusApi()

    local_cache_client = tantalus_api.get_cache_client(cache_dir)
    remote_storage_client = tantalus_api.get_storage_client(remote_storage_name)

    if dataset_id is not None:
        results_list = [tantalus_api.get("resultsdataset", id=dataset_id)]
        logging.info('converting results with id {}'.format(dataset_id))

    elif results_type is not None:
        results_list = tantalus_api.list("resultsdataset", results_type=results_type)
        logging.info('converting results with results type {}'.format(results_type))

    else:
        results_list = tantalus_api.list("resultsdataset")
        logging.info('converting all results')

    for result in results_list:
        logging.info('processing results dataset {}'.format(result['id']))

        try:
            file_instances = tantalus_api.get_dataset_file_instances(
                result["id"],
                "resultsdataset",
                remote_storage_name,
            )

            existing_filenames = set(
                [i['file_resource']['filename'] for i in file_instances])

            found_csv_yaml = False
            for existing_filename in existing_filenames:
                # Destruct outputs csv.yaml directly, check non destruct files
                if 'destruct' in existing_filename:
                    continue
                if existing_filename.endswith('.csv.gz.yaml'):
                    found_csv_yaml = True
                    break

            if found_csv_yaml and check_done:
                logging.info('found filename {}, skipping conversion'.format(existing_filename))
                continue

            file_resource_ids = []
            filepaths_to_clean = []

            for file_instance in file_instances:
                if not file_instance['file_resource']['filename'].endswith('.h5'):
                    continue

                datamanagement.transfer_files.cache_file(tantalus_api, file_instance, cache_dir)

                h5_filepath = local_cache_client.get_url(
                    file_instance['file_resource']['filename'])

                filepaths_to_clean.append(h5_filepath)

                logging.info('converting {}'.format(h5_filepath))

                for key, csv_filepath in get_h5_csv_info(h5_filepath):
                    if not csv_filepath.startswith(cache_dir):
                        raise Exception('unexpected csv path {}'.format(csv_filepath))

                    csv_filename = csv_filepath[len(cache_dir):]
                    csv_filename = csv_filename.lstrip('/')

                    if csv_filename in existing_filenames and not redo:
                        logging.info('file {} already exists, not converting'.format(csv_filename))
                        continue

                    if dry_run:
                        logging.info('would convert {}, key {} to {}'.format(
                            h5_filepath, key, csv_filepath))
                        continue

                    logging.info('converting {}, key {} to {}'.format(
                        h5_filepath, key, csv_filepath))
                    convert_h5(h5_filepath, key, csv_filepath)

                    yaml_filename = csv_filename + '.yaml'
                    yaml_filepath = csv_filepath + '.yaml'

                    fileinfo_to_add = [
                        (csv_filename, csv_filepath),
                        (yaml_filename, yaml_filepath),
                    ]

                    for filename, filepath in fileinfo_to_add:
                        logging.info('creating file {} from path {}'.format(filename, filepath))

                        remote_storage_client.create(filename, filepath, update=redo)

                        remote_filepath = os.path.join(remote_storage_client.prefix, filename)

                        logging.info('adding file {} from path {}'.format(filename, remote_filepath))

                        (file_resource, file_instance) = tantalus_api.add_file(
                            remote_storage_name, remote_filepath, update=True)  #redo)

                        file_resource_ids.append(file_resource["id"])
                        filepaths_to_clean.append(filepath)

            if len(file_resource_ids) == 0:
                logging.warning('no files added')
                continue

            logging.info('adding file resources {} to dataset {}'.format(
                file_resource_ids, result["id"]))

            tantalus_api.update(
                "resultsdataset",
                result["id"],
                file_resources=result["file_resources"] + file_resource_ids,
            )

            for filepath in filepaths_to_clean:
                logging.info('removing file {}'.format(filepath))
                os.remove(filepath)

        except NotFoundError:
            logging.exception('no files found for conversion')

        except KeyboardInterrupt:
            raise

        except Exception:
            logging.exception('conversion failed')
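# Illustrative usage sketch (the cache path and results type value are
# assumptions; the script is assumed to define remote_storage_name at module
# level): preview which h5 files would be converted to csv for a results type.
#
#   run_h5_convert('/path/to/h5_cache', results_type='alignment',
#                  dry_run=True, check_done=True)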
def add_generic_results(filepaths,
                        storage_name,
                        results_name,
                        results_type,
                        results_version,
                        sample_ids=(),
                        library_ids=(),
                        analysis_pk=None,
                        recursive=False,
                        tag_name=None,
                        update=False,
                        remote_storage_name=None):

    tantalus_api = TantalusApi()

    sample_pks = []
    for sample_id in sample_ids:
        sample = tantalus_api.get(
            "sample",
            sample_id=sample_id,
        )
        sample_pks.append(sample['id'])

    library_pks = []
    for library_id in library_ids:
        library = tantalus_api.get(
            "dna_library",
            library_id=library_id,
        )
        library_pks.append(library['id'])

    # Add the file resources to tantalus
    file_resource_pks = []
    for filepath in filepaths:
        if recursive:
            logging.info("Recursing directory {}".format(filepath))
            add_filepaths = []
            for (dirpath, dirnames, filenames) in os.walk(filepath):
                for filename in filenames:
                    add_filepaths.append(os.path.join(dirpath, filename))

        else:
            add_filepaths = [filepath]

        for add_filepath in add_filepaths:
            logging.info("Adding file resource for {} to Tantalus".format(add_filepath))
            resource, instance = tantalus_api.add_file(
                storage_name=storage_name,
                filepath=add_filepath,
                update=update,
            )
            file_resource_pks.append(resource["id"])

    results_dataset_fields = dict(
        name=results_name,
        results_type=results_type,
        results_version=results_version,
        analysis=analysis_pk,
        samples=sample_pks,
        libraries=library_pks,
        file_resources=file_resource_pks,
    )

    # Add the dataset to tantalus
    try:
        results_id = tantalus_api.get(
            "results", name=results_dataset_fields["name"])["id"]
    except NotFoundError:
        results_id = None

    if update and results_id is not None:
        logging.warning("results dataset {} exists, updating".format(
            results_dataset_fields["name"]))
        results_dataset = tantalus_api.update("results", id=results_id, **results_dataset_fields)

    else:
        logging.info("creating results dataset {}".format(
            results_dataset_fields["name"]))
        results_dataset = tantalus_api.get_or_create("results", **results_dataset_fields)

    if tag_name is not None:
        tantalus_api.tag(tag_name, resultsdataset_set=[results_dataset['id']])

    logging.info("Successfully created results dataset with ID {}".format(
        results_dataset["id"]))

    if remote_storage_name is not None:
        transfer_files.transfer_dataset(
            tantalus_api, results_dataset['id'], "resultsdataset",
            storage_name, remote_storage_name)

    return results_dataset
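# Illustrative usage sketch (all names, ids and paths below are assumptions):
# register a directory of result files as a results dataset and mirror it to a
# remote storage.
#
#   add_generic_results(
#       filepaths=['/path/to/results_dir'],
#       storage_name='singlecellblob',
#       results_name='my_results',
#       results_type='custom_analysis',
#       results_version='v1',
#       sample_ids=['SA123'],
#       library_ids=['A12345A'],
#       recursive=True,
#       remote_storage_name='singlecellresults',
#   )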
            dna_library=incorrect_lane['dna_library'],
            sequencing_centre="GSC",
            sequencing_instrument=incorrect_lane['sequencing_instrument'],
            sequencing_library_id=incorrect_lane['sequencing_library_id'],
            read_type=incorrect_lane['read_type'],
        )
        pass

    else:
        for field in ('sequencing_centre', 'sequencing_instrument', 'sequencing_library_id'):
            if correct_lane[field] != incorrect_lane[field]:
                logging.warning('updating {} from {} to {}'.format(
                    field, correct_lane[field], incorrect_lane[field]))

                correct_lane = tantalus_api.update(
                    'sequencing_lane',
                    id=correct_lane['id'],
                    **{field: incorrect_lane[field]})

    datasets = list(
        tantalus_api.list(
            'sequence_dataset',
            library__library_id=row['library_id'],
        ))

    for dataset in datasets:
        lane_pks = [l['id'] for l in dataset['sequence_lanes']]

        if incorrect_lane['id'] not in lane_pks:
            continue

        num_lanes = len(lane_pks)

        lane_pks.remove(incorrect_lane['id'])
        lane_pks.append(correct_lane['id'])
def fix():
    tantalus_api = TantalusApi()

    datasets = list(
        tantalus_api.list(
            'sequence_dataset',
            dataset_type='BAM',
            library__library_type__name='WGS',
        ))

    for dataset in datasets:
        bams = {}
        bais = {}
        specs = {}

        for file_resource_id in dataset['file_resources']:
            file_resource = tantalus_api.get('file_resource', id=file_resource_id)

            if file_resource['filename'].endswith('.bam'):
                bams[file_resource_id] = file_resource['filename']

            elif file_resource['filename'].endswith('.spec'):
                specs[file_resource_id] = file_resource['filename']

            elif file_resource['filename'].endswith('.bam.bai'):
                bais[file_resource_id] = file_resource['filename']

        if len(bams) == 0 and len(specs) == 0:
            print(dataset['id'])

        elif len(bams) > 1:
            logging.info(f"fixing {dataset['name']}, {bams}")

            to_remove_bam_id = max(bams.keys())

            to_remove_bai_id = None
            for id_, bai in bais.items():
                if bai.startswith(bams[to_remove_bam_id]):
                    assert to_remove_bai_id is None
                    to_remove_bai_id = id_
                    break
            assert to_remove_bai_id is not None

            logging.info((to_remove_bam_id, bams[to_remove_bam_id],
                          to_remove_bai_id, bais[to_remove_bai_id]))

            new_file_resources = dataset['file_resources']
            new_file_resources.remove(to_remove_bam_id)
            new_file_resources.remove(to_remove_bai_id)

            logging.info(f"updating {dataset['id']} to have files {new_file_resources}")

            tantalus_api.update('sequencedataset', id=dataset['id'], file_resources=new_file_resources)

            assert dataset["name"].endswith(str(dataset["version_number"]))

            similar_datasets = list(
                tantalus_api.list(
                    "sequence_dataset",
                    name=dataset["name"],
                ))
            new_version_number = max(d['version_number'] for d in similar_datasets) + 1

            new_dataset_params = dict(
                sample=dataset['sample']['id'],
                library=dataset['library']['id'],
                sequence_lanes=[l['id'] for l in dataset['sequence_lanes']],
                aligner=dataset['aligner'],
                reference_genome=dataset['reference_genome'],
                name=dataset['name'][:-1] + str(new_version_number),
                dataset_type=dataset['dataset_type'],
                version_number=new_version_number,
                file_resources=[to_remove_bam_id, to_remove_bai_id],
            )
            logging.info(new_dataset_params)

            new_dataset = tantalus_api.create('sequencedataset', **new_dataset_params)
            logging.info(new_dataset)
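# Illustrative preview sketch (not in the original script): before letting
# fix() modify anything, one might list the candidate datasets with the same
# filters the function itself uses.
#
#   tantalus_api = TantalusApi()
#   candidates = list(tantalus_api.list(
#       'sequence_dataset', dataset_type='BAM', library__library_type__name='WGS'))
#   print(len(candidates))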
def main(
        storage_name,
        dataset_type,
        dataset_id=None,
        tag_name=None,
        check_remote=None,
        dry_run=False,
):
    logging.info('cleaning up storage {}'.format(storage_name))

    if check_remote:
        logging.info('checking remote {}'.format(check_remote))
    else:
        logging.warning('not checking remote')

    tantalus_api = TantalusApi()
    storage_client = tantalus_api.get_storage_client(storage_name)

    remote_client = None
    if check_remote is not None:
        remote_client = tantalus_api.get_storage_client(check_remote)

    if dataset_id is None and tag_name is None:
        raise ValueError('require either dataset id or tag name')

    if dataset_id is not None and tag_name is not None:
        raise ValueError('require exactly one of dataset id or tag name')

    if dataset_id is not None:
        logging.info('cleaning up dataset {}, {}'.format(dataset_id, dataset_type))
        datasets = tantalus_api.list(dataset_type, id=dataset_id)

    if tag_name is not None:
        logging.info('cleaning up tag {}'.format(tag_name))
        datasets = tantalus_api.list(dataset_type, tags__name=tag_name)

    total_data_size = 0
    file_num_count = 0

    for dataset in datasets:
        logging.info('checking dataset with id {}, name {}'.format(
            dataset['id'], dataset['name']))

        # Optionally skip datasets not present and intact on the remote storage
        if check_remote is not None:
            if not tantalus_api.is_dataset_on_storage(
                    dataset['id'], 'sequencedataset', check_remote):
                logging.warning(
                    'not deleting dataset with id {}, not on remote storage {}'.format(
                        dataset['id'], check_remote))
                continue

            # For each file instance on the remote, check if it exists and has the correct size in tantalus
            remote_file_size_check = True
            for file_instance in tantalus_api.get_dataset_file_instances(
                    dataset['id'], dataset_type, check_remote):
                try:
                    tantalus_api.check_file(file_instance)
                except DataError:
                    logging.exception('check file failed')
                    remote_file_size_check = False

            # Skip this dataset if any files failed
            if not remote_file_size_check:
                logging.warning("skipping dataset {} that failed check on {}".format(
                    dataset['id'], check_remote))
                continue

        # Check consistency with the removal storage
        file_size_check = True
        for file_instance in tantalus_api.get_dataset_file_instances(
                dataset['id'], dataset_type, storage_name):
            try:
                tantalus_api.check_file(file_instance)
            except DataError:
                logging.exception('check file failed')
                file_size_check = False

        # Skip this dataset if any files failed
        if not file_size_check:
            logging.warning("skipping dataset {} that failed check on {}".format(
                dataset['id'], storage_name))
            continue

        # Delete all files for this dataset
        for file_instance in tantalus_api.get_dataset_file_instances(
                dataset['id'], dataset_type, storage_name):
            if dry_run:
                logging.info("would delete file instance with id {}, filepath {}".format(
                    file_instance['id'], file_instance['filepath']))
            else:
                logging.info("deleting file instance with id {}, filepath {}".format(
                    file_instance['id'], file_instance['filepath']))
                tantalus_api.update("file_instance", id=file_instance['id'], is_deleted=True)

            total_data_size += file_instance['file_resource']['size']
            file_num_count += 1

    logging.info("deleted a total of {} files with size {} bytes".format(
        file_num_count, total_data_size))
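# Illustrative usage sketch (storage, dataset type and tag names below are
# assumptions): dry-run deletion of tagged datasets from one storage, only
# where the data is verified intact on a remote storage.
#
#   main('singlecellblob', 'sequencedataset', tag_name='cleanup_candidates',
#        check_remote='singlecellresults', dry_run=True)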