import logging
import os

import dbclients.tantalus
import datamanagement.transfer_files
# NotFoundError and DataError are assumed to live where the upstream
# dbclients package defines them; adjust if your local layout differs.
from dbclients.basicclient import NotFoundError
from dbclients.tantalus import TantalusApi, DataError


def run_h5_convert(results_type=None):
    tantalus_api = TantalusApi()

    remote_storage_client = tantalus_api.get_storage_client(remote_storage_name)

    if results_type is not None:
        results_list = tantalus_api.list("resultsdataset", results_type=results_type)
        logging.info('converting results with results type {}'.format(results_type))
    else:
        results_list = tantalus_api.list("resultsdataset")
        logging.info('converting all results')

    for result in results_list:
        logging.info('processing results dataset {}'.format(result['id']))

        try:
            file_instances = tantalus_api.get_dataset_file_instances(
                result["id"],
                "resultsdataset",
                remote_storage_name,
            )

            existing_filenames = set([i['file_resource']['filename'] for i in file_instances])

            found_csv_yaml = False
            for existing_filename in existing_filenames:
                if existing_filename.endswith('.csv.gz.yaml'):
                    found_csv_yaml = True
                    break

            if found_csv_yaml:
                logging.info('found filename {}, skipping conversion'.format(existing_filename))
            else:
                print(result["id"])
                logging.info('no yaml found')

        except NotFoundError:
            logging.exception('no files found for conversion')

        except KeyboardInterrupt:
            raise

        except Exception:
            logging.exception('conversion failed')
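
# `remote_storage_name` is referenced by both run_h5_convert variants but is
# defined elsewhere in the original scripts as a module-level constant. A
# placeholder definition (the actual value is an assumption; substitute the
# name of the Tantalus storage holding your results):
remote_storage_name = 'singlecellresults'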
def fix_bams(jira_ticket=None, dry_run=False):
    logging.info(f'dry run: {dry_run}')

    tantalus_api = TantalusApi()

    SC_WGS_BAM_DIR_TEMPLATE = os.path.join(
        'single_cell_indexing',
        'bam',
        '{library_id}',
        '{ref_genome}',
        '{aligner_name}',
        'numlanes_{number_lanes}',
        '{jira_ticket}',
    )

    reference_genome_map = {
        'HG19': 'grch37',
        'MM10': 'mm10',
    }

    analyses_list = []
    from_storage_name = "singlecellresults"
    to_storage_name = "singlecellblob"
    from_storage_client = tantalus_api.get_storage_client(from_storage_name)
    to_storage_client = tantalus_api.get_storage_client(to_storage_name)
    to_storage_id = tantalus_api.get('storage', name=to_storage_name)['id']

    if jira_ticket is not None:
        analyses_list.append(
            tantalus_api.get(
                'analysis',
                jira_ticket=jira_ticket,
                analysis_type__name="align",
                status="complete",
            ))
    else:
        # Get all completed align analyses run with specific versions;
        # the bams associated with these analyses are in the wrong storage account
        for version in ('v0.5.2', 'v0.5.3', 'v0.5.4'):
            analyses = tantalus_api.list(
                'analysis',
                analysis_type__name="align",
                status="complete",
                version=version,
            )
            analyses_list += [a for a in analyses]

    for analysis in analyses_list:
        jira_ticket = analysis["jira_ticket"]

        print(f"moving bams for {jira_ticket}")

        # get all bam datasets associated with the jira ticket
        bam_datasets = tantalus_api.list(
            "sequencedataset",
            dataset_type="BAM",
            analysis__jira_ticket=jira_ticket,
        )

        for dataset in bam_datasets:
            # Get the number of lanes from the dataset for use in the filepath
            lanes = set()
            for sequence_lane in dataset['sequence_lanes']:
                lane = "{}_{}".format(sequence_lane['flowcell_id'], sequence_lane['lane_number'])
                lanes.add(lane)
            number_lanes = len(lanes)

            try:
                file_instances = tantalus_api.get_dataset_file_instances(
                    dataset["id"],
                    "sequencedataset",
                    from_storage_name,
                )
            except dbclients.tantalus.DataNotOnStorageError:
                logging.info(f'dataset {dataset["id"]} not on {from_storage_name}, skipping')
                continue

            for file_instance in file_instances:
                blobname = file_instance["file_resource"]["filename"]

                # get url of source blob
                blob_url = from_storage_client.get_url(blobname)

                bam_filename = blobname.split("/bams/")[1]
                new_blobname = os.path.join(
                    SC_WGS_BAM_DIR_TEMPLATE.format(
                        library_id=dataset["library"]["library_id"],
                        ref_genome=reference_genome_map[dataset["reference_genome"]],
                        aligner_name=dataset["aligner"],
                        number_lanes=number_lanes,
                        jira_ticket=jira_ticket,
                    ),
                    bam_filename,
                )

                # copy blob to the desired storage account with the new blobname
                blob_filepath = f"{to_storage_client.prefix}/{new_blobname}"

                logging.info(f'copying {new_blobname} to storage {to_storage_name} from {blob_url} to {blob_filepath}')

                if not dry_run:
                    to_storage_client.blob_service.copy_blob(
                        container_name="data",
                        blob_name=new_blobname,
                        copy_source=blob_url,
                    )

                file_resource_id = file_instance['file_resource']['id']
                file_instance_id = file_instance['id']

                logging.info(f'updating file resource {file_resource_id} to have filename {new_blobname}')

                if not dry_run:
                    tantalus_api.update('file_resource', id=file_resource_id, filename=new_blobname)

                logging.info(f'updating file instance {file_instance_id} to have storage with id {to_storage_id}')

                if not dry_run:
                    tantalus_api.update('file_instance', id=file_instance_id, storage=to_storage_id)
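
# A worked example of the blobname rewrite in fix_bams, using made-up library,
# genome, aligner, lane-count, and ticket values (all hypothetical):
def _example_bam_blobname():
    # Returns:
    # 'single_cell_indexing/bam/A96213A/grch37/BWA_ALN_0_5_7/numlanes_2/SC-1234/SA921-A96213A-R22-C22.bam'
    template = os.path.join(
        'single_cell_indexing', 'bam', '{library_id}', '{ref_genome}',
        '{aligner_name}', 'numlanes_{number_lanes}', '{jira_ticket}')
    return os.path.join(
        template.format(
            library_id='A96213A',
            ref_genome='grch37',
            aligner_name='BWA_ALN_0_5_7',
            number_lanes=2,
            jira_ticket='SC-1234',
        ),
        'SA921-A96213A-R22-C22.bam',
    )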
def rename_fastqs(dataset_id, storage_name, dry_run=False, check_only=False):
    logging.info(f'dataset: {dataset_id}')
    logging.info(f'dry run: {dry_run}')

    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    dataset = tantalus_api.get('sequencedataset', id=dataset_id)

    file_instances = tantalus_api.get_dataset_file_instances(
        dataset['id'],
        'sequencedataset',
        storage_name,
    )

    for file_instance in file_instances:
        filename = file_instance['file_resource']['filename']

        if os.path.basename(filename) == 'metadata.yaml':
            continue

        assert len(dataset['sequence_lanes']) == 1

        parts = filename.split('/')
        basename = os.path.basename(filename)

        non_conforming = False
        try:
            assert parts[0] == 'single_cell_indexing'
            assert parts[1] == 'fastq'
            assert parts[2] == dataset['library']['library_id']
            assert parts[3].split('_')[0] == dataset['sequence_lanes'][0]['flowcell_id']
            assert parts[3].split('_')[1] == dataset['sequence_lanes'][0]['lane_number']
            assert parts[4] == dataset['sample']['sample_id']
        except AssertionError:
            non_conforming = True

        if check_only:
            if non_conforming:
                raise Exception(f'filename {filename} does not conform')
            continue

        new_filename = SC_WGS_FQ_TEMPLATE.format(
            dlp_library_id=dataset['library']['library_id'],
            flowcell_id=dataset['sequence_lanes'][0]['flowcell_id'],
            lane_number=dataset['sequence_lanes'][0]['lane_number'],
            cell_sample_id=dataset['sample']['sample_id'],
            cell_filename=basename,
        )

        if new_filename == filename:
            logging.info(f'skipping conforming {filename} on {storage_name}')
            continue

        logging.info(f'renaming {filename} to {new_filename} on {storage_name}')

        if not dry_run:
            if not storage_client.exists(new_filename):
                storage_client.copy(filename, new_filename, wait=True)
            tantalus_api.swap_file(file_instance, new_filename)
            storage_client.delete(filename)
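
# `SC_WGS_FQ_TEMPLATE` is referenced by rename_fastqs but defined elsewhere in
# the original module. A definition consistent with the path assertions above
# (the exact ordering of the last components is an assumption inferred from
# those assertions):
SC_WGS_FQ_TEMPLATE = os.path.join(
    'single_cell_indexing',
    'fastq',
    '{dlp_library_id}',
    '{flowcell_id}_{lane_number}',
    '{cell_sample_id}',
    '{cell_filename}',
)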
def run_h5_convert(cache_dir,
                   dataset_id=None,
                   results_type=None,
                   redo=False,
                   dry_run=False,
                   check_done=False):
    tantalus_api = TantalusApi()

    local_cache_client = tantalus_api.get_cache_client(cache_dir)
    remote_storage_client = tantalus_api.get_storage_client(remote_storage_name)

    if dataset_id is not None:
        results_list = [tantalus_api.get("resultsdataset", id=dataset_id)]
        logging.info('converting results with id {}'.format(dataset_id))
    elif results_type is not None:
        results_list = tantalus_api.list("resultsdataset", results_type=results_type)
        logging.info('converting results with results type {}'.format(results_type))
    else:
        results_list = tantalus_api.list("resultsdataset")
        logging.info('converting all results')

    for result in results_list:
        logging.info('processing results dataset {}'.format(result['id']))

        try:
            file_instances = tantalus_api.get_dataset_file_instances(
                result["id"],
                "resultsdataset",
                remote_storage_name,
            )

            existing_filenames = set([i['file_resource']['filename'] for i in file_instances])

            found_csv_yaml = False
            for existing_filename in existing_filenames:
                # Destruct outputs csv.yaml directly, so only check non-destruct files
                if 'destruct' in existing_filename:
                    continue
                if existing_filename.endswith('.csv.gz.yaml'):
                    found_csv_yaml = True
                    break

            if found_csv_yaml and check_done:
                logging.info('found filename {}, skipping conversion'.format(existing_filename))
                continue

            file_resource_ids = []
            filepaths_to_clean = []

            for file_instance in file_instances:
                if not file_instance['file_resource']['filename'].endswith('.h5'):
                    continue

                datamanagement.transfer_files.cache_file(tantalus_api, file_instance, cache_dir)

                h5_filepath = local_cache_client.get_url(file_instance['file_resource']['filename'])

                filepaths_to_clean.append(h5_filepath)

                logging.info('converting {}'.format(h5_filepath))

                for key, csv_filepath in get_h5_csv_info(h5_filepath):
                    if not csv_filepath.startswith(cache_dir):
                        raise Exception('unexpected csv path {}'.format(csv_filepath))

                    csv_filename = csv_filepath[len(cache_dir):]
                    csv_filename = csv_filename.lstrip('/')

                    if csv_filename in existing_filenames and not redo:
                        logging.info('file {} already exists, not converting'.format(csv_filename))
                        continue

                    if dry_run:
                        logging.info('would convert {}, key {} to {}'.format(h5_filepath, key, csv_filepath))
                        continue

                    logging.info('converting {}, key {} to {}'.format(h5_filepath, key, csv_filepath))
                    convert_h5(h5_filepath, key, csv_filepath)

                    yaml_filename = csv_filename + '.yaml'
                    yaml_filepath = csv_filepath + '.yaml'

                    fileinfo_to_add = [
                        (csv_filename, csv_filepath),
                        (yaml_filename, yaml_filepath),
                    ]

                    for filename, filepath in fileinfo_to_add:
                        logging.info('creating file {} from path {}'.format(filename, filepath))

                        remote_storage_client.create(filename, filepath, update=redo)

                        remote_filepath = os.path.join(remote_storage_client.prefix, filename)

                        logging.info('adding file {} from path {}'.format(filename, remote_filepath))

                        (file_resource, file_instance) = tantalus_api.add_file(
                            remote_storage_name, remote_filepath, update=True)  # redo)

                        file_resource_ids.append(file_resource["id"])
                        filepaths_to_clean.append(filepath)

            if len(file_resource_ids) == 0:
                logging.warning('no files added')
                continue

            logging.info('adding file resources {} to dataset {}'.format(file_resource_ids, result["id"]))

            tantalus_api.update(
                "resultsdataset",
                result["id"],
                file_resources=result["file_resources"] + file_resource_ids,
            )

            for filepath in filepaths_to_clean:
                logging.info('removing file {}'.format(filepath))
                os.remove(filepath)

        except NotFoundError:
            logging.exception('no files found for conversion')

        except KeyboardInterrupt:
            raise

        except Exception:
            logging.exception('conversion failed')
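
# `get_h5_csv_info` and `convert_h5` are called by run_h5_convert but defined
# elsewhere in the original script. A minimal sketch of what they might look
# like, assuming the .h5 files are pandas HDF5 stores; the key-to-filename
# mapping and the yaml sidecar format are assumptions for illustration:
import pandas as pd
import yaml


def get_h5_csv_info(h5_filepath):
    # Yield (key, csv_filepath) pairs, one per table in the HDF5 store.
    with pd.HDFStore(h5_filepath, 'r') as store:
        for key in store.keys():
            suffix = key.strip('/').replace('/', '_')
            yield key, h5_filepath[:-len('.h5')] + '_' + suffix + '.csv.gz'


def convert_h5(h5_filepath, key, csv_filepath):
    # Read one table from the store and rewrite it as gzipped CSV, plus a
    # .yaml sidecar describing the columns, since run_h5_convert uploads both.
    df = pd.read_hdf(h5_filepath, key)
    df.to_csv(csv_filepath, compression='gzip', index=False)
    metadata = {
        'header': True,
        'columns': [{'name': c, 'dtype': str(df[c].dtype)} for c in df.columns],
    }
    with open(csv_filepath + '.yaml', 'w') as f:
        yaml.safe_dump(metadata, f)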
def main(
        storage_name,
        dataset_type,
        dataset_id=None,
        tag_name=None,
        check_remote=None,
        dry_run=False,
):
    logging.info('cleaning up storage {}'.format(storage_name))

    if check_remote:
        logging.info('checking remote {}'.format(check_remote))
    else:
        logging.warning('not checking remote')

    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    remote_client = None
    if check_remote is not None:
        remote_client = tantalus_api.get_storage_client(check_remote)

    if dataset_id is None and tag_name is None:
        raise ValueError('require either dataset id or tag name')

    if dataset_id is not None and tag_name is not None:
        raise ValueError('require exactly one of dataset id or tag name')

    if dataset_id is not None:
        logging.info('cleaning up dataset {}, {}'.format(dataset_id, dataset_type))
        datasets = tantalus_api.list(dataset_type, id=dataset_id)

    if tag_name is not None:
        logging.info('cleaning up tag {}'.format(tag_name))
        datasets = tantalus_api.list(dataset_type, tags__name=tag_name)

    total_data_size = 0
    file_num_count = 0

    for dataset in datasets:
        logging.info('checking dataset with id {}, name {}'.format(dataset['id'], dataset['name']))

        # Optionally skip datasets not present and intact on the remote storage
        if check_remote is not None:
            if not tantalus_api.is_dataset_on_storage(dataset['id'], 'sequencedataset', check_remote):
                logging.warning('not deleting dataset with id {}, not on remote storage {}'.format(
                    dataset['id'], check_remote))
                continue

            # For each file instance on the remote, check that it exists and
            # has the correct size in tantalus
            remote_file_size_check = True
            for file_instance in tantalus_api.get_dataset_file_instances(
                    dataset['id'], dataset_type, check_remote):
                try:
                    tantalus_api.check_file(file_instance)
                except DataError:
                    logging.exception('check file failed')
                    remote_file_size_check = False

            # Skip this dataset if any files failed the check
            if not remote_file_size_check:
                logging.warning("skipping dataset {} that failed check on {}".format(
                    dataset['id'], check_remote))
                continue

        # Check consistency of files on the storage being cleaned
        file_size_check = True
        for file_instance in tantalus_api.get_dataset_file_instances(
                dataset['id'], dataset_type, storage_name):
            try:
                tantalus_api.check_file(file_instance)
            except DataError:
                logging.exception('check file failed')
                file_size_check = False

        # Skip this dataset if any files failed the check
        if not file_size_check:
            logging.warning("skipping dataset {} that failed check on {}".format(
                dataset['id'], storage_name))
            continue

        # Delete all files for this dataset
        for file_instance in tantalus_api.get_dataset_file_instances(
                dataset['id'], dataset_type, storage_name):
            if dry_run:
                logging.info("would delete file instance with id {}, filepath {}".format(
                    file_instance['id'], file_instance['filepath']))
            else:
                logging.info("deleting file instance with id {}, filepath {}".format(
                    file_instance['id'], file_instance['filepath']))
                tantalus_api.update("file_instance", id=file_instance['id'], is_deleted=True)
            total_data_size += file_instance['file_resource']['size']
            file_num_count += 1

    logging.info("deleted a total of {} files with size {} bytes".format(file_num_count, total_data_size))
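
# A sketch of a command-line entry point for main (the original script may use
# a different CLI library such as click; the argument names are assumptions):
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Delete dataset files from a storage.')
    parser.add_argument('storage_name')
    parser.add_argument('dataset_type', choices=['sequencedataset', 'resultsdataset'])
    parser.add_argument('--dataset-id', type=int)
    parser.add_argument('--tag-name')
    parser.add_argument('--check-remote')
    parser.add_argument('--dry-run', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    main(
        args.storage_name,
        args.dataset_type,
        dataset_id=args.dataset_id,
        tag_name=args.tag_name,
        check_remote=args.check_remote,
        dry_run=args.dry_run,
    )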