def glob_cellenone_data(filepaths, storage_name, tag_name=None, update=False, remote_storage_name=None):
    tantalus_api = TantalusApi()

    for filepath in filepaths:
        match = re.match(r".*/single_cell_indexing/Cellenone/Cellenone_images/(\d+)_(A\d+[A-Z]*)", filepath)
        if match is None:
            logging.warning('skipping malformed {}'.format(filepath))
            continue

        fields = match.groups()
        date = fields[0]
        library_id = fields[1]

        try:
            tantalus_api.get('dna_library', library_id=library_id)
        except NotFoundError:
            logging.warning('skipping file with unknown library {}'.format(filepath))
            continue

        try:
            process_cellenone_images(
                library_id,
                filepath,
                storage_name,
                tag_name=tag_name,
                update=update,
                remote_storage_name=remote_storage_name,
            )
        except ValueError:
            logging.exception(f'unable to process {library_id}, {filepath}')
def process_cellenone_images(
        library_id,
        source_dir,
        storage_name,
        tag_name=None,
        update=False,
        remote_storage_name=None,
):
    tantalus_api = TantalusApi()

    results_name = 'CELLENONE_IMAGES_{}'.format(library_id)
    results_type = 'CELLENONE_IMAGES'
    results_version = 'v1'

    try:
        existing_results = tantalus_api.get('results', name=results_name)
    except NotFoundError:
        existing_results = None

    if existing_results is not None and not update:
        logging.info(f'results for {library_id} exist, not processing')
        return

    storage = tantalus_api.get('storage', name=storage_name)
    storage_directory = storage['storage_directory']

    destination_dir = os.path.join(
        storage_directory,
        'single_cell_indexing',
        'Cellenone',
        'Cellenone_processed',
        library_id,
        results_version,
    )

    try:
        os.makedirs(destination_dir)
    except:
        pass

    with tempfile.TemporaryDirectory() as temp_dir:
        filepaths = catalog_images(library_id, source_dir, destination_dir, temp_dir)

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=[library_id],
        recursive=False,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )
def cache_tagged_datasets(tag_name, from_storage_name, cache_directory, suffix_filter=None):
    """ Cache a set of tagged datasets
    """
    tantalus_api = TantalusApi()

    tag = tantalus_api.get("tag", name=tag_name)

    for dataset_id in tag['sequencedataset_set']:
        cache_dataset(tantalus_api, dataset_id, "sequencedataset", from_storage_name, cache_directory, suffix_filter=suffix_filter)

    for dataset_id in tag['resultsdataset_set']:
        cache_dataset(tantalus_api, dataset_id, "resultsdataset", from_storage_name, cache_directory, suffix_filter=suffix_filter)
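# Hedged usage sketch (illustrative only, not part of the original script): caching
# every dataset under a tag into a local directory. The tag name, storage name,
# cache path, and suffix below are hypothetical placeholders.
def _example_cache_tagged_datasets():
    cache_tagged_datasets(
        tag_name='pseudobulk_analysis',       # hypothetical tag
        from_storage_name='singlecellblob',   # hypothetical source storage
        cache_directory='/tmp/tantalus_cache',
        suffix_filter='.bam',                 # only cache files ending in .bam
    )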
def add_microscope_results(filepaths, chip_id, library_ids, storage_name, tag_name=None, update=False, remote_storage_name=None):
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    results_name = 'MICROSCOPE_{}'.format(chip_id)
    results_type = 'MICROSCOPE'
    results_version = None

    try:
        existing_results = tantalus_api.get('results', name=results_name)
    except NotFoundError:
        existing_results = None

    if existing_results is not None and not update:
        logging.info(f'results for {chip_id} exist, not processing')
        return

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=library_ids,
        recursive=True,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )
def add_generic_dataset(**kwargs):
    tantalus_api = TantalusApi()

    file_resource_pks = []

    sample = tantalus_api.get("sample", sample_id=kwargs['sample_id'])

    library = tantalus_api.get("dna_library", library_id=kwargs['library_id'])

    # Add the file resources to tantalus
    for filepath in kwargs['filepaths']:
        logging.info("Adding file resource for {} to Tantalus".format(filepath))
        resource, instance = tantalus_api.add_file(
            storage_name=kwargs['storage_name'],
            filepath=filepath,
            update=kwargs['update'])
        file_resource_pks.append(resource["id"])

    if "tag_name" in kwargs:
        tag = tantalus_api.get("tag", name=kwargs["tag_name"])
        tags = [tag["id"]]
    else:
        tags = []

    ref_genome = kwargs.get("reference_genome")
    aligner = kwargs.get("aligner")

    # Default to no sequence lanes; build a concrete list of string primary keys
    # so the variable is always defined and serializable
    sequence_pks = []
    if "sequence_lane_pks" in kwargs:
        sequence_pks = [str(pk) for pk in kwargs["sequence_lane_pks"]]

    # Add the dataset to tantalus
    sequence_dataset = tantalus_api.get_or_create(
        "sequence_dataset",
        name=kwargs['dataset_name'],
        dataset_type=kwargs['dataset_type'],
        sample=sample["id"],
        library=library["id"],
        sequence_lanes=sequence_pks,
        file_resources=file_resource_pks,
        reference_genome=ref_genome,
        aligner=aligner,
        tags=tags,
    )

    logging.info("Successfully created sequence dataset with ID {}".format(sequence_dataset["id"]))
def glob_cellenone_data(filepaths, storage_name, tag_name=None, update=False, skip_existing=False, remote_storage_name=None):
    tantalus_api = TantalusApi()

    library_paths = collections.defaultdict(set)

    for filepath in filepaths:
        match = re.match(
            r".*/single_cell_indexing/Cellenone/Cellenone_images/(\d+)_(A\d+[A-Z]*)/?$",
            filepath)
        if match is None:
            logging.warning('skipping malformed {}'.format(filepath))
            continue

        fields = match.groups()
        date = fields[0]
        library_id = fields[1]

        try:
            tantalus_api.get('dna_library', library_id=library_id)
        except NotFoundError:
            logging.warning('skipping file with unknown library {}'.format(filepath))
            continue

        logging.info(f'queueing library {library_id} data from {filepath}')
        library_paths[library_id].add(filepath)

    for library_id in library_paths:
        add_cellenone_results(
            library_paths[library_id],
            library_id,
            storage_name,
            tag_name=tag_name,
            update=update,
            skip_existing=skip_existing,
            remote_storage_name=remote_storage_name,
        )
def transfer_tagged_datasets(tag_name, from_storage_name, to_storage_name):
    """ Transfer a set of tagged datasets
    """
    tantalus_api = TantalusApi()

    tag = tantalus_api.get("tag", name=tag_name)

    for dataset_id in tag['sequencedataset_set']:
        transfer_dataset(tantalus_api, dataset_id, "sequencedataset", from_storage_name, to_storage_name)

    for dataset_id in tag['resultsdataset_set']:
        transfer_dataset(tantalus_api, dataset_id, "resultsdataset", from_storage_name, to_storage_name)
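# Hedged usage sketch (illustrative only, not part of the original script): pushing
# every dataset under a tag from one storage to another. The tag and storage names
# are hypothetical placeholders.
def _example_transfer_tagged_datasets():
    transfer_tagged_datasets(
        tag_name='pseudobulk_analysis',       # hypothetical tag
        from_storage_name='singlecellresults',
        to_storage_name='singlecellblob',
    )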
def catalog_cellenone_dataset(library_id, storage_name, tag_name=None, update=False, remote_storage_name=None):
    tantalus_api = TantalusApi()

    dataset = tantalus_api.get('resultsdataset', results_type='CELLENONE', libraries__library_id=library_id)

    process_cellenone_dataset(dataset, storage_name, tag_name=tag_name, update=update, remote_storage_name=remote_storage_name)
def main(storage_name, bam_file_path, **kwargs):
    """
    Imports the bam into tantalus by creating a sequence dataset and
    file resources
    """
    logging.basicConfig(format=LOGGING_FORMAT, stream=sys.stderr, level=logging.INFO)

    tantalus_api = TantalusApi()

    sample = None
    if kwargs.get('sample_id') is not None:
        sample = tantalus_api.get_or_create(
            'sample',
            sample_id=kwargs['sample_id'],
        )

    library = None
    if kwargs.get('library_id') is not None:
        if kwargs.get('library_type') is not None and kwargs.get('index_format') is not None:
            library = tantalus_api.get_or_create(
                'dna_library',
                library_id=kwargs['library_id'],
                library_type=kwargs['library_type'],
                index_format=kwargs['index_format'],
            )
        else:
            library = tantalus_api.get(
                'dna_library',
                library_id=kwargs['library_id'],
            )

    dataset = import_bam(
        storage_name,
        bam_file_path,
        sample=sample,
        library=library,
        read_type=kwargs.get('read_type'),
        ref_genome=kwargs.get('ref_genome'),
        update=kwargs.get('update'),
        tag_name=kwargs.get('tag_name'),
    )

    print("dataset {}".format(dataset["id"]))
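# Hedged usage sketch (illustrative only, not part of the original script): invoking
# this import entry point directly with keyword arguments rather than from the
# command line. The storage name, path, and identifiers are hypothetical placeholders.
def _example_import_bam_main():
    main(
        'singlecellblob',      # hypothetical storage name
        '/path/to/SA123.bam',  # hypothetical bam path on that storage
        sample_id='SA123',     # hypothetical sample, created if missing
        library_id='A90123',   # hypothetical library, assumed to already exist
        ref_genome=None,       # infer the reference genome from the bam header
        update=False,
    )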
def fix_bams(jira_ticket=None, dry_run=False):
    tantalus_api = TantalusApi()

    analyses_list = []
    storage_name = "singlecellresults"

    if jira_ticket is not None:
        analyses_list.append(
            tantalus_api.get('analysis', jira_ticket=jira_ticket, analysis_type__name="align", status="complete"))
    else:
        # Get all completed align analyses run with specific versions; the bams
        # associated with these analyses are in the wrong storage account
        for version in ('v0.5.2', 'v0.5.3'):
            analyses = tantalus_api.list('analysis', analysis_type__name="align", status="complete", version=version)
            analyses_list += [a for a in analyses]

    for analysis in analyses_list:
        jira_ticket = analysis["jira_ticket"]

        filename = f'{jira_ticket}/results/bams/metadata.yaml'

        logging.info(f'adding file {filename}')
        if not dry_run:
            # add_file returns (file_resource, file_instance), matching the
            # unpacking order at the other call sites in these scripts
            file_resource, file_instance = tantalus_api.add_file(storage_name, filename)

        # get all bam datasets associated with the jira ticket
        bam_datasets = tantalus_api.list(
            "sequencedataset",
            dataset_type="BAM",
            analysis__jira_ticket=jira_ticket,
        )

        for dataset in bam_datasets:
            dataset_id = dataset['id']

            logging.info(f'adding file to dataset {dataset_id}')

            if not dry_run:
                # list.append returns None, so build the new list explicitly
                file_resource_ids = dataset['file_resources'] + [file_resource['id']]
                tantalus_api.update('sequencedataset', id=dataset['id'], file_resources=file_resource_ids)
def add_cellenone_results(filepaths, library_id, storage_name, tag_name=None, update=False, skip_existing=False, remote_storage_name=None):
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    results_name = 'CELLENONE_{}'.format(library_id)
    results_type = 'CELLENONE'
    results_version = None

    try:
        existing_results = tantalus_api.get('resultsdataset', name=results_name, results_type=results_type)
    except NotFoundError:
        existing_results = None

    if skip_existing and existing_results is not None:
        return existing_results

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=[library_id],
        recursive=True,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )

    return results_dataset
def create_fastq_metadata_yaml(library_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for all fastq datasets of a library id.
    """
    tantalus_api = TantalusApi()

    storage = tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    for dataset_info, metadata in create_lane_fastq_metadata(tantalus_api, library_id):
        metadata_filename = os.path.join(dataset_info['base_dir'], 'metadata.yaml')
        metadata_filepath = tantalus_api.get_filepath(storage_name, metadata_filename)

        metadata_io = io.BytesIO()
        metadata_io.write(yaml.dump(metadata, default_flow_style=False).encode())

        logging.info(f'writing metadata to file {metadata_filepath}')
        client.write_data(metadata_filename, metadata_io)

        logging.info(f'adding {metadata_filepath} to tantalus')

        if not dry_run:
            file_resource, file_instance = tantalus_api.add_file(storage_name, metadata_filepath, update=True)

            for dataset_id in dataset_info['dataset_ids']:
                dataset = tantalus_api.get('sequencedataset', id=dataset_id)

                new_file_resources = set(dataset['file_resources'])
                new_file_resources.add(file_resource['id'])

                tantalus_api.update('sequencedataset', id=dataset_id, file_resources=list(new_file_resources))
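# Hedged usage sketch (illustrative only, not part of the original script): writing
# metadata.yaml files for every fastq dataset of one library, first as a dry run
# and then for real. The library id and storage name are hypothetical placeholders.
def _example_create_fastq_metadata_yaml():
    create_fastq_metadata_yaml('A90123', 'singlecellblob', dry_run=True)
    create_fastq_metadata_yaml('A90123', 'singlecellblob', dry_run=False)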
def catalog_cellenone_datasets(storage_name, tag_name=None, update=False, remote_storage_name=None):
    tantalus_api = TantalusApi()

    for dataset in tantalus_api.list('resultsdataset', results_type='CELLENONE'):
        # HACK: Check for metadata yaml file in dataset
        found_metadata = False
        try:
            file_resource = tantalus_api.get(
                'file_resource',
                resultsdataset__id=dataset['id'],
                filename__endswith='metadata.yaml')
            found_metadata = True
        except NotFoundError:
            logging.info(f"no metadata for dataset {dataset['id']}")

        if found_metadata:
            logging.info(f"found metadata for dataset {dataset['id']}, skipping")
            continue

        try:
            process_cellenone_dataset(
                dataset,
                storage_name,
                tag_name=tag_name,
                update=update,
                remote_storage_name=remote_storage_name)
        except KeyboardInterrupt:
            raise
        except:
            logging.exception(f"catalog failed for dataset {dataset['id']}")
def add_fastq_metadata_yaml(dataset_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for a dataset and add to tantalus.
    """
    tantalus_api = TantalusApi()

    storage = tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    metadata, base_dir = create_lane_fastq_metadata(tantalus_api, dataset_id)

    metadata_filename = os.path.join(base_dir, 'metadata.yaml')
    metadata_filepath = tantalus_api.get_filepath(storage_name, metadata_filename)

    metadata_io = io.BytesIO()
    metadata_io.write(yaml.dump(metadata, default_flow_style=False).encode())

    print(f'writing metadata to file {metadata_filepath}')
    client.write_data(metadata_filename, metadata_io)

    print(f'adding {metadata_filepath} to tantalus')

    if not dry_run:
        file_resource, file_instance = tantalus_api.add_file(storage_name, metadata_filepath, update=True)

        dataset = tantalus_api.get('sequencedataset', id=dataset_id)

        new_file_resources = set(dataset['file_resources'])
        new_file_resources.add(file_resource['id'])

        tantalus_api.update('sequencedataset', id=dataset_id, file_resources=list(new_file_resources))
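# Hedged usage sketch (illustrative only, not part of the original script): the
# per-dataset variant, adding a metadata.yaml to a single sequence dataset identified
# by its Tantalus id. The dataset id and storage name are hypothetical placeholders.
def _example_add_fastq_metadata_yaml():
    add_fastq_metadata_yaml(12345, 'singlecellblob', dry_run=False)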
def main(
        storage_name,
        dataset_type=None,
        dataset_id=None,
        tag_name=None,
        all_file_instances=False,
        dry_run=False,
        fix_corrupt=False,
        remove_missing=False,
):
    logging.info('checking integrity of storage {}'.format(storage_name))

    tantalus_api = TantalusApi()

    if all_file_instances:
        file_resources = tantalus_api.list('file_resource', fileinstance__storage__name=storage_name)
    else:
        file_resources = get_dataset_file_instances(
            tantalus_api, dataset_type, dataset_id=dataset_id, tag_name=tag_name)

    for file_resource in file_resources:
        try:
            file_instance = tantalus_api.get(
                'file_instance',
                file_resource=file_resource['id'],
                storage__name=storage_name)
        except NotFoundError:
            logging.exception(f'file {file_resource["filename"]} not on storage')
            continue

        logging.info('checking file instance {} with path {}'.format(
            file_instance['id'], file_instance['filepath']))

        if file_instance['is_deleted']:
            logging.info('file instance {} marked as deleted'.format(file_instance['id']))
            continue

        file_corrupt = False
        file_missing = False
        try:
            tantalus_api.check_file(file_instance)
        except DataCorruptionError:
            file_corrupt = True
            logging.exception('check file failed')
        except DataMissingError:
            file_missing = True
            logging.exception('missing file')

        if file_corrupt and fix_corrupt:
            logging.info('updating file instance {} with path {}'.format(
                file_instance['id'], file_instance['filepath']))

            if not dry_run:
                tantalus_api.update_file(file_instance)

        if file_missing and remove_missing:
            logging.info('deleting file instance {} with path {}'.format(
                file_instance['id'], file_instance['filepath']))

            if not dry_run:
                file_instance = tantalus_api.update(
                    'file_instance',
                    id=file_instance['id'],
                    is_deleted=True,
                )
def process_cellenone_images(
        library_id,
        source_dir,
        storage_name,
        tag_name=None,
        update=False,
        remote_storage_name=None,
):
    """ Catalog cellenone images for a library and add to tantalus.

    Args:
        library_id (str): library id associated with the images
        source_dir (str): source cellenone directory
        storage_name (str): local storage in which to organize the results

    KwArgs:
        tag_name (str): tantalus tag
        update (bool): update and possibly overwrite an existing dataset in tantalus
        remote_storage_name (str): upload to a remote storage
    """
    tantalus_api = TantalusApi()

    results_name = 'CELLENONE_IMAGES_{}'.format(library_id)
    results_type = 'CELLENONE_IMAGES'
    results_version = 'v1'

    try:
        existing_results = tantalus_api.get('results', name=results_name)
    except NotFoundError:
        existing_results = None

    if existing_results is not None and not update:
        logging.info(f'results for {library_id} exist, not processing')
        return

    storage = tantalus_api.get('storage', name=storage_name)
    storage_directory = storage['storage_directory']

    destination_dir = os.path.join(
        storage_directory,
        'single_cell_indexing',
        'Cellenone',
        'Cellenone_processed',
        library_id,
        results_version,
    )

    try:
        os.makedirs(destination_dir)
    except:
        pass

    with tempfile.TemporaryDirectory() as temp_dir:
        filepaths = catalog_images(library_id, source_dir, destination_dir, temp_dir)

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=[library_id],
        recursive=False,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )
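# Hedged usage sketch (illustrative only, not part of the original script): cataloging
# the cellenone images for one library from a raw images directory into a local
# storage, optionally mirroring to a remote storage. The library id, source path,
# and storage names are hypothetical placeholders.
def _example_process_cellenone_images():
    process_cellenone_images(
        'A90123A',
        '/shares/single_cell_indexing/Cellenone/Cellenone_images/20200101_A90123A',
        'singlecelllocal',                     # hypothetical local storage
        update=False,
        remote_storage_name='singlecellblob',  # hypothetical remote storage
    )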
def fix():
    tantalus_api = TantalusApi()

    datasets = list(
        tantalus_api.list(
            'sequence_dataset',
            dataset_type='BAM',
            library__library_type__name='WGS',
        ))

    for dataset in datasets:
        bams = {}
        bais = {}
        specs = {}
        for file_resource_id in dataset['file_resources']:
            file_resource = tantalus_api.get('file_resource', id=file_resource_id)
            if file_resource['filename'].endswith('.bam'):
                bams[file_resource_id] = file_resource['filename']
            elif file_resource['filename'].endswith('.spec'):
                specs[file_resource_id] = file_resource['filename']
            elif file_resource['filename'].endswith('.bam.bai'):
                bais[file_resource_id] = file_resource['filename']

        if len(bams) == 0 and len(specs) == 0:
            print(dataset['id'])

        elif len(bams) > 1:
            logging.info(f"fixing {dataset['name']}, {bams}")

            to_remove_bam_id = max(bams.keys())

            to_remove_bai_id = None
            for id_, bai in bais.items():
                if bai.startswith(bams[to_remove_bam_id]):
                    assert to_remove_bai_id is None
                    to_remove_bai_id = id_
                    break
            assert to_remove_bai_id is not None

            logging.info((to_remove_bam_id, bams[to_remove_bam_id], to_remove_bai_id, bais[to_remove_bai_id]))

            new_file_resources = dataset['file_resources']
            new_file_resources.remove(to_remove_bam_id)
            new_file_resources.remove(to_remove_bai_id)

            logging.info(f"updating {dataset['id']} to have files {new_file_resources}")

            tantalus_api.update('sequencedataset', id=dataset['id'], file_resources=new_file_resources)

            assert dataset["name"].endswith(str(dataset["version_number"]))

            similar_datasets = list(
                tantalus_api.list(
                    "sequence_dataset",
                    name=dataset["name"],
                ))
            new_version_number = max(d['version_number'] for d in similar_datasets) + 1

            new_dataset_params = dict(
                sample=dataset['sample']['id'],
                library=dataset['library']['id'],
                sequence_lanes=[l['id'] for l in dataset['sequence_lanes']],
                aligner=dataset['aligner'],
                reference_genome=dataset['reference_genome'],
                name=dataset['name'][:-1] + str(new_version_number),
                dataset_type=dataset['dataset_type'],
                version_number=new_version_number,
                file_resources=[to_remove_bam_id, to_remove_bai_id],
            )

            logging.info(new_dataset_params)

            new_dataset = tantalus_api.create('sequencedataset', **new_dataset_params)

            logging.info(new_dataset)
def run_h5_convert(cache_dir, dataset_id=None, results_type=None, redo=False, dry_run=False, check_done=False):
    tantalus_api = TantalusApi()

    local_cache_client = tantalus_api.get_cache_client(cache_dir)
    # NOTE: remote_storage_name is not a parameter of this function; it is assumed
    # to be defined at module scope in the original script
    remote_storage_client = tantalus_api.get_storage_client(remote_storage_name)

    if dataset_id is not None:
        results_list = [tantalus_api.get("resultsdataset", id=dataset_id)]
        logging.info('converting results with id {}'.format(dataset_id))

    elif results_type is not None:
        results_list = tantalus_api.list("resultsdataset", results_type=results_type)
        logging.info('converting results with results type {}'.format(results_type))

    else:
        results_list = tantalus_api.list("resultsdataset")
        logging.info('converting all results')

    for result in results_list:
        logging.info('processing results dataset {}'.format(result['id']))

        try:
            file_instances = tantalus_api.get_dataset_file_instances(
                result["id"],
                "resultsdataset",
                remote_storage_name,
            )

            existing_filenames = set([i['file_resource']['filename'] for i in file_instances])

            found_csv_yaml = False
            for existing_filename in existing_filenames:
                # Destruct outputs csv.yaml directly, check non destruct files
                if 'destruct' in existing_filename:
                    continue
                if existing_filename.endswith('.csv.gz.yaml'):
                    found_csv_yaml = True
                    break

            if found_csv_yaml and check_done:
                logging.info('found filename {}, skipping conversion'.format(existing_filename))
                continue

            file_resource_ids = []

            filepaths_to_clean = []

            for file_instance in file_instances:
                if not file_instance['file_resource']['filename'].endswith('.h5'):
                    continue

                datamanagement.transfer_files.cache_file(tantalus_api, file_instance, cache_dir)

                h5_filepath = local_cache_client.get_url(file_instance['file_resource']['filename'])

                filepaths_to_clean.append(h5_filepath)

                logging.info('converting {}'.format(h5_filepath))

                for key, csv_filepath in get_h5_csv_info(h5_filepath):
                    if not csv_filepath.startswith(cache_dir):
                        raise Exception('unexpected csv path {}'.format(csv_filepath))

                    csv_filename = csv_filepath[len(cache_dir):]
                    csv_filename = csv_filename.lstrip('/')

                    if csv_filename in existing_filenames and not redo:
                        logging.info('file {} already exists, not converting'.format(csv_filename))
                        continue

                    if dry_run:
                        logging.info('would convert {}, key {} to {}'.format(h5_filepath, key, csv_filepath))
                        continue

                    logging.info('converting {}, key {} to {}'.format(h5_filepath, key, csv_filepath))
                    convert_h5(h5_filepath, key, csv_filepath)

                    yaml_filename = csv_filename + '.yaml'
                    yaml_filepath = csv_filepath + '.yaml'

                    fileinfo_to_add = [
                        (csv_filename, csv_filepath),
                        (yaml_filename, yaml_filepath),
                    ]

                    for filename, filepath in fileinfo_to_add:
                        logging.info('creating file {} from path {}'.format(filename, filepath))

                        remote_storage_client.create(filename, filepath, update=redo)
                        remote_filepath = os.path.join(remote_storage_client.prefix, filename)

                        logging.info('adding file {} from path {}'.format(filename, remote_filepath))

                        (file_resource, file_instance) = tantalus_api.add_file(
                            remote_storage_name, remote_filepath, update=True)  #redo)

                        file_resource_ids.append(file_resource["id"])
                        filepaths_to_clean.append(filepath)

            if len(file_resource_ids) == 0:
                logging.warning('no files added')
                continue

            logging.info('adding file resources {} to dataset {}'.format(file_resource_ids, result["id"]))

            tantalus_api.update(
                "resultsdataset",
                result["id"],
                file_resources=result["file_resources"] + file_resource_ids,
            )

            for filepath in filepaths_to_clean:
                logging.info('removing file {}'.format(filepath))
                os.remove(filepath)

        except NotFoundError:
            logging.exception('no files found for conversion')

        except KeyboardInterrupt:
            raise

        except Exception:
            logging.exception('conversion failed')
def main(storage_name,
         dlp_library_id=None,
         internal_id=None,
         tag_name=None,
         all=False,
         update=False,
         check_library=False,
         dry_run=False):

    # Set up the root logger
    logging.basicConfig(format=LOGGING_FORMAT, stream=sys.stderr, level=logging.INFO)

    # Connect to the Tantalus API (this requires appropriate environment)
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    # initialize lists to store successful and failed libraries
    successful_libs = []
    failed_libs = []

    storage = tantalus_api.get("storage", name=storage_name)
    sequencing_list = list()

    if dry_run:
        logging.info("This is a dry run. No lanes will be imported.")

    # Importing a single library
    if dlp_library_id is not None:
        sequencing_list = list(
            colossus_api.list('sequencing', sequencing_center='BCCAGSC', library__pool_id=dlp_library_id))

    # importing all libraries from the GSC
    elif all:
        sequencing_list = list(colossus_api.list('sequencing', sequencing_center='BCCAGSC'))

    # importing only sequencings expecting more lanes
    else:
        sequencing_list = list(colossus_api.list('sequencing', sequencing_center='BCCAGSC'))
        sequencing_list = list(
            filter(lambda s: s['number_of_lanes_requested'] != len(s['dlplane_set']), sequencing_list))

    for sequencing in sequencing_list:
        # import library
        try:
            import_info = import_gsc_dlp_paired_fastqs(
                colossus_api,
                tantalus_api,
                sequencing,
                storage,
                internal_id,
                tag_name,
                update=update,
                check_library=check_library,
                dry_run=dry_run,
            )

            # check if no import information exists, if so, library does not exist on GSC
            if import_info is None:
                lane_requested_date = sequencing["lane_requested_date"]
                failed_libs.append(
                    dict(
                        dlp_library_id=sequencing["library"],
                        gsc_library_id="None",
                        lane_requested_date=lane_requested_date,
                        error="Doesn't exist on GSC",
                    ))
                continue

            # check if library excluded from import
            elif import_info is False:
                continue

            # update lanes in sequencing
            update_colossus_lane(colossus_api, sequencing, import_info['lanes'])

            # get sequencing object again since it may have been updated with new info
            updated_sequencing = colossus_api.get("sequencing", id=sequencing["id"])

            # check if lanes have been imported
            check_lanes(colossus_api, updated_sequencing, len(updated_sequencing["dlplane_set"]))

            # add lane_requested_date to import info for import status report
            import_info['lane_requested_date'] = sequencing['lane_requested_date']

            # add library to list of successfully imported libraries
            successful_libs.append(import_info)

            # create jira ticket and analyses with new lanes and datasets
            create_tickets_and_analyses(import_info)

        except Exception as e:
            # add lane_requested_date to import info for import status report
            lane_requested_date = sequencing["lane_requested_date"]

            updated_sequencing = colossus_api.get("sequencing", id=sequencing["id"])

            # add library to list of libraries that failed to import
            failed_libs.append(
                dict(
                    dlp_library_id=sequencing["library"],
                    gsc_library_id=updated_sequencing["gsc_library_id"],
                    lane_requested_date=lane_requested_date,
                    error=str(e),
                ))

            logging.exception(f"Library {sequencing['library']} failed to import: {e}")
            continue

    # Only write import statuses for bulk imports
    if all or dlp_library_id is None:
        # Sort lists by date in descending order
        successful_libs.sort(
            key=lambda x: datetime.datetime.strptime(x['lane_requested_date'], '%Y-%m-%d'),
            reverse=True,
        )
        failed_libs.sort(
            key=lambda x: datetime.datetime.strptime(x['lane_requested_date'], '%Y-%m-%d'),
            reverse=True,
        )

        # write import report
        write_import_statuses(successful_libs, failed_libs)
def rename_fastqs(dataset_id, storage_name, dry_run=False, check_only=False):
    logging.info(f'dataset: {dataset_id}')
    logging.info(f'dry run: {dry_run}')

    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    dataset = tantalus_api.get('sequencedataset', id=dataset_id)

    file_instances = tantalus_api.get_dataset_file_instances(
        dataset['id'],
        'sequencedataset',
        storage_name,
    )

    for file_instance in file_instances:
        filename = file_instance['file_resource']['filename']

        if os.path.basename(filename) == 'metadata.yaml':
            continue

        assert len(dataset['sequence_lanes']) == 1

        parts = filename.split('/')
        basename = os.path.basename(filename)

        non_conforming = False
        try:
            assert parts[0] == 'single_cell_indexing'
            assert parts[1] == 'fastq'
            assert parts[2] == dataset['library']['library_id']
            assert parts[3].split('_')[0] == dataset['sequence_lanes'][0]['flowcell_id']
            assert parts[3].split('_')[1] == dataset['sequence_lanes'][0]['lane_number']
            assert parts[4] == dataset['sample']['sample_id']
        except AssertionError:
            non_conforming = True

        if check_only:
            if non_conforming:
                raise Exception(f'filename {filename} does not conform')
            continue

        new_filename = SC_WGS_FQ_TEMPLATE.format(
            dlp_library_id=dataset['library']['library_id'],
            flowcell_id=dataset['sequence_lanes'][0]['flowcell_id'],
            lane_number=dataset['sequence_lanes'][0]['lane_number'],
            cell_sample_id=dataset['sample']['sample_id'],
            cell_filename=basename,
        )

        if new_filename == filename:
            logging.info(f'skipping conforming {filename} on {storage_name}')
            continue

        logging.info(f'renaming {filename} to {new_filename} on {storage_name}')

        if not dry_run:
            if not storage_client.exists(new_filename):
                storage_client.copy(filename, new_filename, wait=True)
            tantalus_api.swap_file(file_instance, new_filename)
            storage_client.delete(filename)
if __name__ == '__main__':
    logging.basicConfig(format=LOGGING_FORMAT, stream=sys.stderr, level=logging.INFO)

    gsc_api = GSCAPI()

    tantalus_api = TantalusApi()

    # List of relevant libraries from GSC lanes
    lanes = list(tantalus_api.list('sequencing_lane', sequencing_centre='GSC'))

    libraries = set()
    for lane in lanes:
        library = tantalus_api.get('dna_library', id=lane['dna_library'])
        if library['library_type'] == 'WGS':
            libraries.add(library['library_id'])

    lane_fixes = []
    for library_id in libraries:
        infos = gsc_api.query("library?name={}".format(library_id))

        if len(infos) == 0:
            logging.warning('unable to find {}'.format(library_id))

        elif len(infos) > 1:
            raise Exception('found {} libraries for {}'.format(len(infos), library_id))
def main(
        storage_name,
        dry_run=False,
        check_remote=None,
):
    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    remote_storage_client = None
    if check_remote is not None:
        remote_storage_client = tantalus_api.get_storage_client(check_remote)

    file_instances = tantalus_api.list('file_instance', storage__name=storage_name, is_deleted=True)

    # DEBUG: check whether we are getting back
    # consistent ordered results from tantalus
    file_instances = list(file_instances)
    file_instance_ids = set([f['id'] for f in file_instances])
    if len(file_instances) != len(file_instance_ids):
        raise Exception('received duplicate results from tantalus')

    logging.info('processing {} file instances'.format(len(file_instance_ids)))
    logging.info('processing the following file instances: {}'.format(str(file_instance_ids)))

    for file_instance in file_instances:
        file_resource = tantalus_api.get('file_resource', id=file_instance['file_resource']['id'])
        all_file_instances = list(tantalus_api.list('file_instance', file_resource=file_resource['id']))

        logging.info('checking file instance {}, file resource {}, filepath {}'.format(
            file_instance['id'], file_resource['id'], file_instance['filepath']))

        sequencedatasets = tantalus_api.list('sequencedataset', file_resources__id=file_resource['id'])
        resultsdatasets = tantalus_api.list('resultsdataset', file_resources__id=file_resource['id'])

        sequencedataset_ids = list(set([a['id'] for a in sequencedatasets]))
        resultsdataset_ids = list(set([a['id'] for a in resultsdatasets]))

        logging.info('file resource {} belongs to sequencedataset {} and resultsdatasets {}'.format(
            file_resource['id'], sequencedataset_ids, resultsdataset_ids))

        # Optionally check for a remote version
        if remote_storage_client:
            remote_instance = None
            for other_instance in file_resource['file_instances']:
                if other_instance['storage']['name'] == check_remote:
                    remote_instance = other_instance

            if not remote_instance:
                logging.info('not deleting file instance {}, no other instance'.format(file_instance['id']))
                continue

            if remote_instance['is_deleted']:
                logging.info('not deleting file instance {}, other instance {} deleted'.format(
                    file_instance['id'], other_instance['id']))
                continue

            if not remote_storage_client.exists(file_resource['filename']):
                logging.info('not deleting file instance {}, other instance {} doesnt exist'.format(
                    file_instance['id'], other_instance['id']))
                continue

            logging.info('deletion ok for file instance {}, found other instance {}'.format(
                file_instance['id'], other_instance['id']))

        # Delete the file from the filesystem
        logging.info('deleting file {}'.format(file_instance['filepath']))
        if not dry_run:
            try:
                storage_client.delete(file_resource['filename'])
            except FileNotFoundError:
                logging.exception('file already deleted')

        # Delete the instance model from tantalus
        logging.info('deleting file instance {}'.format(file_instance['id']))
        if not dry_run:
            tantalus_api.delete('file_instance', id=file_instance['id'])

        # If this is the only file instance for this file resource, delete the file resource
        if len(all_file_instances) == 1:
            assert all_file_instances[0]['id'] == file_instance['id']
            logging.info('deleting file resource {}'.format(file_resource['id']))
            if not dry_run:
                tantalus_api.delete('file_resource', id=file_resource['id'])
def fix_bams(jira_ticket=None, dry_run=False):
    logging.info(f'dry run: {dry_run}')

    tantalus_api = TantalusApi()

    SC_WGS_BAM_DIR_TEMPLATE = os.path.join(
        'single_cell_indexing',
        'bam',
        '{library_id}',
        '{ref_genome}',
        '{aligner_name}',
        'numlanes_{number_lanes}',
        '{jira_ticket}',
    )

    reference_genome_map = {
        'HG19': 'grch37',
        'MM10': 'mm10',
    }

    analyses_list = []
    from_storage_name = "singlecellresults"
    to_storage_name = "singlecellblob"
    from_storage_client = tantalus_api.get_storage_client(from_storage_name)
    to_storage_client = tantalus_api.get_storage_client(to_storage_name)
    to_storage_id = tantalus_api.get('storage', name=to_storage_name)['id']

    if jira_ticket is not None:
        analyses_list.append(
            tantalus_api.get('analysis', jira_ticket=jira_ticket, analysis_type__name="align", status="complete"))
    else:
        # Get all completed align analyses run with specific versions; the bams
        # associated with these analyses are in the wrong storage account
        for version in ('v0.5.2', 'v0.5.3', 'v0.5.4'):
            analyses = tantalus_api.list('analysis', analysis_type__name="align", status="complete", version=version)
            analyses_list += [a for a in analyses]

    for analysis in analyses_list:
        jira_ticket = analysis["jira_ticket"]
        print(f"moving bams for {jira_ticket}")

        # get all bam datasets associated with the jira ticket
        bam_datasets = tantalus_api.list(
            "sequencedataset",
            dataset_type="BAM",
            analysis__jira_ticket=jira_ticket,
        )

        for dataset in bam_datasets:
            # Get number of lanes from dataset for use with filepath
            lanes = set()
            for sequence_lane in dataset['sequence_lanes']:
                lane = "{}_{}".format(sequence_lane['flowcell_id'], sequence_lane['lane_number'])
                lanes.add(lane)
            number_lanes = len(lanes)

            try:
                file_instances = tantalus_api.get_dataset_file_instances(
                    dataset["id"],
                    "sequencedataset",
                    from_storage_name,
                )
            except dbclients.tantalus.DataNotOnStorageError:
                logging.info(f'dataset {dataset["id"]} not on {from_storage_name}, skipping')
                continue

            for file_instance in file_instances:
                blobname = file_instance["file_resource"]["filename"]

                # get url of source blob
                blob_url = from_storage_client.get_url(blobname)

                bam_filename = blobname.split("/bams/")[1]
                new_blobname = os.path.join(
                    SC_WGS_BAM_DIR_TEMPLATE.format(
                        library_id=dataset["library"]["library_id"],
                        ref_genome=reference_genome_map[dataset["reference_genome"]],
                        aligner_name=dataset["aligner"],
                        number_lanes=number_lanes,
                        jira_ticket=jira_ticket,
                    ),
                    bam_filename,
                )

                # copy blob to desired storage account with new blobname
                blob_filepath = f"{to_storage_client.prefix}/{new_blobname}"
                logging.info(f'copying {new_blobname} to storage {to_storage_name} from {blob_url} to {blob_filepath}')

                if not dry_run:
                    to_storage_client.blob_service.copy_blob(
                        container_name="data",
                        blob_name=new_blobname,
                        copy_source=blob_url,
                    )

                file_resource_id = file_instance['file_resource']['id']
                file_instance_id = file_instance['id']

                logging.info(f'updating file resource {file_resource_id} to have filename {new_blobname}')
                if not dry_run:
                    tantalus_api.update('file_resource', id=file_resource_id, filename=new_blobname)

                logging.info(f'updating file instance {file_instance_id} to have storage with id {to_storage_id}')
                if not dry_run:
                    tantalus_api.update('file_instance', id=file_instance_id, storage=to_storage_id)
def import_bam(storage_name,
               bam_file_path,
               sample=None,
               library=None,
               lane_infos=None,
               read_type=None,
               ref_genome=None,
               tag_name=None,
               update=False):
    """
    Imports bam into tantalus

    Args:
        storage_name:   (string) name of destination storage
        bam_file_path:  (string) filepath to bam on destination storage
        sample:         (dict) contains sample_id
        library:        (dict) contains library_id, library_type, index_format
        lane_infos:     (list of dicts) each contains flowcell_id, lane_number,
                        adapter_index_sequence, sequencing_centre, read_type,
                        reference_genome, aligner
        read_type:      (string) read type for the run
        tag_name:       (string)
        update:         (boolean)
    Returns:
        sequence_dataset: (dict) sequence dataset created on tantalus
    """
    tantalus_api = TantalusApi()

    # Get a url allowing access regardless of whether the file
    # is in cloud or local storage
    storage_client = tantalus_api.get_storage_client(storage_name)
    bam_filename = tantalus_api.get_file_resource_filename(storage_name, bam_file_path)
    bam_url = storage_client.get_url(bam_filename)

    bam_header = pysam.AlignmentFile(bam_url).header
    bam_header_info = get_bam_header_info(bam_header)

    if ref_genome is None:
        ref_genome = get_bam_ref_genome(bam_header)

    aligner_name = get_bam_aligner_name(bam_header)

    logging.info(f"bam header shows reference genome {ref_genome} and aligner {aligner_name}")

    bai_file_path = None
    if storage_client.exists(bam_filename + ".bai"):
        bai_file_path = bam_file_path + ".bai"
    else:
        logging.info(f"no bam index found at {bam_filename + '.bai'}")

    # If no sample was specified assume it exists in tantalus and
    # search for it based on header info
    if sample is None:
        if len(bam_header_info["sample_ids"]) != 1:
            raise ValueError(f"found sample_ids={bam_header_info['sample_ids']}, please specify override sample id")
        sample_id = list(bam_header_info["sample_ids"])[0]
        sample = tantalus_api.get('sample', sample_id=sample_id)

    # If no library was specified assume it exists in tantalus and
    # search for it based on header info
    if library is None:
        if len(bam_header_info["library_ids"]) != 1:
            raise ValueError(f"found library_ids={bam_header_info['library_ids']}, please specify override library id")
        library_id = list(bam_header_info["library_ids"])[0]
        library = tantalus_api.get('dna_library', library_id=library_id)

    # Default paired end reads
    if read_type is None:
        read_type = 'P'

    # If no lane infos were specified create them from header info
    if lane_infos is None:
        lane_infos = []
        for lane in bam_header_info["sequence_lanes"]:
            lane_info = {
                "flowcell_id": lane["flowcell_id"],
                "lane_number": lane["lane_number"],
                "library_id": lane["library_id"],
                "sequencing_centre": lane["sequencing_centre"],
                "read_type": read_type,
            }
            lane_infos.append(lane_info)

    # Add the sequence dataset to Tantalus
    sequence_dataset = add_sequence_dataset(
        tantalus_api,
        storage_name=storage_name,
        sample=sample,
        library=library,
        dataset_type="BAM",
        sequence_lanes=lane_infos,
        bam_file_path=bam_file_path,
        reference_genome=ref_genome,
        aligner=aligner_name,
        bai_file_path=bai_file_path,
        tag_name=tag_name,
        update=update,
    )

    return sequence_dataset
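# Hedged usage sketch (illustrative only, not part of the original script): importing
# a bam with explicit sample and library overrides looked up from Tantalus beforehand
# rather than inferred from the bam header. The storage name, path, and identifiers
# are hypothetical placeholders.
def _example_import_bam():
    tantalus_api = TantalusApi()
    sample = tantalus_api.get('sample', sample_id='SA123')          # hypothetical sample
    library = tantalus_api.get('dna_library', library_id='A90123')  # hypothetical library
    dataset = import_bam(
        'singlecellblob',      # hypothetical storage name
        '/path/to/SA123.bam',  # hypothetical bam path on that storage
        sample=sample,
        library=library,
        ref_genome='HG19',
        update=False,
    )
    logging.info('created dataset {}'.format(dataset['id']))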
tantalus_api = TantalusApi()
colossus_api = ColossusApi()

if __name__ == '__main__':
    print("STARTING")
    colossus_analyses = colossus_api.list('analysis_information')
    tantalus_analyses = tantalus_api.list('analysis', analysis_type__name="align")

    analysis_lane_dict = {}

    for analysis in tantalus_analyses:
        lane_set = set()
        for input_dataset in analysis['input_datasets']:
            dataset = tantalus_api.get('sequencedataset', id=input_dataset)
            for lane in dataset['sequence_lanes']:
                lane_set.add(str(lane['flowcell_id'] + "_" + str(lane['lane_number'])))

        analysis_lane_dict[analysis['name']] = lane_set

    print(analysis_lane_dict)

    for analysis in colossus_analyses:
        key = analysis['analysis_jira_ticket'] + '_align'
        if key in analysis_lane_dict.keys():
            lanes = []
            print("ID" + str(analysis['id']))
            for lane in analysis_lane_dict[key]:
                try:
                    print("Getting lane: " + lane)
def add_generic_results(filepaths,
                        storage_name,
                        results_name,
                        results_type,
                        results_version,
                        sample_ids=(),
                        library_ids=(),
                        analysis_pk=None,
                        recursive=False,
                        tag_name=None,
                        update=False,
                        remote_storage_name=None):

    tantalus_api = TantalusApi()

    sample_pks = []
    for sample_id in sample_ids:
        samples = tantalus_api.get(
            "sample",
            sample_id=sample_id,
        )
        sample_pks.append(samples['id'])

    library_pks = []
    for library_id in library_ids:
        librarys = tantalus_api.get(
            "dna_library",
            library_id=library_id,
        )
        library_pks.append(librarys['id'])

    # Add the file resources to tantalus
    file_resource_pks = []
    for filepath in filepaths:
        if recursive:
            logging.info("Recursing directory {}".format(filepath))
            add_filepaths = []
            for (dirpath, dirnames, filenames) in os.walk(filepath):
                for filename in filenames:
                    add_filepaths.append(os.path.join(dirpath, filename))
        else:
            add_filepaths = [filepath]

        for add_filepath in add_filepaths:
            logging.info("Adding file resource for {} to Tantalus".format(add_filepath))
            resource, instance = tantalus_api.add_file(
                storage_name=storage_name,
                filepath=add_filepath,
                update=update,
            )
            file_resource_pks.append(resource["id"])

    results_dataset_fields = dict(
        name=results_name,
        results_type=results_type,
        results_version=results_version,
        analysis=analysis_pk,
        samples=sample_pks,
        libraries=library_pks,
        file_resources=file_resource_pks,
    )

    # Add the dataset to tantalus
    try:
        results_id = tantalus_api.get("results", name=results_dataset_fields["name"])["id"]
    except NotFoundError:
        results_id = None

    if update and results_id is not None:
        logging.warning("results dataset {} exists, updating".format(results_dataset_fields["name"]))
        results_dataset = tantalus_api.update("results", id=results_id, **results_dataset_fields)

    else:
        logging.info("creating results dataset {}".format(results_dataset_fields["name"]))
        results_dataset = tantalus_api.get_or_create("results", **results_dataset_fields)

    if tag_name is not None:
        # tag the dataset that was updated or created; results_id is None when
        # the dataset did not previously exist, so use the returned dataset's id
        tantalus_api.tag(tag_name, resultsdataset_set=[results_dataset['id']])

    logging.info("Successfully created results dataset with ID {}".format(results_dataset["id"]))

    if remote_storage_name is not None:
        transfer_files.transfer_dataset(
            tantalus_api, results_dataset['id'], "resultsdataset", storage_name, remote_storage_name)

    return results_dataset