def add_microscope_results(filepaths, chip_id, library_ids, storage_name, tag_name=None, update=False, remote_storage_name=None):
    """Add microscope image results for a chip to tantalus as a generic results dataset.

    Args:
        filepaths: list of filepaths of the result files to register.
        chip_id: microscope chip identifier, used to name the results dataset.
        library_ids: library ids to associate with the results dataset.
        storage_name: name of the tantalus storage containing the files.
        tag_name: optional tag to apply to the results dataset.
        update: if True, re-process even when results already exist.
        remote_storage_name: optional remote storage to transfer files to.

    Returns:
        The results dataset created by add_generic_results, or None when the
        results already exist and update is False.
    """
    # NOTE(review): colossus_api appears unused here; kept in case the client
    # constructor has required side effects (e.g. credential checks) — confirm.
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    results_name = 'MICROSCOPE_{}'.format(chip_id)
    results_type = 'MICROSCOPE'
    results_version = None

    # Check for previously registered results under the same name.
    try:
        existing_results = tantalus_api.get('results', name=results_name)
    except NotFoundError:
        existing_results = None

    if existing_results is not None and not update:
        logging.info(f'results for {chip_id} exist, not processing')
        return

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=library_ids,
        recursive=True,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )

    # Bug fix: previously the dataset was computed but never returned, so
    # callers could not use or inspect the registered results.
    return results_dataset
def add_cellenone_results(filepaths, library_id, storage_name, tag_name=None, update=False, remote_storage_name=None):
    """Add cellenone results for a library to tantalus as a generic results dataset.

    Args:
        filepaths: list of filepaths of the result files to register.
        library_id: dlp library id, used to name the results dataset.
        storage_name: name of the tantalus storage containing the files.
        tag_name: optional tag to apply to the results dataset.
        update: if True, update existing file resources.
        remote_storage_name: optional remote storage to transfer files to.

    Returns:
        The results dataset created by add_generic_results.
    """
    # NOTE(review): colossus_api appears unused here; kept in case the client
    # constructor has required side effects — confirm.
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    results_name = 'CELLENONE_{}'.format(library_id)
    results_type = 'CELLENONE'
    results_version = None

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=[library_id],
        recursive=True,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )

    # Bug fix: previously the dataset was computed but never returned; the
    # other cellenone ingestion variant in this codebase returns its dataset.
    return results_dataset
def fastq_dlp_index_check(file_info):
    """Validate that fastq index sequences agree with colossus cell indices.

    Raises if any fastq carries an index unknown to colossus, or if any
    colossus index has no fastq on some observed flowcell/lane.
    """
    colossus_api = ColossusApi()

    # Assumption: only 1 library per imported set of fastqs
    library_ids = list({entry['library_id'] for entry in file_info})
    if len(library_ids) != 1:
        raise ValueError('Expected 1 library_id, received {}'.format(library_ids))
    dlp_library_id = library_ids[0]

    cell_samples = query_colossus_dlp_cell_info(colossus_api, dlp_library_id)
    known_indices = set(cell_samples.keys())

    indices_by_lane = collections.defaultdict(set)

    # Every fastq must refer to an index colossus knows about.
    for entry in file_info:
        lane = entry['sequence_lanes'][0]
        if entry['index_sequence'] not in known_indices:
            raise Exception(
                'fastq {} with index {}, flowcell {}, lane {} with index not in colossus'.format(
                    entry['filepath'], entry['index_sequence'],
                    lane['flowcell_id'], lane['lane_number']))
        indices_by_lane[(lane['flowcell_id'], lane['lane_number'])].add(entry['index_sequence'])

    log.info('all fastq files refer to indices known in colossus')

    # Every colossus index must have a fastq on each flowcell/lane seen above.
    for (flowcell_id, lane_number), seen_indices in indices_by_lane.items():
        for index_sequence in known_indices:
            if index_sequence not in seen_indices:
                raise Exception(
                    'no fastq found for index sequence {}, flowcell {}, lane {}'.format(
                        index_sequence, flowcell_id, lane_number))

    log.info('all indices in colossus have fastq files')
def add_cellenone_results(filepaths, library_id, storage_name, tag_name=None, update=False, skip_existing=False, remote_storage_name=None):
    """Register cellenone result files for a library as a tantalus results dataset.

    When skip_existing is True and a CELLENONE results dataset already exists
    for the library, the existing dataset is returned without re-processing.
    """
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    results_name = 'CELLENONE_{}'.format(library_id)
    results_type = 'CELLENONE'
    results_version = None

    # Look up any previously registered dataset with this name and type.
    try:
        previous = tantalus_api.get('resultsdataset', name=results_name, results_type=results_type)
    except NotFoundError:
        previous = None

    if previous is not None and skip_existing:
        return previous

    return add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=[library_id],
        recursive=True,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )
"""Module setup: API clients and the shared 'sisyphus' logger."""
from collections import defaultdict
from workflows.unanalyzed_data import *
import datamanagement.templates as templates
from dbclients.tantalus import TantalusApi
from dbclients.colossus import ColossusApi
from dbclients.basicclient import NotFoundError
from workflows.utils import file_utils
from workflows.utils import saltant_utils
from workflows.utils.colossus_utils import get_ref_genome

# Module-level API clients shared by the functions below.
tantalus_api = TantalusApi()
colossus_api = ColossusApi()

# Configure the 'sisyphus' logger to emit DEBUG and above to stderr.
# NOTE(review): `logging` is not imported explicitly here — presumably it
# comes in via the wildcard import above; confirm, or add `import logging`.
log = logging.getLogger('sisyphus')
log.setLevel(logging.DEBUG)
stream_handler = logging.StreamHandler()
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(formatter)
log.addHandler(stream_handler)
# Avoid duplicate records via the root logger's handlers.
log.propagate = False


def get_sequencings(library_info):
    ''' Given library id (str), return list of sequencings '''
from dbclients.tantalus import TantalusApi
from dbclients.colossus import ColossusApi
import logging

# Module-level API clients used by the script body below.
tantalus_api = TantalusApi()
colossus_api = ColossusApi()

if __name__ == '__main__':
    # NOTE(review): this chunk uses Python 2 print statements and will not
    # run under Python 3, unlike the sibling modules in this codebase.
    print "STARTING"
    colossus_analyses = colossus_api.list('analysis_information')
    tantalus_analyses = tantalus_api.list('analysis', analysis_type__name="align")

    # Map each tantalus align analysis name to the set of
    # "<flowcell_id>_<lane_number>" strings across its input datasets.
    analysis_lane_dict = {}
    for analysis in tantalus_analyses:
        lane_set = set()
        for input_dataset in analysis['input_datasets']:
            dataset = tantalus_api.get('sequencedataset',id=input_dataset)
            for lane in dataset['sequence_lanes']:
                lane_set.add(str(lane['flowcell_id'] + "_" + str(lane['lane_number'])))
        analysis_lane_dict[analysis['name']] = lane_set
    print analysis_lane_dict

    # Cross-reference colossus analyses by their jira-ticket-derived name.
    for analysis in colossus_analyses:
        key = analysis['analysis_jira_ticket'] + '_align'
        if key in analysis_lane_dict.keys():
            lanes = []
def check_indices(library_id=None):
    """Compare sublibrary index sequences between colossus and tantalus.

    Prints one line per library/lane pair describing any mismatch between
    the two index sets, or an OK line when they agree exactly.
    """
    tantalus_api = TantalusApi()
    colossus_api = ColossusApi()

    if library_id is None:
        library_ids = {lib['pool_id'] for lib in colossus_api.list('library')}
    else:
        library_ids = [library_id]

    for library_id in library_ids:
        # Colossus side: index sequences derived from sublibrary primers.
        colossus_indices = {
            sub['primer_i7'] + '-' + sub['primer_i5']
            for sub in colossus_api.list('sublibraries', library__pool_id=library_id)
        }

        # Tantalus side: single-lane fastq datasets, grouped by flowcell/lane.
        datasets = tantalus_api.list(
            'sequence_dataset',
            library__library_id=library_id,
            library__library_type__name='SC_WGS',
            dataset_type='FQ',
        )

        lane_datasets = collections.defaultdict(list)
        for dataset in datasets:
            assert len(dataset['sequence_lanes']) == 1
            lane = dataset['sequence_lanes'][0]
            lane_datasets['_'.join([lane['flowcell_id'], lane['lane_number']])].append(dataset)

        for flowcell_lane, grouped in lane_datasets.items():
            # Collect tantalus indices, dataset ids and sequencing centres
            # across all datasets for this lane.
            tantalus_indices = set()
            tantalus_dataset_ids = []
            centres = set()

            for dataset in grouped:
                resources = list(
                    tantalus_api.list('file_resource', sequencedataset__id=dataset['id']))
                tantalus_indices.update(
                    res['sequencefileinfo']['index_sequence'] for res in resources)
                tantalus_dataset_ids.append(dataset['id'])
                centres.update(l['sequencing_centre'] for l in dataset['sequence_lanes'])

            assert len(centres) == 1
            tantalus_sequencing_centre = list(centres)[0]

            missing_in_tantalus = len(colossus_indices - tantalus_indices)
            missing_in_colossus = len(tantalus_indices - colossus_indices)

            if missing_in_tantalus > 0:
                print(
                    'library {}, datasets {}, lane {}, sequencing_centre {}: {} in colossus but not tantalus'
                    .format(library_id, tantalus_dataset_ids, flowcell_lane,
                            tantalus_sequencing_centre, missing_in_tantalus))

            if missing_in_colossus > 0:
                print(
                    'library {}, datasets {}, lane {}, sequencing_centre {}: {} in tantalus but not colossus'
                    .format(library_id, tantalus_dataset_ids, flowcell_lane,
                            tantalus_sequencing_centre, missing_in_colossus))

            if tantalus_indices == colossus_indices:
                print(
                    'library {}, datasets {}, lane {}, sequencing_centre {}: OK'
                    .format(library_id, tantalus_dataset_ids, flowcell_lane,
                            tantalus_sequencing_centre))
def main(storage_name, dlp_library_id=None, internal_id=None, tag_name=None, all=False, update=False, check_library=False, dry_run=False):
    """Import GSC dlp paired fastqs into tantalus and report import statuses.

    Selects sequencings from colossus (one library, all, or only those still
    expecting lanes), imports each, updates colossus lanes, creates tickets
    and analyses, and finally writes a success/failure report for bulk runs.

    Args:
        storage_name: tantalus storage to import fastqs into.
        dlp_library_id: if given, import only this library.
        internal_id: passed through to the GSC import.
        tag_name: optional tag applied to imported datasets.
        all: if True, import all BCCAGSC sequencings.
        update: passed through to the GSC import.
        check_library: passed through to the GSC import.
        dry_run: if True, do not actually import lanes.
    """
    # Set up the root logger
    logging.basicConfig(format=LOGGING_FORMAT, stream=sys.stderr, level=logging.INFO)

    # Connect to the Tantalus API (this requires appropriate environment)
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    # initiate arrays to store successful and failed libraries
    successful_libs = []
    failed_libs = []

    storage = tantalus_api.get("storage", name=storage_name)
    sequencing_list = list()

    if dry_run:
        logging.info("This is a dry run. No lanes will be imported.")

    # Importing a single library
    if dlp_library_id is not None:
        sequencing_list = list(
            colossus_api.list('sequencing', sequencing_center='BCCAGSC', library__pool_id=dlp_library_id))

    # importing all libraries from the gsc
    elif all:
        sequencing_list = list(
            colossus_api.list('sequencing', sequencing_center='BCCAGSC'))

    # importing only sequencing expecting more lanes
    else:
        sequencing_list = list(
            colossus_api.list('sequencing', sequencing_center='BCCAGSC'))
        sequencing_list = list(
            filter(
                lambda s: s['number_of_lanes_requested'] != len(s[
                    'dlplane_set']), sequencing_list))

    for sequencing in sequencing_list:
        # import library
        try:
            import_info = import_gsc_dlp_paired_fastqs(
                colossus_api,
                tantalus_api,
                sequencing,
                storage,
                internal_id,
                tag_name,
                update=update,
                check_library=check_library,
                dry_run=dry_run,
            )

            # check if no import information exists, if so, library does not exist on GSC
            if import_info is None:
                lane_requested_date = sequencing["lane_requested_date"]
                failed_libs.append(
                    dict(
                        dlp_library_id=sequencing["library"],
                        gsc_library_id="None",
                        lane_requested_date=lane_requested_date,
                        error="Doesn't exist on GSC",
                    ))
                continue

            # check if library excluded from import
            elif import_info is False:
                continue

            # update lanes in sequencing
            update_colossus_lane(colossus_api, sequencing, import_info['lanes'])
            # get sequencing object again since sequencing may have been updated with new info
            updated_sequencing = colossus_api.get("sequencing", id=sequencing["id"])
            # check if lanes have been imported
            check_lanes(colossus_api, updated_sequencing, len(updated_sequencing["dlplane_set"]))

            # add lane_requested_date to import info for import status report
            import_info['lane_requested_date'] = sequencing[
                'lane_requested_date']

            # add library to list of succesfully imported libraries
            successful_libs.append(import_info)

            # create jira ticket and analyses with new lanes and datasets
            create_tickets_and_analyses(import_info)

        except Exception as e:
            # add lane_requested_date to import info for import status report
            lane_requested_date = sequencing["lane_requested_date"]
            updated_sequencing = colossus_api.get("sequencing", id=sequencing["id"])

            # add library to list of libraries that failed to import
            failed_libs.append(
                dict(
                    dlp_library_id=sequencing["library"],
                    gsc_library_id=updated_sequencing["gsc_library_id"],
                    lane_requested_date=lane_requested_date,
                    error=str(e),
                ))
            logging.exception(
                f"Library {sequencing['library']} failed to import: {e}")
            continue

    # Only write import statuses for bulk imports
    if all or dlp_library_id is None:
        # Sort lists by date in descending order
        successful_libs.sort(
            key=lambda x: datetime.datetime.strptime(x['lane_requested_date'],
                                                     '%Y-%m-%d'),
            reverse=True,
        )
        failed_libs.sort(
            key=lambda x: datetime.datetime.strptime(x['lane_requested_date'],
                                                     '%Y-%m-%d'),
            reverse=True,
        )
        # write import report
        write_import_statuses(successful_libs, failed_libs)
def create_lane_fastq_metadata(tantalus_api, dataset_id):
    """Build per-file fastq metadata for the single lane of a sequence dataset.

    Args:
        tantalus_api: connected TantalusApi client.
        dataset_id: id of a tantalus sequencedataset; asserted to have
            exactly one sequence lane.

    Returns:
        Tuple of (metadata, base_dir) where metadata maps filenames to
        cell/read/lane info plus a 'meta' section, and base_dir is the single
        directory containing all files.

    Raises:
        ValueError: on duplicate filenames, or when files span zero or
            multiple directories.
    """
    # NOTE(review): colossus_api appears unused here; kept in case the client
    # constructor has required side effects — confirm.
    colossus_api = ColossusApi()

    dataset = tantalus_api.get("sequencedataset", id=dataset_id)

    library_id = dataset['library']['library_id']
    sample_id = dataset['sample']['sample_id']

    assert len(dataset['sequence_lanes']) == 1
    flowcell_id = dataset['sequence_lanes'][0]['flowcell_id']
    lane_number = dataset['sequence_lanes'][0]['lane_number']

    # Map index sequences to cell ids from the colossus sample sheet.
    sample_info = generate_inputs.generate_sample_info(library_id)
    index_sequence_cell_id = sample_info.set_index(
        'index_sequence')['cell_id'].to_dict()

    metadata = {'files': {}, 'meta': {}}
    metadata['meta']['type'] = DATASET_TYPE
    metadata['meta']['version'] = DATASET_VERSION
    metadata['meta']['sample_id'] = sample_id
    metadata['meta']['library_id'] = library_id

    base_dirs = set()
    cell_ids = set()

    file_resources = list(
        tantalus_api.list('file_resource', sequencedataset__id=dataset['id']))

    for file_resource in file_resources:
        filename = os.path.basename(file_resource['filename'])
        dirname = os.path.dirname(file_resource['filename'])

        # Existing metadata files are not part of the fastq payload.
        if filename.endswith('metadata.yaml'):
            continue

        index_sequence = file_resource['sequencefileinfo']['index_sequence']
        cell_id = index_sequence_cell_id[index_sequence]
        read_end = file_resource['sequencefileinfo']['read_end']

        if filename in metadata['files']:
            # Bug fix: the message was a placeholder-less f-string reading
            # 'duplicate filename (unknown)'; name the offending file.
            raise ValueError(f'duplicate filename {filename}')

        metadata['files'][filename] = {
            'cell_id': cell_id,
            'read_end': read_end,
            'flowcell_id': flowcell_id,
            'lane_number': lane_number,
        }

        base_dirs.add(dirname)
        cell_ids.add(cell_id)

    if len(base_dirs) != 1:
        raise ValueError(
            f'found files in zero or multiple directories {base_dirs}')

    assert not sample_info['cell_id'].duplicated().any()

    # Only describe cells that actually have files in this dataset.
    metadata['meta']['cells'] = {}
    for idx, row in sample_info.iterrows():
        cell_id = row['cell_id']
        if cell_id not in cell_ids:
            continue
        metadata['meta']['cells'][cell_id] = {
            'library_id': row['library_id'],
            'sample_id': row['sample_id'],
            'pick_met': row['pick_met'],
            'condition': row['condition'],
            'sample_type': row['sample_type'],
            'img_col': row['img_col'],
            'row': row['row'],
            'column': row['column'],
            'primer_i5': row['primer_i5'],
            'index_i5': row['index_i5'],
            'primer_i7': row['primer_i7'],
            'index_i7': row['index_i7'],
            'index_sequence': row['index_sequence'],
        }

    metadata['meta']['lanes'] = {
        flowcell_id: {
            lane_number: {
                'sequencing_centre':
                dataset['sequence_lanes'][0]['sequencing_centre'],
                'sequencing_instrument':
                dataset['sequence_lanes'][0]['sequencing_instrument'],
                'sequencing_library_id':
                dataset['sequence_lanes'][0]['sequencing_library_id'],
                'read_type':
                dataset['sequence_lanes'][0]['read_type'],
            }
        }
    }

    return metadata, base_dirs.pop()
def create_lane_fastq_metadata(tantalus_api, library_id):
    """Generate per-lane fastq metadata for all FQ datasets of a library.

    Args:
        tantalus_api: connected TantalusApi client.
        library_id: dlp library id; its FQ datasets are grouped by
            (flowcell_id, lane_number), each dataset asserted single-lane.

    Yields:
        Tuples of (dataset_info, metadata) per lane, where dataset_info
        carries dataset ids, flowcell, lane and base_dir, and metadata maps
        lane-relative filenames to cell/read/lane info plus a 'meta' section.

    Raises:
        ValueError: on duplicate filenames, files spanning zero or multiple
            base directories, or zero or multiple sequence lane ids per group.
    """
    # NOTE(review): colossus_api appears unused here; kept in case the client
    # constructor has required side effects — confirm.
    colossus_api = ColossusApi()

    # Map index sequences to cell ids from the colossus sample sheet.
    sample_info = generate_inputs.generate_sample_info(library_id)
    index_sequence_cell_id = sample_info.set_index(
        'index_sequence')['cell_id'].to_dict()

    datasets = list(
        tantalus_api.list("sequencedataset", dataset_type='FQ', library__library_id=library_id))

    datasets_by_lane = collections.defaultdict(list)
    for dataset in datasets:
        assert len(dataset['sequence_lanes']) == 1
        flowcell_id = dataset['sequence_lanes'][0]['flowcell_id']
        lane_number = dataset['sequence_lanes'][0]['lane_number']
        datasets_by_lane[(flowcell_id, lane_number)].append(dataset)

    for (flowcell_id, lane_number), lane_datasets in datasets_by_lane.items():
        metadata = {'files': {}, 'meta': {}}
        metadata['meta']['type'] = DATASET_TYPE
        metadata['meta']['version'] = DATASET_VERSION

        dataset_ids = set()
        base_dirs = set()
        sequence_lane_ids = set()

        for dataset in lane_datasets:
            file_resources = list(
                tantalus_api.list('file_resource', sequencedataset__id=dataset['id']))

            dataset_ids.add(dataset['id'])
            sequence_lane_ids.add(dataset['sequence_lanes'][0]['id'])

            for file_resource in file_resources:
                filename = os.path.basename(file_resource['filename'])

                # Find common directory as subdirectory ending with flowcell/lane
                flowcell_lane = f'{flowcell_id}_{lane_number}'
                flowcell_idx = file_resource['filename'].index(flowcell_lane + '/')
                flowcell_idx += len(flowcell_lane)
                base_dir = file_resource['filename'][:flowcell_idx]
                filename = file_resource['filename'][flowcell_idx + 1:]
                base_dirs.add(base_dir)

                index_sequence = file_resource['sequencefileinfo']['index_sequence']
                cell_id = index_sequence_cell_id[index_sequence]
                read_end = file_resource['sequencefileinfo']['read_end']

                # Bug fix: the duplicate check previously tested membership in
                # the top-level metadata dict (whose keys are 'files'/'meta'),
                # so it could never fire; check the files mapping instead, and
                # name the offending file (the old message was a
                # placeholder-less f-string reading '(unknown)').
                if filename in metadata['files']:
                    raise ValueError(f'duplicate filename {filename}')

                metadata['files'][filename] = {
                    'cell_id': cell_id,
                    'read_end': read_end,
                    'flowcell_id': flowcell_id,
                    'lane_number': lane_number,
                }

        if len(base_dirs) != 1:
            raise ValueError(
                f'found files in zero or multiple directories {base_dirs}')

        if len(sequence_lane_ids) != 1:
            raise ValueError(
                f'found zero or multiple lanes {sequence_lane_ids}')

        assert not sample_info['cell_id'].duplicated().any()

        metadata['meta']['cells'] = {}
        for idx, row in sample_info.iterrows():
            metadata['meta']['cells'][row['cell_id']] = {
                'library_id': row['library_id'],
                'sample_id': row['sample_id'],
                'pick_met': row['pick_met'],
                'condition': row['condition'],
                'sample_type': row['sample_type'],
                'img_col': row['img_col'],
                'row': row['row'],
                'column': row['column'],
                'primer_i5': row['primer_i5'],
                'index_i5': row['index_i5'],
                'primer_i7': row['primer_i7'],
                'index_i7': row['index_i7'],
                'index_sequence': row['index_sequence'],
            }

        metadata['meta']['lanes'] = {
            flowcell_id: {
                lane_number: {
                    'sequencing_centre':
                    lane_datasets[0]['sequence_lanes'][0]['sequencing_centre'],
                    'sequencing_instrument':
                    lane_datasets[0]['sequence_lanes'][0]['sequencing_instrument'],
                    'sequencing_library_id':
                    lane_datasets[0]['sequence_lanes'][0]['sequencing_library_id'],
                    'read_type':
                    lane_datasets[0]['sequence_lanes'][0]['read_type'],
                }
            }
        }

        dataset_info = {
            'dataset_ids': dataset_ids,
            'flowcell_id': flowcell_id,
            'lane_number': lane_number,
            'base_dir': list(base_dirs)[0],
        }

        yield dataset_info, metadata