import collections

from dbclients.tantalus import TantalusApi
from dbclients.colossus import ColossusApi


def check_indices(library_id=None):
    tantalus_api = TantalusApi()
    colossus_api = ColossusApi()

    if library_id is None:
        library_ids = set([a['pool_id'] for a in colossus_api.list('library')])
    else:
        library_ids = [library_id]

    for library_id in library_ids:
        # Get colossus sublibrary indices
        sublibraries = colossus_api.list('sublibraries', library__pool_id=library_id)
        colossus_indices = set(
            [a['primer_i7'] + '-' + a['primer_i5'] for a in sublibraries])

        datasets = tantalus_api.list(
            'sequence_dataset',
            library__library_id=library_id,
            library__library_type__name='SC_WGS',
            dataset_type='FQ',
        )

        # Group datasets by flowcell/lane; each FQ dataset is expected to
        # cover exactly one lane
        lane_datasets = collections.defaultdict(list)

        for dataset in datasets:
            assert len(dataset['sequence_lanes']) == 1

            flowcell_lane = '_'.join([
                dataset['sequence_lanes'][0]['flowcell_id'],
                dataset['sequence_lanes'][0]['lane_number'],
            ])

            lane_datasets[flowcell_lane].append(dataset)

        for flowcell_lane in lane_datasets:
            # Get tantalus sublibraries and indices
            tantalus_indices = set()
            tantalus_dataset_ids = []
            tantalus_sequencing_centre = set()

            for dataset in lane_datasets[flowcell_lane]:
                file_resources = list(
                    tantalus_api.list('file_resource', sequencedataset__id=dataset['id']))
                tantalus_indices.update(
                    set([
                        a['sequencefileinfo']['index_sequence']
                        for a in file_resources
                    ]))
                tantalus_dataset_ids.append(dataset['id'])
                tantalus_sequencing_centre.update(
                    [a['sequencing_centre'] for a in dataset['sequence_lanes']])

            assert len(tantalus_sequencing_centre) == 1
            tantalus_sequencing_centre = list(tantalus_sequencing_centre)[0]

            # Report indices missing from either side, or confirm agreement
            if len(colossus_indices - tantalus_indices) > 0:
                print(
                    'library {}, datasets {}, lane {}, sequencing_centre {}: {} in colossus but not tantalus'
                    .format(library_id, tantalus_dataset_ids, flowcell_lane,
                            tantalus_sequencing_centre,
                            len(colossus_indices - tantalus_indices)))

            if len(tantalus_indices - colossus_indices) > 0:
                print(
                    'library {}, datasets {}, lane {}, sequencing_centre {}: {} in tantalus but not colossus'
                    .format(library_id, tantalus_dataset_ids, flowcell_lane,
                            tantalus_sequencing_centre,
                            len(tantalus_indices - colossus_indices)))

            if tantalus_indices == colossus_indices:
                print(
                    'library {}, datasets {}, lane {}, sequencing_centre {}: OK'
                    .format(library_id, tantalus_dataset_ids, flowcell_lane,
                            tantalus_sequencing_centre))
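# Usage sketch (assumptions: tantalus/colossus API credentials are configured
# in the environment, and the library id below is a made-up example):
#
#   check_indices('A96213A')   # audit the indices of a single library
#   check_indices()            # audit every library known to colossus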
import logging

from dbclients.tantalus import TantalusApi
from dbclients.colossus import ColossusApi

tantalus_api = TantalusApi()
colossus_api = ColossusApi()

if __name__ == '__main__':
    print("STARTING")

    colossus_analyses = colossus_api.list('analysis_information')
    tantalus_analyses = tantalus_api.list('analysis', analysis_type__name="align")

    # Map each tantalus align analysis to the set of lanes covered by its
    # input datasets
    analysis_lane_dict = {}
    for analysis in tantalus_analyses:
        lane_set = set()
        for input_dataset in analysis['input_datasets']:
            dataset = tantalus_api.get('sequencedataset', id=input_dataset)
            for lane in dataset['sequence_lanes']:
                lane_set.add(str(lane['flowcell_id']) + "_" + str(lane['lane_number']))
        analysis_lane_dict[analysis['name']] = lane_set

    print(analysis_lane_dict)

    # Compare against the corresponding colossus analyses, keyed by jira ticket
    for analysis in colossus_analyses:
        key = analysis['analysis_jira_ticket'] + '_align'
        if key in analysis_lane_dict:
            lanes = []
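# A minimal sketch of how the truncated comparison above might conclude.
# The 'lanes' field on the colossus analysis and its 'flow_cell_id' /
# 'lane_number' keys are assumptions, not confirmed by this script:
#
#   for lane in analysis['lanes']:
#       lanes.append('{}_{}'.format(lane['flow_cell_id'], lane['lane_number']))
#   if set(lanes) != analysis_lane_dict[key]:
#       print('lane mismatch for {}'.format(key))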
import sys
import datetime
import logging

from dbclients.tantalus import TantalusApi
from dbclients.colossus import ColossusApi

# LOGGING_FORMAT, import_gsc_dlp_paired_fastqs, update_colossus_lane,
# check_lanes, create_tickets_and_analyses and write_import_statuses are
# assumed to be defined elsewhere in this module.


def main(storage_name,
         dlp_library_id=None,
         internal_id=None,
         tag_name=None,
         all=False,
         update=False,
         check_library=False,
         dry_run=False):

    # Set up the root logger
    logging.basicConfig(format=LOGGING_FORMAT, stream=sys.stderr, level=logging.INFO)

    # Connect to the Tantalus API (this requires an appropriate environment)
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    # Initiate lists to store successful and failed libraries
    successful_libs = []
    failed_libs = []

    storage = tantalus_api.get("storage", name=storage_name)
    sequencing_list = list()

    if dry_run:
        logging.info("This is a dry run. No lanes will be imported.")

    # Import a single library
    if dlp_library_id is not None:
        sequencing_list = list(
            colossus_api.list(
                'sequencing',
                sequencing_center='BCCAGSC',
                library__pool_id=dlp_library_id))

    # Import all libraries from the GSC
    elif all:
        sequencing_list = list(
            colossus_api.list('sequencing', sequencing_center='BCCAGSC'))

    # Import only sequencings expecting more lanes
    else:
        sequencing_list = list(
            colossus_api.list('sequencing', sequencing_center='BCCAGSC'))
        sequencing_list = list(
            filter(
                lambda s: s['number_of_lanes_requested'] != len(s['dlplane_set']),
                sequencing_list))

    for sequencing in sequencing_list:
        # Import the library
        try:
            import_info = import_gsc_dlp_paired_fastqs(
                colossus_api,
                tantalus_api,
                sequencing,
                storage,
                internal_id,
                tag_name,
                update=update,
                check_library=check_library,
                dry_run=dry_run,
            )

            # If no import information exists, the library does not exist at the GSC
            if import_info is None:
                lane_requested_date = sequencing["lane_requested_date"]
                failed_libs.append(
                    dict(
                        dlp_library_id=sequencing["library"],
                        gsc_library_id="None",
                        lane_requested_date=lane_requested_date,
                        error="Doesn't exist on GSC",
                    ))
                continue

            # Check if the library is excluded from import
            elif import_info is False:
                continue

            # Update lanes in sequencing
            update_colossus_lane(colossus_api, sequencing, import_info['lanes'])

            # Fetch the sequencing object again since it may have been updated
            updated_sequencing = colossus_api.get("sequencing", id=sequencing["id"])

            # Check if all requested lanes have been imported
            check_lanes(colossus_api, updated_sequencing,
                        len(updated_sequencing["dlplane_set"]))

            # Add lane_requested_date to import info for the import status report
            import_info['lane_requested_date'] = sequencing['lane_requested_date']

            # Add the library to the list of successfully imported libraries
            successful_libs.append(import_info)

            # Create jira ticket and analyses with new lanes and datasets
            create_tickets_and_analyses(import_info)

        except Exception as e:
            # Add lane_requested_date to import info for the import status report
            lane_requested_date = sequencing["lane_requested_date"]
            updated_sequencing = colossus_api.get("sequencing", id=sequencing["id"])

            # Add the library to the list of libraries that failed to import
            failed_libs.append(
                dict(
                    dlp_library_id=sequencing["library"],
                    gsc_library_id=updated_sequencing["gsc_library_id"],
                    lane_requested_date=lane_requested_date,
                    error=str(e),
                ))

            logging.exception(f"Library {sequencing['library']} failed to import: {e}")
            continue

    # Only write import statuses for bulk imports
    if all or dlp_library_id is None:
        # Sort lists by date in descending order
        successful_libs.sort(
            key=lambda x: datetime.datetime.strptime(x['lane_requested_date'], '%Y-%m-%d'),
            reverse=True,
        )
        failed_libs.sort(
            key=lambda x: datetime.datetime.strptime(x['lane_requested_date'], '%Y-%m-%d'),
            reverse=True,
        )

        # Write the import report
        write_import_statuses(successful_libs, failed_libs)
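# Minimal entry-point sketch, assuming this script is invoked directly. The
# argument handling below is illustrative only; the original likely wires
# main() to a CLI framework, but only main()'s signature is known from the
# source.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('usage: python import_gsc.py STORAGE_NAME [DLP_LIBRARY_ID]')
    main(sys.argv[1],
         dlp_library_id=sys.argv[2] if len(sys.argv) > 2 else None)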