Example 1
import collections

from dbclients.tantalus import TantalusApi
from dbclients.colossus import ColossusApi


def check_indices(library_id=None):
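    """Compare sublibrary index sequences between Colossus and Tantalus.

    For each library (or only `library_id` if given), collect the i7-i5
    primer index pairs recorded in the Colossus sublibraries and compare
    them, per flowcell lane, against the index sequences attached to the
    library's SC_WGS FQ datasets in Tantalus, reporting any mismatches.
    """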
    tantalus_api = TantalusApi()
    colossus_api = ColossusApi()

    if library_id is None:
        library_ids = set([a['pool_id'] for a in colossus_api.list('library')])

    else:
        library_ids = [library_id]

    for library_id in library_ids:

        # Get colossus sublibrary indices
        sublibraries = colossus_api.list('sublibraries',
                                         library__pool_id=library_id)
        colossus_indices = set(
            [a['primer_i7'] + '-' + a['primer_i5'] for a in sublibraries])

        datasets = tantalus_api.list(
            'sequence_dataset',
            library__library_id=library_id,
            library__library_type__name='SC_WGS',
            dataset_type='FQ',
        )

        # Group this library's FQ datasets by flowcell and lane
        lane_datasets = collections.defaultdict(list)

        for dataset in datasets:

            assert len(dataset['sequence_lanes']) == 1

            flowcell_lane = '_'.join([
                dataset['sequence_lanes'][0]['flowcell_id'],
                str(dataset['sequence_lanes'][0]['lane_number']),
            ])

            lane_datasets[flowcell_lane].append(dataset)

        for flowcell_lane in lane_datasets:

            # Get tantalus sublibraries and indices
            tantalus_indices = set()
            tantalus_dataset_ids = []
            tantalus_sequencing_centre = set()
            for dataset in lane_datasets[flowcell_lane]:
                file_resources = list(
                    tantalus_api.list('file_resource',
                                      sequencedataset__id=dataset['id']))
                tantalus_indices.update(
                    set([
                        a['sequencefileinfo']['index_sequence']
                        for a in file_resources
                    ]))
                tantalus_dataset_ids.append(dataset['id'])
                tantalus_sequencing_centre.update([
                    a['sequencing_centre'] for a in dataset['sequence_lanes']
                ])

            assert len(tantalus_sequencing_centre) == 1
            tantalus_sequencing_centre = list(tantalus_sequencing_centre)[0]

            if len(colossus_indices - tantalus_indices) > 0:
                print(
                    'library {}, datasets {}, lane {}, sequencing_centre {}: {} in colossus but not tantalus'
                    .format(library_id, tantalus_dataset_ids, flowcell_lane,
                            tantalus_sequencing_centre,
                            len(colossus_indices - tantalus_indices)))

            if len(tantalus_indices - colossus_indices) > 0:
                print(
                    'library {}, datasets {}, lane {}, sequencing_centre {}: {} in tantalus but not colossus'
                    .format(library_id, tantalus_dataset_ids, flowcell_lane,
                            tantalus_sequencing_centre,
                            len(tantalus_indices - colossus_indices)))

            if tantalus_indices == colossus_indices:
                print(
                    'library {}, datasets {}, lane {}, sequencing_centre {}: OK'
                    .format(library_id, tantalus_dataset_ids, flowcell_lane,
                            tantalus_sequencing_centre))
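

# Usage sketch (illustration only; the library id below is hypothetical):
#
#     check_indices('A96213A')   # audit a single library
#     check_indices()            # audit every library known to Colossus
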
import logging

from dbclients.tantalus import TantalusApi
from dbclients.colossus import ColossusApi


tantalus_api = TantalusApi()
colossus_api = ColossusApi()


if __name__ == '__main__':
    print "STARTING"
    colossus_analyses = colossus_api.list('analysis_information')
    tantalus_analyses = tantalus_api.list('analysis', analysis_type__name="align")

    # Map each Tantalus align analysis name to the set of flowcell lanes
    # covered by its input datasets
    analysis_lane_dict = {}

    for analysis in tantalus_analyses:
        lane_set = set()
        for input_dataset in analysis['input_datasets']:
            dataset = tantalus_api.get('sequencedataset', id=input_dataset)
            for lane in dataset['sequence_lanes']:
                lane_set.add(lane['flowcell_id'] + "_" + str(lane['lane_number']))

        analysis_lane_dict[analysis['name']] = lane_set

    print(analysis_lane_dict)

    for analysis in colossus_analyses:
        key = analysis['analysis_jira_ticket'] + '_align'
        if key in analysis_lane_dict:
            lanes = []
import datetime
import logging
import sys

# LOGGING_FORMAT and the GSC import helpers (import_gsc_dlp_paired_fastqs,
# update_colossus_lane, check_lanes, create_tickets_and_analyses,
# write_import_statuses) are assumed to be defined elsewhere in the repository.


def main(storage_name,
         dlp_library_id=None,
         internal_id=None,
         tag_name=None,
         all=False,
         update=False,
         check_library=False,
         dry_run=False):
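    """Import GSC DLP paired FASTQs into Tantalus for Colossus sequencings.

    Imports a single library (`dlp_library_id`), all BCCAGSC sequencings
    (`all=True`), or only sequencings still expecting lanes (the default),
    then updates the lanes in Colossus, creates tickets and analyses, and
    writes an import status report for bulk imports.
    """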

    # Set up the root logger
    logging.basicConfig(format=LOGGING_FORMAT,
                        stream=sys.stderr,
                        level=logging.INFO)

    # Connect to the Colossus and Tantalus APIs (requires appropriate environment variables)
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    # Initialize lists to store successful and failed libraries
    successful_libs = []
    failed_libs = []

    storage = tantalus_api.get("storage", name=storage_name)
    sequencing_list = list()

    if dry_run:
        logging.info("This is a dry run. No lanes will be imported.")

    # Importing a single library
    if dlp_library_id is not None:
        sequencing_list = list(
            colossus_api.list('sequencing',
                              sequencing_center='BCCAGSC',
                              library__pool_id=dlp_library_id))
    # Importing all libraries from the GSC
    elif all:
        sequencing_list = list(
            colossus_api.list('sequencing', sequencing_center='BCCAGSC'))
    # Importing only sequencings still expecting more lanes
    else:
        sequencing_list = list(
            colossus_api.list('sequencing', sequencing_center='BCCAGSC'))
        sequencing_list = [
            s for s in sequencing_list
            if s['number_of_lanes_requested'] != len(s['dlplane_set'])
        ]

    for sequencing in sequencing_list:
        # import library
        try:
            import_info = import_gsc_dlp_paired_fastqs(
                colossus_api,
                tantalus_api,
                sequencing,
                storage,
                internal_id,
                tag_name,
                update=update,
                check_library=check_library,
                dry_run=dry_run,
            )

            # If no import information exists, the library does not exist on the GSC
            if import_info is None:
                lane_requested_date = sequencing["lane_requested_date"]
                failed_libs.append(
                    dict(
                        dlp_library_id=sequencing["library"],
                        gsc_library_id="None",
                        lane_requested_date=lane_requested_date,
                        error="Doesn't exist on GSC",
                    ))
                continue

            # check if library excluded from import
            elif import_info is False:
                continue

            # update lanes in sequencing
            update_colossus_lane(colossus_api, sequencing,
                                 import_info['lanes'])
            # Fetch the sequencing object again since it may have been updated with new info
            updated_sequencing = colossus_api.get("sequencing",
                                                  id=sequencing["id"])
            # check if lanes have been imported
            check_lanes(colossus_api, updated_sequencing,
                        len(updated_sequencing["dlplane_set"]))

            # add lane_requested_date to import info for import status report
            import_info['lane_requested_date'] = sequencing[
                'lane_requested_date']

            # add library to list of successfully imported libraries
            successful_libs.append(import_info)

            # create jira ticket and analyses with new lanes and datasets
            create_tickets_and_analyses(import_info)

        except Exception as e:
            # record lane_requested_date for the import status report
            lane_requested_date = sequencing["lane_requested_date"]
            updated_sequencing = colossus_api.get("sequencing",
                                                  id=sequencing["id"])
            # add library to list of libraries that failed to import
            failed_libs.append(
                dict(
                    dlp_library_id=sequencing["library"],
                    gsc_library_id=updated_sequencing["gsc_library_id"],
                    lane_requested_date=lane_requested_date,
                    error=str(e),
                ))

            logging.exception(
                f"Library {sequencing['library']} failed to import: {e}")
            continue

    # Only write import statuses for bulk imports
    if all or dlp_library_id is None:
        # Sort lists by date in descending order
        successful_libs.sort(
            key=lambda x: datetime.datetime.strptime(x['lane_requested_date'],
                                                     '%Y-%m-%d'),
            reverse=True,
        )
        failed_libs.sort(
            key=lambda x: datetime.datetime.strptime(x['lane_requested_date'],
                                                     '%Y-%m-%d'),
            reverse=True,
        )
        # write import report
        write_import_statuses(successful_libs, failed_libs)
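

# A minimal sketch of a command-line entry point for main(); this argparse
# wrapper is an assumption for illustration, not the repository's actual CLI.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('storage_name')
    parser.add_argument('--dlp_library_id')
    parser.add_argument('--internal_id')
    parser.add_argument('--tag_name')
    parser.add_argument('--all', action='store_true')
    parser.add_argument('--update', action='store_true')
    parser.add_argument('--check_library', action='store_true')
    parser.add_argument('--dry_run', action='store_true')
    args = parser.parse_args()

    main(
        args.storage_name,
        dlp_library_id=args.dlp_library_id,
        internal_id=args.internal_id,
        tag_name=args.tag_name,
        all=args.all,
        update=args.update,
        check_library=args.check_library,
        dry_run=args.dry_run,
    )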