Example #1
def initialize_data_from_source(self) -> List[drive.DriveObject]:
    """
    Initialize the data as a list of file handlers from the drive folder
    """
    return drive.get_contents_by_folder_id(self.drive_service,
                                           self.folder_id,
                                           only_files=True)
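
Every example on this page is built on get_contents_by_folder_id, whose implementation is not shown. Below is a minimal sketch of what such a helper could look like against the Google Drive v3 files().list endpoint, assuming a simple DriveObject record; the paging and MIME-type filtering match the behavior the tests below exercise, but all names outside the Drive API itself are assumptions.

from collections import namedtuple

# Hypothetical stand-in for the DriveObject record used in these examples.
DriveObject = namedtuple("DriveObject", ["id", "name", "mimeType"])

FOLDER_MIME_TYPE = "application/vnd.google-apps.folder"

def get_contents_by_folder_id(drive_service, folder_id,
                              only_files=False, page_size=100):
    """List the children of a Drive folder, following nextPageToken."""
    query = f"'{folder_id}' in parents and trashed = false"
    if only_files:
        # Folders have a dedicated MIME type; everything else is a file.
        query += f" and mimeType != '{FOLDER_MIME_TYPE}'"

    results, page_token = [], None
    while True:
        response = drive_service.files().list(
            q=query,
            pageSize=page_size,
            pageToken=page_token,
            fields="nextPageToken, files(id, name, mimeType)",
        ).execute()
        results.extend(DriveObject(f["id"], f["name"], f["mimeType"])
                       for f in response.get("files", []))
        page_token = response.get("nextPageToken")
        if page_token is None:
            return results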
Example #2

def get_already_tracked_samples(drive_service, cfg):
    """Return the set of processed qPCR barcodes by checking our marker file folder"""
    marker_folder_id = drive.get_folder_id_of_path(
        drive_service, cfg.ACCESSION_TRACKING_MARKERS_FOLDER)

    marker_folder_contents = drive.get_contents_by_folder_id(drive_service,
                                                             marker_folder_id,
                                                             only_files=True)

    completed_barcodes = {marker_folder_entry.name
                          for marker_folder_entry in marker_folder_contents}
    return completed_barcodes
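
A typical call site only needs membership tests against the returned set. A short usage sketch, where candidate_barcodes is a hypothetical iterable of barcodes parsed from log filenames:

# Hypothetical usage: drop barcodes that already have a marker file.
already_tracked = get_already_tracked_samples(drive_service, cfg)
new_barcodes = [b for b in candidate_barcodes if b not in already_tracked]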
Example #3
def test_get_contents_by_folder_id_types(
    gdrive_service: DriveService,
    gdrive_folder: DriveObject,
    new_folder_name="test_get_contents_by_folder_id_types",
):
    """Verify that get_contents_by_folder_id can filter by only files correctly."""
    subdir = mkdir(gdrive_service, gdrive_folder.id, new_folder_name)

    # put a file and subdirectory there.
    with put_file(gdrive_service, subdir.id, "file") as fh:
        fh.write(b"this is a file")

    mkdir(gdrive_service, subdir.id, "another-subdir")

    results = get_contents_by_folder_id(gdrive_service, subdir.id, only_files=True)
    assert len(results) == 1
    assert results[0].name == "file"

    results = get_contents_by_folder_id(gdrive_service, subdir.id, only_files=False)
    assert len(results) == 2
    assert any(result.name == "file" for result in results)
    assert any(result.name == "another-subdir" for result in results)
Example #4
def fetch_barcodes(args, cfg):
    google_credentials = gutils.get_secrets_manager_credentials(args.secret_id)
    drive_service = drive.get_service(google_credentials)

    # qpcr logs folder
    logs_folder_id = drive.get_folder_id_of_path(drive_service,
                                                 cfg.PCR_LOGS_FOLDER)
    logs_folder_contents = drive.get_contents_by_folder_id(drive_service,
                                                           logs_folder_id,
                                                           only_files=True)

    barcodes_to_fetch = defaultdict(RunFiles)
    for entry in logs_folder_contents:
        m = RunFiles.get_qpcr_file_type(entry.name)
        if m is None:
            continue
        elif m[RunFiles.BARCODE] in args.barcodes:
            barcodes_to_fetch[m[RunFiles.BARCODE]].add_file(m, entry)

    for barcode, barcode_files in barcodes_to_fetch.items():
        # all files must be present, at least one quant_amp file
        if not barcode_files.all_files:
            logger.warning(msg=f"Missing files for {barcode}!")
            continue

        logger.info(msg=f"Found sample to fetch: {barcode}")

        # read in the run information and quant cq
        run_info = barcode_files.run_info
        logger.info(msg=f"    Downloading: {run_info.name}")
        with drive.get_file(drive_service, run_info.id, binary=False) as fh:
            with (args.output_dir / run_info.name).open("w") as out:
                out.write(fh.read())

        quant_cq = barcode_files.quant_cq
        logger.info(msg=f"    Downloading: {quant_cq.name}")
        with drive.get_file(drive_service, quant_cq.id, binary=False) as fh:
            with (args.output_dir / quant_cq.name).open("w") as out:
                out.write(fh.read())

        for quant_amp in barcode_files.quant_amp.values():
            logger.info(msg=f"    Downloading: {quant_amp.name}")
            with drive.get_file(drive_service, quant_amp.id,
                                binary=False) as fh:
                with (args.output_dir / quant_amp.name).open("w") as out:
                    out.write(fh.read())
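
The three download blocks above repeat the same get_file-and-write shape. A hedged refactor, assuming it lives in the same module (so logger and drive are in scope), could fold them into one helper:

def _download_to_dir(drive_service, entry, output_dir):
    """Sketch: copy a text-mode Drive file into output_dir under its own name."""
    logger.info(msg=f"    Downloading: {entry.name}")
    with drive.get_file(drive_service, entry.id, binary=False) as fh:
        with (output_dir / entry.name).open("w") as out:
            out.write(fh.read())

fetch_barcodes would then call _download_to_dir for run_info, quant_cq, and each quant_amp entry.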
Example #5
def test_get_contents_by_folder_id_paging(
    gdrive_service: DriveService,
    gdrive_folder: DriveObject,
    new_folder_name="test_get_contents_by_folder_id_paging",
):
    """Verify that get_contents_by_folder_id can iterate through pages of results correctly."""
    subdir = mkdir(gdrive_service, gdrive_folder.id, new_folder_name)

    # put two files there.
    with put_file(gdrive_service, subdir.id, "file0") as fh:
        fh.write(b"this is a file")
    with put_file(gdrive_service, subdir.id, "file1") as fh:
        fh.write(b"this is a file")

    # even with page_size=1, we should be able to retrieve all the results.
    results = get_contents_by_folder_id(
        gdrive_service, subdir.id, only_files=True, page_size=1
    )
    assert len(results) == 2
Example #6

def get_accession_locations(drive_service, cfg) -> Dict[str, Tuple[str, str]]:
    """Return a mapping from accession IDs to their origin location"""
    accession_locations = {}
    accession_location_folder_id = drive.get_folder_id_of_path(
        drive_service, cfg.ACCESSION_LOCATIONS_FOLDER)
    accession_location_files = drive.get_contents_by_folder_id(
        drive_service, accession_location_folder_id, only_files=True)
    for accession_location_file in accession_location_files:
        with drive.get_file(drive_service, accession_location_file.id) as fh:
            accession_location_reader = csv.reader(fh, delimiter=",")
            for row in accession_location_reader:
                if row[0] == "Accession":
                    # header row
                    continue
                submitter_id = ""
                if len(row) == 3:
                    accession, location, submitter_id = row
                else:
                    accession, location = row
                accession_locations[accession] = location, submitter_id
    return accession_locations
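
The len(row) branch above exists because some files carry a third submitter column. An equivalent inner loop (a sketch, under the same two- or three-column assumption) pads short rows instead, so every row unpacks identically:

for row in accession_location_reader:
    if row[0] == "Accession":
        continue  # header row
    # Pad two-column rows with an empty submitter_id before unpacking.
    accession, location, submitter_id = (row + [""])[:3]
    accession_locations[accession] = location, submitter_id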
Example #7
def test_put_overwrite_multiple(
    gdrive_service: DriveService,
    gdrive_folder: DriveObject,
    filename="test_put_overwrite_multiple.txt",
):
    """Test the case where we are overwriting and there are multiple files we
    could possibly overwrite.  It should overwrite the newest file."""
    put_request = put_file(gdrive_service, gdrive_folder.id, filename)
    with put_request as fh:
        fh.write("first")
    first_id = put_request.id

    put_request = put_file(gdrive_service,
                           gdrive_folder.id,
                           filename,
                           overwrite_if_present=False)
    with put_request as fh:
        fh.write("second")
    second_id = put_request.id

    put_request = put_file(gdrive_service,
                           gdrive_folder.id,
                           filename,
                           overwrite_if_present=True)
    with put_request as fh:
        fh.write("third")
    assert put_request.id == second_id

    listing = get_contents_by_folder_id(gdrive_service,
                                        gdrive_folder.id,
                                        only_files=True)
    matching_listings = [entry for entry in listing if entry.name == filename]
    assert len(matching_listings) == 2

    with get_file(gdrive_service, first_id, True) as fh:
        assert fh.read() == b"first"
    with get_file(gdrive_service, second_id, False) as fh:
        assert fh.read() == "third"
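
The bare booleans passed to get_file above are easy to misread. Elsewhere on this page the same flag is spelled out as binary= (Example #4); assuming that is the third parameter here as well, the two reads are:

# Same two reads, with the flag named (assumes get_file's third
# parameter is `binary`, as in Example #4).
with get_file(gdrive_service, first_id, binary=True) as fh:
    assert fh.read() == b"first"   # bytes when binary=True
with get_file(gdrive_service, second_id, binary=False) as fh:
    assert fh.read() == "third"    # text when binary=False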
Example #8
def test_put_no_overwrite(
    gdrive_service: DriveService,
    gdrive_folder: DriveObject,
    filename="test_put_no_overwrite.txt",
):
    """Puts a file, and then put it again with overwrite_if_present=False.
    Both files should be found."""
    put_request = put_file(gdrive_service, gdrive_folder.id, filename)
    with put_request as fh:
        fh.write("first")

    put_request = put_file(gdrive_service,
                           gdrive_folder.id,
                           filename,
                           overwrite_if_present=False)
    with put_request as fh:
        fh.write("second")

    listing = get_contents_by_folder_id(gdrive_service,
                                        gdrive_folder.id,
                                        only_files=True)
    matching_listings = [entry for entry in listing if entry.name == filename]
    assert len(matching_listings) == 2
Example #9

def test_processing(gdrive_service: DriveService, gdrive_folder: DriveObject):
    """Test to validate the gdrive processing pipeline.  This test has five phases:
    1. Create directories on a test gdrive space for the files that are staged.
    2. Stage the files so that it mimics the production environment.  This includes
       the logs, the plate layout files, and the control layout files.
    3. Run the main processing loop.
    4. Verify that the output csv files are correct.
    5. Verify that the expected markers and PDFs are present.

    If this test fails, verify that all the data necessary to run the pipeline is staged
    correctly."""

    ####################################################################################
    # STEP 1: Create directories on a test gdrive space for the files that are staged.

    # set up the test space
    cfg = AlternateGDriveConfig(gdrive_folder.name)

    # make the necessary input folders
    logs_folder_id = mkdir_recursive(gdrive_service, "root",
                                     cfg.PCR_LOGS_FOLDER)
    plate_layout_folder_id = mkdir_recursive(gdrive_service, "root",
                                             cfg.PLATE_LAYOUT_FOLDER)

    # make the necessary output folders
    mkdir_recursive(gdrive_service, "root", cfg.CSV_RESULTS_FOLDER)
    cb_csv_folder_id = mkdir_recursive(gdrive_service, "root",
                                       cfg.CHINA_BASIN_CSV_REPORTS_FOLDER)
    final_results_folder_id = mkdir_recursive(gdrive_service, "root",
                                              cfg.FINAL_REPORTS_FOLDER)
    markers_folder_id = mkdir_recursive(gdrive_service, "root",
                                        cfg.PCR_MARKERS_FOLDER)

    ####################################################################################
    # STEP 2: Stage the files.

    # copy all the files to the appropriate places
    for filename in LOG_FILES:
        mode = f"r{file_mode(filename)}"
        with (EXAMPLE_FILE_DIR / filename).open(mode) as src_fh, put_file(
                gdrive_service, logs_folder_id, filename) as dst_fh:
            dst_fh.write(src_fh.read())

    for filename in PLATE_LAYOUT_FILES:
        mode = f"r{file_mode(filename)}"
        with (EXAMPLE_FILE_DIR / filename).open(mode) as src_fh, put_file(
                gdrive_service, plate_layout_folder_id, filename) as dst_fh:
            dst_fh.write(src_fh.read())

    ####################################################################################
    # STEP 3: Run the processing pipeline

    processing(cfg, credentials_for_tests())

    ####################################################################################
    # STEP 4: Verify the csv files.

    for remote_filename, local_filename in EXPECTED_CSV_FILES.items():
        with (EXAMPLE_FILE_DIR /
              local_filename).open("r") as t1, get_file_by_name(
                  gdrive_service, cb_csv_folder_id, remote_filename) as t2:
            rdr1 = csv.reader(t1)
            rdr2 = csv.reader(t2)
            for row1, row2 in zip(rdr1, rdr2):
                assert (
                    row1 == row2
                ), f"mismatch between {local_filename} and {remote_filename}"

    ####################################################################################
    # STEP 5: Verify that markers and PDFs were created and uploaded

    marker_folder_contents = get_contents_by_folder_id(gdrive_service,
                                                       markers_folder_id,
                                                       only_files=True)
    marker_set = {
        marker_folder_entry.name
        for marker_folder_entry in marker_folder_contents
    }

    assert marker_set == EXPECTED_MARKERS

    pdf_folder_contents = get_contents_by_folder_id(gdrive_service,
                                                    final_results_folder_id,
                                                    only_files=True)
    pdf_report_set = {
        pdf_folder_entry.name
        for pdf_folder_entry in pdf_folder_contents
    }

    assert pdf_report_set == EXPECTED_PDFS
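
Every test on this page receives gdrive_service and gdrive_folder as fixtures, whose definitions are not included here. A plausible conftest sketch, assuming the credentials_for_tests and mkdir helpers used above plus a get_service wrapper like drive.get_service; a real fixture would likely also tear the scratch folder down afterwards:

import uuid

import pytest

@pytest.fixture
def gdrive_service():
    # Assumption: wire the test credentials into a Drive service object.
    return get_service(credentials_for_tests())

@pytest.fixture
def gdrive_folder(gdrive_service):
    # A uniquely named scratch folder so concurrent runs don't collide.
    return mkdir(gdrive_service, "root", f"test-{uuid.uuid4().hex}")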
Example #10

def get_accession_folder_contents(drive_service, accession_folder_id):
    """Return the contents of the WellLit accessions folder (also includes Hamilton files)"""
    return drive.get_contents_by_folder_id(drive_service,
                                           accession_folder_id,
                                           only_files=True)
Example #11
def processing(cfg: Config, google_credentials: service_account.Credentials):
    git_info = get_git_info()
    drive_service = drive.get_service(google_credentials)
    logger.info(msg=f"Starting processing loop with code version: {git_info}")

    # qpcr logs folder
    logs_folder_id = drive.get_folder_id_of_path(drive_service,
                                                 cfg.PCR_LOGS_FOLDER)

    # markers folder
    markers_folder_id = drive.get_folder_id_of_path(drive_service,
                                                    cfg.PCR_MARKERS_FOLDER)

    # csv results folder
    csv_results_folder_id = drive.get_folder_id_of_path(
        drive_service, cfg.CSV_RESULTS_FOLDER)

    # CB rad results folder
    cb_report_folder_id = drive.get_folder_id_of_path(
        drive_service, cfg.CHINA_BASIN_CSV_REPORTS_FOLDER)

    # final reports folder
    final_results_folder_id = drive.get_folder_id_of_path(
        drive_service, cfg.FINAL_REPORTS_FOLDER)

    # get the collection spreadsheet
    collective_form = CollectiveForm(
        drive_service, cfg["DATA"]["collection_form_spreadsheet_id"])

    logs_folder_contents = drive.get_contents_by_folder_id(drive_service,
                                                           logs_folder_id,
                                                           only_files=True)
    marker_folder_contents = drive.get_contents_by_folder_id(drive_service,
                                                             markers_folder_id,
                                                             only_files=True)
    plate_layout_folder_id = drive.get_folder_id_of_path(
        drive_service, cfg.PLATE_LAYOUT_FOLDER)
    completed_barcodes = {marker_folder_entry.name
                          for marker_folder_entry in marker_folder_contents}

    sample_metadata_form = collective_form[SampleMetadata.SHEET_NAME]
    rerun_form = collective_form[SampleRerun.SHEET_NAME]

    # group log file entries by barcode
    logger.info(msg="Checking for samples to process")

    barcodes_to_process = defaultdict(RunFiles)
    for entry in logs_folder_contents:
        m = RunFiles.get_qpcr_file_type(entry.name)
        if m is None or m[RunFiles.BARCODE] in completed_barcodes:
            continue
        else:
            barcodes_to_process[m[RunFiles.BARCODE]].add_file(m, entry)

    for barcode, barcode_files in barcodes_to_process.items():
        # all files must be present, at least one quant_amp file
        if not barcode_files.all_files:
            message = f"Missing files for: {barcode}. Skipping for now"
            logger.critical(msg=message, extra={"notify_slack": True})
            continue

        try:
            logger.info(msg=f"Found sample to process, barcode: {barcode}")

            logger.info(msg=f"Getting metadata and data for: {barcode}")
            bravo_metadata = BravoMetadata.load_from_spreadsheet(
                barcode,
                collective_form,
            )
            if bravo_metadata.sop_protocol is None:
                message = f"Skipping sample plate: {barcode}, no protocol"
                logger.critical(msg=message, extra={"notify_slack": True})
                continue

            protocol = get_protocol(bravo_metadata.sop_protocol)

            if not set(barcode_files.quant_amp).issuperset(protocol.mapping):
                missing = map(
                    str,
                    set(protocol.mapping) - set(barcode_files.quant_amp))
                message = f"Missing quant amp files for {barcode}: {', '.join(missing)}"
                logger.critical(msg=message, extra={"notify_slack": True})
                continue

            # process well data and check controls, return results
            logger.info(
                msg=f"Processing well data and controls for: {barcode}")
            accession_data = accession.get_accession_data_with_rerun(
                drive_service,
                plate_layout_folder_id,
                sample_metadata_form,
                rerun_form,
                bravo_metadata.sample_barcode,
            )

            control_wells = get_control_wells_from_type(
                controls_type=bravo_metadata.controls_type,
                accession_data=accession_data,
            )
            update_accession_data_with_controls(control_wells, accession_data,
                                                barcode)

            processing_results = process_barcode(
                cfg,
                barcode,
                barcode_files,
                bravo_metadata,
                protocol,
                control_wells,
                accession_data,
            )

            with drive.put_file(
                    drive_service,
                    csv_results_folder_id,
                    processing_results.results_filename,
            ) as fh:
                processing_results.write_results(fh)

            china_basin_result_file = drive.put_file(
                drive_service,
                cb_report_folder_id,
                processing_results.cb_report_filename,
            )
            with china_basin_result_file as fh:
                processing_results.write_cb_report(fh)

            # create pdf report
            logger.info(
                msg=f"Generating and uploading results PDF for: {barcode}")
            final_pdf = io.BytesIO()
            create_final_pdf(processing_results, final_pdf)
            pdf_results_file = drive.put_file(
                drive_service,
                final_results_folder_id,
                processing_results.final_pdf_filename,
            )
            with pdf_results_file as out_fh:
                out_fh.write(final_pdf.getvalue())

            logger.info(msg=f"Sending email report: {barcode}")
            mail.send_email(
                google_credentials,
                sender=cfg["EMAIL"].get("sender"),
                recipients=cfg["EMAIL"].get("recipients"),
                subject=_format_email_subject(
                    sample_barcode=bravo_metadata.sample_barcode,
                    qpcr_barcode=barcode,
                ),
                body=_format_email_body(
                    sample_barcode=bravo_metadata.sample_barcode,
                    results_file_id=china_basin_result_file.id,
                ),
                attachments={processing_results.final_pdf_filename: final_pdf},
            )

            message = (
                f"Processed sample plate: {bravo_metadata.sample_barcode}-{barcode}"
                f" using rev {git_info}")
            logger.critical(msg=message, extra={"notify_slack": True})
            # write a marker so we don't process this file again.
            processing_results.write_marker_file(drive_service,
                                                 markers_folder_id)

        except Exception as err:
            logger.critical(f"Error in [{cfg.aws_env}]: {err}",
                            extra={"notify_slack": True})
            logger.exception("Details:")
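
The loop's idempotency rests on the marker files: completed_barcodes is built from the names in the markers folder, so a barcode is skipped on the next run once its marker exists. write_marker_file is a method on the results object here; a minimal free-function sketch of the equivalent behavior, assuming a marker is simply an empty file named after the qPCR barcode:

# Sketch: only the marker's filename matters, since the loop above reads
# nothing but entry names out of the markers folder.
def write_marker_file(drive_service, markers_folder_id, barcode):
    with drive.put_file(drive_service, markers_folder_id, barcode) as fh:
        fh.write("")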
Example #12
def parse_qpcr_csv(args):
    cfg = Config()
    create_logger(cfg, debug=args.debug)

    logger.info(msg=f"Started local processing in: {args.qpcr_run_path}")

    if args.use_gdrive and not args.barcodes:
        raise ValueError(
            "You must specify barcodes to process from Google Drive")

    run_path = pathlib.Path(args.qpcr_run_path)

    google_credentials = gutils.get_secrets_manager_credentials(args.secret_id)

    drive_service = drive.get_service(google_credentials)
    collective_form = CollectiveForm(
        drive_service, cfg["DATA"]["collection_form_spreadsheet_id"])

    sample_metadata_form = collective_form[SampleMetadata.SHEET_NAME]
    rerun_form = collective_form[SampleRerun.SHEET_NAME]

    if args.use_gdrive:
        logs_folder_id = drive.get_folder_id_of_path(drive_service,
                                                     cfg.PCR_LOGS_FOLDER)
        logs_folder_contents = drive.get_contents_by_folder_id(
            drive_service, logs_folder_id, only_files=True)

        plate_layout_folder_id = drive.get_folder_id_of_path(
            drive_service, cfg.PLATE_LAYOUT_FOLDER)
    else:
        logs_folder_contents = run_path.glob("*.csv")

    barcodes_to_process = defaultdict(RunFiles)
    for run_file in logs_folder_contents:
        m = RunFiles.get_qpcr_file_type(run_file.name)
        if m is None:
            continue
        elif args.barcodes and m[RunFiles.BARCODE] not in args.barcodes:
            continue
        else:
            barcodes_to_process[m[RunFiles.BARCODE]].add_file(m, run_file)

    for barcode, barcode_files in barcodes_to_process.items():
        # all files must be present, at least one quant_amp file
        if not barcode_files.all_files:
            message = f"Missing files for: {barcode}. Skipping for now"
            logger.info(msg=message)
            continue

        logger.info(msg=f"Found sample to process, barcode: {barcode}")

        logger.info(msg=f"Getting metadata and data for: {barcode}")
        bravo_metadata = BravoMetadata.load_from_spreadsheet(
            barcode, collective_form)
        if args.protocol is not None:
            # user specified the protocol
            protocol = get_protocol(args.protocol)
        else:
            protocol = get_protocol(bravo_metadata.sop_protocol)

        if not set(barcode_files.quant_amp).issuperset(protocol.mapping):
            missing = map(str,
                          set(protocol.mapping) - set(barcode_files.quant_amp))
            message = f"Missing quant amp files for {barcode}: {', '.join(missing)}"
            logger.critical(msg=message)
            continue

        if args.plate_map_file is not None:
            plate_map_type = accession.get_plate_map_type_from_name(
                args.plate_map_file.name)
            accession_data = accession.read_accession_data(
                plate_map_type, args.plate_map_file)
        elif args.use_gdrive:
            accession_data = accession.get_accession_data_with_rerun(
                drive_service,
                plate_layout_folder_id,
                sample_metadata_form,
                rerun_form,
                bravo_metadata.sample_barcode,
            )
        else:
            raise ValueError(
                "You must provide a plate map file or use Google Drive")

        control_wells = get_control_wells_from_type(
            controls_type=bravo_metadata.controls_type,
            accession_data=accession_data,
        )
        # check for valid accessions
        update_accession_data_with_controls(control_wells, accession_data,
                                            barcode)

        # process well data and check controls, return results
        logger.info(msg=f"Processing well data and controls for: {barcode}")

        processing_results = process_barcode(
            cfg,
            barcode,
            barcode_files,
            bravo_metadata,
            protocol,
            control_wells,
            accession_data,
        )

        with (run_path / processing_results.results_filename).open("w") as fh:
            processing_results.write_results(fh)

        with (run_path /
              processing_results.cb_report_filename).open("w") as fh:
            processing_results.write_cb_report(fh)

        # create pdf report
        logger.info(msg=f"Generating results PDF for: {barcode}")
        final_pdf_filename = run_path / processing_results.final_pdf_filename
        with open(final_pdf_filename, "wb") as output_file:
            create_final_pdf(processing_results, output_file)
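
parse_qpcr_csv reads qpcr_run_path, secret_id, use_gdrive, barcodes, protocol, plate_map_file, and debug off args, but the parser itself is not shown. A hedged reconstruction covering just those attributes; the flag spellings and defaults are assumptions, not the project's actual CLI:

import argparse
import pathlib

def build_parser() -> argparse.ArgumentParser:
    # Reconstructed from the attributes parse_qpcr_csv uses above.
    parser = argparse.ArgumentParser(description="Process qPCR run CSVs")
    parser.add_argument("qpcr_run_path", help="local directory of run files")
    parser.add_argument("--secret-id", dest="secret_id", required=True)
    parser.add_argument("--use-gdrive", dest="use_gdrive", action="store_true")
    parser.add_argument("--barcodes", nargs="*", default=[])
    parser.add_argument("--protocol", default=None)
    parser.add_argument("--plate-map-file", dest="plate_map_file",
                        type=pathlib.Path, default=None)
    parser.add_argument("--debug", action="store_true")
    return parser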