Ejemplo n.º 1
0
def generate_sample_data_content(files, pipeline_name, pipeline_github,
                                 pipeline_version):
    """
    Build the tab-separated sample data sheet for a set of files.

    The FileRepository is queried for entries whose file path is in ``files``
    and whose file group is either ``settings.IMPORT_FILE_GROUP`` or the group
    with slug "origin-unknown"; one row is written per distinct
    ``metadata__sampleId``.

    :param files: file paths used to select the FileRepository entries
    :param pipeline_name: value written to the PIPELINE column of every row
    :param pipeline_github: value written to the PIPELINE_GITHUB_LINK column
    :param pipeline_version: value written to the PIPELINE_VERSION column
    :return: header line followed by one line per sample, each terminated by
        a newline, as a single string
    :raises KeyError: if a sample's metadata lacks one of the keys read below
    """
    columns = (
        "SAMPLE_ID",
        "REQUEST_ID",
        "PROJECT_ID",
        "PATIENT_ID",
        "COLLAB_ID",
        "SAMPLE_TYPE",
        "GENE_PANEL",
        "ONCOTREE_CODE",
        "SAMPLE_CLASS",
        "SPECIMEN_PRESERVATION_TYPE",
        "SEX",
        "TISSUE_SITE",
        "IGO_ID",
        "PIPELINE",
        "PIPELINE_GITHUB_LINK",
        "PIPELINE_VERSION",
    )
    # One "{}" placeholder per column, so header and rows cannot drift apart.
    row_template = "\t".join(["{}"] * len(columns)) + "\n"

    ret_str = 'metadata__sampleId'
    query = Q(file__file_group_id=settings.IMPORT_FILE_GROUP)
    query |= Q(file__file_group__slug="origin-unknown")
    query = query & Q(file__path__in=files)
    samples = FileRepository.filter(
        q=query).order_by(ret_str).distinct(ret_str).all()

    # Collect the lines and join once instead of growing a string with "+=".
    lines = ["\t".join(columns) + "\n"]
    for sample in samples:
        metadata = sample.metadata
        # Only derive the name from sampleName/specimenType when cmoSampleName
        # is absent; a dict.get() default would be evaluated unconditionally
        # and raise KeyError for samples that have no 'sampleName'.
        if 'cmoSampleName' in metadata:
            sample_name = metadata['cmoSampleName']
        else:
            sample_name = format_sample_name(metadata['sampleName'],
                                             metadata['specimenType'])
        lines.append(row_template.format(
            sample_name,
            metadata['requestId'],
            get_project_id(metadata['requestId']),
            metadata['patientId'],
            metadata['investigatorSampleId'],
            MetadataValidator.clean_value(metadata['sampleClass']),
            MetadataValidator.clean_value(metadata['recipe']),
            MetadataValidator.clean_value(metadata['oncoTreeCode']),
            MetadataValidator.clean_value(metadata['specimenType']),
            MetadataValidator.clean_value(metadata['preservation']),
            MetadataValidator.clean_value(metadata['sex']),
            MetadataValidator.clean_value(metadata['tissueLocation']),
            metadata['sampleId'],
            pipeline_name,
            pipeline_github,
            pipeline_version,
        ))
    return "".join(lines)
Ejemplo n.º 2
0
def format_metadata(original_metadata):
    """
    Return a new metadata dict with the renamed/derived sample fields.

    The input is deep-copied and never modified. Selected keys are removed
    from the copy and re-emitted under new names; every key left in the copy
    is then carried over unchanged, overriding a derived key of the same name.

    :param original_metadata: mapping of source metadata keys to values
    :return: the newly built metadata dict
    """
    remaining = copy.deepcopy(original_metadata)
    cmo_name = remaining.pop("cmoSampleName", None)
    investigator_name = remaining.pop("sampleName", None)
    igo_id = remaining.pop("igoId", None)
    cmo_patient = remaining.pop("cmoPatientId", None)
    cmo_class = remaining.pop("cmoSampleClass", None)
    specimen = remaining.pop("specimenType", None)

    formatted = {
        # ciTag is the new field which needs to be used for the operators
        "ciTag": format_sample_name(cmo_name, specimen),
        "cmoSampleName": format_sample_name(cmo_name, specimen),
        "specimenType": specimen,
        "sampleName": cmo_name,
        "externalSampleId": investigator_name,
        "sampleId": igo_id,
        "patientId": format_patient_id(cmo_patient),
        "sampleClass": cmo_class,
        "sequencingCenter": "MSKCC",
        "platform": "Illumina",
        "libraryId": remaining.pop("libraryIgoId", None),
    }
    # Whatever was not consumed above passes through as-is.
    formatted.update(remaining)
    return formatted
Ejemplo n.º 3
0
def format_metadata(original_metadata):
    """
    Build the formatted metadata dict from a source metadata mapping.

    Works on a deep copy, so the argument is left untouched. The renamed
    fields are taken out of the copy first; all remaining keys are copied
    over at the end and win over a derived key with the same name.

    :param original_metadata: mapping of source metadata keys to values
    :return: a new dict with the derived fields followed by the leftovers
    """
    leftover = copy.deepcopy(original_metadata)
    renamed_keys = ('cmoSampleName', 'sampleName', 'igoId', 'cmoPatientId',
                    'cmoSampleClass', 'specimenType')
    taken = {key: leftover.pop(key, None) for key in renamed_keys}

    metadata = {
        # ciTag is the new field which needs to be used for the operators
        'ciTag': format_sample_name(taken['cmoSampleName'],
                                    taken['specimenType']),
        'cmoSampleName': format_sample_name(taken['cmoSampleName'],
                                            taken['specimenType']),
        'specimenType': taken['specimenType'],
        'sampleName': taken['cmoSampleName'],
        'externalSampleId': taken['sampleName'],
        'sampleId': taken['igoId'],
        'patientId': format_patient_id(taken['cmoPatientId']),
        'sampleClass': taken['cmoSampleClass'],
        'sequencingCenter': 'MSKCC',
        'platform': 'Illumina',
        'libraryId': leftover.pop('libraryIgoId', None),
    }
    # Keys that were not renamed are passed through unchanged.
    metadata.update(leftover)
    return metadata
Ejemplo n.º 4
0
def build_sample(data, ignore_sample_formatting=False):
    """
    Given some data - which is a list of samples originally from the LIMS, split up into one file
    per index - the data is then compiled into one sample dictionary consisting of one or more
    pairs of fastqs

    Note that ID and SM are different field values in ARGOS (RG_ID and ID, respectively, in ARGOS)
    but standardizing it here with what GATK sets bam headers to

    Files are grouped by read group ID (formatted sample name joined with the platform unit).
    Each entry of ``data`` must provide 'id', 'path' and a 'metadata' dict containing every key
    read below; a missing key raises KeyError.

    :param data: iterable of file dictionaries
    :param ignore_sample_formatting: forwarded to format_sample_name
    :return: the per-field lists after being passed through check_and_return_single_values
    """
    # Order matters only for readability of the output dict; it mirrors the historical layout.
    result_keys = (
        'CN', 'PL', 'PU', 'LB', 'tumor_type', 'ID', 'SM', 'species',
        'patient_id', 'bait_set', 'sample_id', 'run_date', 'specimen_type',
        'R1', 'R2', 'R1_bid', 'R2_bid', 'bam', 'bam_bid', 'request_id',
        'pi', 'pi_email', 'run_id', 'preservation_type',
    )

    samples = dict()

    for value in data:
        meta = value['metadata']
        bid = value['id']
        sequencing_center = meta['sequencingCenter']
        platform = meta['platform']
        request_id = meta['requestId']
        fpath = value['path']
        sample_id = meta['sampleId']
        library_id = meta['libraryId']
        bait_set = meta['baitSet']
        tumor_type = meta['tumorOrNormal']
        specimen_type = meta['specimenType']
        species = meta['species']
        cmo_sample_name = format_sample_name(meta['sampleName'], specimen_type,
                                             ignore_sample_formatting)
        if cmo_sample_name == "sampleNameMalformed":
            LOGGER.error("sampleName for %s is malformed", sample_id)
        flowcell_id = meta['flowCellId']
        barcode_index = meta['barcodeIndex']
        cmo_patient_id = meta['patientId']
        platform_unit = flowcell_id
        run_date = meta['runDate']
        r_orientation = meta['R']
        pi_name = meta['labHeadName']
        pi_email = meta['labHeadEmail']
        run_id = meta['runId']
        preservation_type = meta['preservation']
        # Fallback read group ID, kept when the platform unit cannot be joined below.
        rg_id = cmo_sample_name + "_1"
        if barcode_index:
            platform_unit = '_'.join([flowcell_id, barcode_index])
        try:
            rg_id = '_'.join([cmo_sample_name, platform_unit])
        except TypeError:
            # str.join only fails here when platform_unit is not a string
            # (e.g. a missing flowcell id); keep the fallback rg_id.
            LOGGER.info("RG ID couldn't be set.")
            LOGGER.info("Sample ID %s; patient ID %s", sample_id,
                        cmo_patient_id)
            LOGGER.info("SampleName %s; platform unit %s", cmo_sample_name,
                        platform_unit)

        sample = samples.get(rg_id)
        if sample is None:
            # First file seen for this read group: record its descriptive fields.
            sample = {
                'CN': sequencing_center,
                'PL': platform,
                'PU': platform_unit,
                'LB': library_id,
                'tumor_type': tumor_type,
                'ID': rg_id,
                'SM': cmo_sample_name,
                'species': species,
                'patient_id': cmo_patient_id,
                'bait_set': bait_set,
                'sample_id': sample_id,
                'run_date': run_date,
                'specimen_type': specimen_type,
                'request_id': request_id,
                'pi': pi_name,
                'pi_email': pi_email,
                'run_id': run_id,
                'preservation_type': preservation_type,
                'R1': list(),
                'R1_bid': list(),
                'R2': list(),
                'R2_bid': list(),
            }
            samples[rg_id] = sample

        # fastq pairing assumes flowcell id + barcode index are unique per run
        if 'R1' in r_orientation:
            sample['R1'].append(fpath)
            sample['R1_bid'].append(bid)
        elif 'R2' in r_orientation:
            sample['R2'].append(fpath)
            sample['R2_bid'].append(bid)
        else:
            # Neither R1 nor R2: the file is stored as the sample's bam.
            sample['bam'] = fpath
            sample['bam_bid'] = bid
    check_samples(samples)

    result = {key: list() for key in result_keys}

    for sample in samples.values():
        for key, entry in sample.items():
            if 'R1' in key or 'R2' in key:
                # R1/R2 (and their *_bid lists) are flattened across read groups.
                result[key].extend(entry)
            else:
                result[key].append(entry)
    result = check_and_return_single_values(result)

    return result
Ejemplo n.º 5
0
def fetch_samples(
    request_id,
    import_pooled_normals=True,
    import_samples=True,
    job_group=None,
    job_group_notifier=None,
    redelivery=False,
):
    """
    Fetch the samples of a LIMS request and create one import job per item.

    :param request_id: request to look up through LIMSClient
    :param import_pooled_normals: when true, create a job for every entry of
        the response's "pooledNormals" list
    :param import_samples: when true, create a job for every entry of the
        response's "samples" list
    :param job_group: id used to look up the JobGroup (may not exist)
    :param job_group_notifier: id used to look up the JobGroupNotifier (may
        not exist); also passed to the recipe notification event
    :param redelivery: forwarded to the job-creation helpers
    :return: list of the created jobs' ids as strings (unordered, de-duplicated)
    :raises ErrorInconsistentDataException: the LIMS response is for a
        different request id
    :raises FailedToFetchSampleException: ``import_samples`` is true but the
        response reports no samples
    :raises KeyError: the LIMS response lacks one of the request-level fields
    """
    request_fields = (
        "dataAnalystEmail",
        "dataAnalystName",
        "investigatorEmail",
        "investigatorName",
        "labHeadEmail",
        "labHeadName",
        "otherContactEmails",
        "dataAccessEmails",
        "qcAccessEmails",
        "projectManagerName",
        "recipe",
        "piEmail",
    )

    logger.info("Fetching sampleIds for requestId:%s", request_id)
    jg = None
    jgn = None
    try:
        jg = JobGroup.objects.get(id=job_group)
        logger.debug("JobGroup found")
    except JobGroup.DoesNotExist:
        logger.debug("No JobGroup Found")
    try:
        jgn = JobGroupNotifier.objects.get(id=job_group_notifier)
        logger.debug("JobGroupNotifier found")
    except JobGroupNotifier.DoesNotExist:
        logger.debug("No JobGroupNotifier Found")

    children = set()
    sample_ids = LIMSClient.get_request_samples(request_id)
    if sample_ids["requestId"] != request_id:
        raise ErrorInconsistentDataException(
            "LIMS returned wrong response for request %s. Got %s instead" %
            (request_id, sample_ids["requestId"]))
    request_metadata = {field: sample_ids[field] for field in request_fields}

    set_recipe_event = ETLSetRecipeEvent(job_group_notifier,
                                         request_metadata["recipe"]).to_dict()
    send_notification.delay(set_recipe_event)

    pooled_normals = sample_ids.get("pooledNormals", [])
    if import_pooled_normals and pooled_normals:
        for f in pooled_normals:
            job = get_or_create_pooled_normal_job(f,
                                                  jg,
                                                  jgn,
                                                  redelivery=redelivery)
            children.add(str(job.id))

    if import_samples:
        if not sample_ids.get("samples", False):
            raise FailedToFetchSampleException(
                "No samples reported for requestId: %s" % request_id)

        for sample in sample_ids.get("samples", []):
            igo_sample_id = sample["igoSampleId"]
            sample_manifest = LIMSClient.get_sample_manifest(igo_sample_id)
            try:
                data = sample_manifest[0]
            except (IndexError, KeyError, TypeError):
                # Previously this was swallowed and the code went on to use
                # `data` anyway: unbound on the first sample, or silently the
                # previous sample's manifest afterwards. Skip the record
                # bookkeeping instead; the import job is still created.
                # NOTE(review): presumably the job reports the missing
                # manifest itself - confirm against create_sample_job.
                logger.warning(
                    "No manifest returned for sample %s (request %s); "
                    "skipping Patient/Sample record creation",
                    igo_sample_id, request_id)
                data = None

            if data is not None:
                patient_id = format_patient_id(data.get("cmoPatientId"))
                if not Patient.objects.filter(patient_id=patient_id).exists():
                    Patient.objects.create(patient_id=patient_id)

                sample_name = data.get("cmoSampleName", None)
                specimen_type = data.get("specimenType", None)
                cmo_sample_name = format_sample_name(sample_name,
                                                     specimen_type)

                if not Sample.objects.filter(
                        sample_id=igo_sample_id,
                        sample_name=sample_name,
                        cmo_sample_name=cmo_sample_name).exists():
                    Sample.objects.create(sample_id=igo_sample_id,
                                          sample_name=sample_name,
                                          cmo_sample_name=cmo_sample_name)

            job = create_sample_job(igo_sample_id,
                                    sample["igoComplete"], request_id,
                                    request_metadata, redelivery, jg, jgn)
            children.add(str(job.id))
    return list(children)
Ejemplo n.º 6
0
def build_sample(data, ignore_sample_formatting=False):
    """
    Given some data - which is a list of samples originally from the LIMS, split up into one file
    per index - the data is then compiled into one sample dictionary consisting of one or more
    pairs of fastqs

    Note that ID and SM are different field values in ARGOS (RG_ID and ID, respectively, in ARGOS)
    but standardizing it here with what GATK sets bam headers to

    Files are grouped by read group ID (formatted sample name joined with the platform unit).
    Each entry of ``data`` must provide "id", "path" and a "metadata" dict containing every key
    read below; a missing key raises KeyError.

    :param data: iterable of file dictionaries
    :param ignore_sample_formatting: forwarded to format_sample_name
    :return: the per-field lists after being passed through check_and_return_single_values
    """
    # Order matters only for readability of the output dict; it mirrors the historical layout.
    result_keys = (
        "CN", "PL", "PU", "LB", "tumor_type", "ID", "SM", "species",
        "patient_id", "bait_set", "sample_id", "run_date", "specimen_type",
        "R1", "R2", "R1_bid", "R2_bid", "bam", "bam_bid", "request_id",
        "pi", "pi_email", "run_id", "preservation_type",
    )

    samples = dict()

    for value in data:
        meta = value["metadata"]
        bid = value["id"]
        sequencing_center = meta["sequencingCenter"]
        platform = meta["platform"]
        request_id = meta["requestId"]
        fpath = value["path"]
        sample_id = meta["sampleId"]
        library_id = meta["libraryId"]
        bait_set = meta["baitSet"]
        tumor_type = meta["tumorOrNormal"]
        specimen_type = meta["specimenType"]
        species = meta["species"]
        cmo_sample_name = format_sample_name(meta["sampleName"], specimen_type,
                                             ignore_sample_formatting)
        if cmo_sample_name == "sampleNameMalformed":
            LOGGER.error("sampleName for %s is malformed", sample_id)
        flowcell_id = meta["flowCellId"]
        barcode_index = meta["barcodeIndex"]
        cmo_patient_id = meta["patientId"]
        platform_unit = flowcell_id
        run_date = meta["runDate"]
        r_orientation = meta["R"]
        pi_name = meta["labHeadName"]
        pi_email = meta["labHeadEmail"]
        run_id = meta["runId"]
        preservation_type = meta["preservation"]
        # Fallback read group ID, kept when the platform unit cannot be joined below.
        rg_id = cmo_sample_name + "_1"
        if barcode_index:
            platform_unit = "_".join([flowcell_id, barcode_index])
        try:
            rg_id = "_".join([cmo_sample_name, platform_unit])
        except TypeError:
            # str.join only fails here when platform_unit is not a string
            # (e.g. a missing flowcell id); keep the fallback rg_id.
            LOGGER.info("RG ID couldn't be set.")
            LOGGER.info("Sample ID %s; patient ID %s", sample_id,
                        cmo_patient_id)
            LOGGER.info("SampleName %s; platform unit %s", cmo_sample_name,
                        platform_unit)

        sample = samples.get(rg_id)
        if sample is None:
            # First file seen for this read group: record its descriptive fields.
            sample = {
                "CN": sequencing_center,
                "PL": platform,
                "PU": platform_unit,
                "LB": library_id,
                "tumor_type": tumor_type,
                "ID": rg_id,
                "SM": cmo_sample_name,
                "species": species,
                "patient_id": cmo_patient_id,
                "bait_set": bait_set,
                "sample_id": sample_id,
                "run_date": run_date,
                "specimen_type": specimen_type,
                "request_id": request_id,
                "pi": pi_name,
                "pi_email": pi_email,
                "run_id": run_id,
                "preservation_type": preservation_type,
                "R1": list(),
                "R1_bid": list(),
                "R2": list(),
                "R2_bid": list(),
            }
            samples[rg_id] = sample

        # fastq pairing assumes flowcell id + barcode index are unique per run
        if "R1" in r_orientation:
            sample["R1"].append(fpath)
            sample["R1_bid"].append(bid)
        elif "R2" in r_orientation:
            sample["R2"].append(fpath)
            sample["R2_bid"].append(bid)
        else:
            # Neither R1 nor R2: the file is stored as the sample's bam.
            sample["bam"] = fpath
            sample["bam_bid"] = bid
    check_samples(samples)

    result = {key: list() for key in result_keys}

    for sample in samples.values():
        for key, entry in sample.items():
            if "R1" in key or "R2" in key:
                # R1/R2 (and their *_bid lists) are flattened across read groups.
                result[key].extend(entry)
            else:
                result[key].append(entry)
    result = check_and_return_single_values(result)

    return result
Ejemplo n.º 7
0
def build_sample(data, ignore_sample_formatting=False):
    """
    Given some data - which is a list of samples originally from the LIMS, split up into one file
    per index - the data is then compiled into one sample dictionary consisting of one or more
    pairs of fastqs

    Note that ID and SM are different field values in ARGOS (RG_ID and ID, respectively, in ARGOS)
    but standardizing it here with what GATK sets bam headers to

    Files are grouped by sample id. Fastq files are queued per sample and handed to a Fastqs
    object, which supplies R1/R2, their bids, PU and ID; any other file is recorded as the
    sample's bam. Each entry of ``data`` must provide "id", "path" and a "metadata" dict
    containing every key read below; a missing key raises KeyError.

    :param data: iterable of file dictionaries
    :param ignore_sample_formatting: forwarded to format_sample_name
    :return: the per-field values after being passed through check_and_return_single_values
    """
    result_keys = (
        "CN", "PL", "PU", "LB", "tumor_type", "ID", "SM", "species",
        "patient_id", "bait_set", "sample_id", "run_date", "specimen_type",
        "R1", "R2", "R1_bid", "R2_bid", "bam", "bam_bid", "request_id",
        "pi", "pi_email", "run_id", "preservation_type", "run_mode",
    )

    samples = dict()

    for value in data:
        fpath = value["path"]
        curr_file = get_file(fpath)
        meta = value["metadata"]
        bid = value["id"]
        sequencing_center = meta["sequencingCenter"]
        platform = meta["platform"]
        request_id = meta["requestId"]
        sample_id = meta["sampleId"]
        library_id = meta["libraryId"]
        bait_set = meta["baitSet"]
        tumor_type = meta["tumorOrNormal"]
        specimen_type = meta["specimenType"]
        species = meta["species"]
        cmo_sample_name = format_sample_name(meta["sampleName"], specimen_type,
                                             ignore_sample_formatting)
        if cmo_sample_name == "sampleNameMalformed":
            LOGGER.error("sampleName for %s is malformed", sample_id)
        flowcell_id = meta["flowCellId"]
        barcode_index = meta["barcodeIndex"]
        cmo_patient_id = meta["patientId"]
        platform_unit = flowcell_id
        run_date = meta["runDate"]
        r_orientation = meta["R"]
        pi_name = meta["labHeadName"]
        pi_email = meta["labHeadEmail"]
        run_id = meta["runId"]
        preservation_type = meta["preservation"]
        # Fallback read group ID, kept when the platform unit cannot be joined below.
        rg_id = cmo_sample_name + "_1"
        run_mode = get_run_mode(meta["runMode"])
        if barcode_index:
            platform_unit = "_".join([flowcell_id, barcode_index])
        try:
            rg_id = "_".join([cmo_sample_name, platform_unit])
        except TypeError:
            # str.join only fails here when platform_unit is not a string
            # (e.g. a missing flowcell id); keep the fallback rg_id.
            LOGGER.info("RG ID couldn't be set.")
            LOGGER.info("Sample ID %s; patient ID %s", sample_id,
                        cmo_patient_id)
            LOGGER.info("SampleName %s; platform unit %s", cmo_sample_name,
                        platform_unit)

        sample = samples.get(sample_id)
        if sample is None:
            # First file seen for this sample. Key order is significant: "fastqs" must come
            # after "PU", "ID", "R1*" and "R2*" because the aggregation loop below lets the
            # Fastqs-derived values replace whatever those keys appended earlier.
            sample = {
                "CN": sequencing_center,
                "PL": platform,
                "PU": list(),
                "LB": library_id,
                "tumor_type": tumor_type,
                "SM": cmo_sample_name,
                "species": species,
                "patient_id": cmo_patient_id,
                "bait_set": bait_set,
                "sample_id": sample_id,
                "run_date": run_date,
                "specimen_type": specimen_type,
                "request_id": request_id,
                "pi": pi_name,
                "pi_email": pi_email,
                "run_id": run_id,
                "preservation_type": preservation_type,
                "ID": list(),
                "R1": list(),
                "R1_bid": list(),
                "R2": list(),
                "R2_bid": list(),
                "fastqs": list(),
                "run_mode": run_mode,
            }
            samples[sample_id] = sample

        # Queueing up fastqs for pairing later; RG ID and PU
        # will be assigned based on Fastqs object
        if "R1" in r_orientation or "R2" in r_orientation:
            sample["fastqs"].append(curr_file)
        else:
            # DMP bams found; assigning RG ID and PU here
            # There will always be only one DMP bam, so assign explicitly
            sample["bam"] = fpath
            sample["bam_bid"] = bid
            sample["PU"] = platform_unit
            sample["ID"] = rg_id

    result = {key: list() for key in result_keys}

    for sample in samples.values():
        for key, entry in sample.items():
            if key == "fastqs":
                if entry:
                    fastqs = Fastqs(sample["SM"], entry)
                    # NOTE(review): these assignments replace (not extend) the result lists,
                    # so with more than one sample in `data` only the last sample's fastq
                    # fields survive. Presumably callers pass a single sample - confirm.
                    result["R1"] = fastqs.r1
                    result["R1_bid"] = fastqs.r1_bids
                    result["R2"] = fastqs.r2
                    result["R2_bid"] = fastqs.r2_bids
                    result["PU"] = fastqs.pu
                    result["ID"] = fastqs.rg_id
            else:
                result[key].append(entry)
    result = check_and_return_single_values(result)
    return result