def test_get_oncotree_code_mappings():
    """get_oncotree_code_mappings fetches the oncotree JSON via retry_get_url
    and returns the expected code mapping."""
    from genie import process_functions
    with mock.patch("genie.process_functions.retry_get_url",
                    return_value=fake_oncotree) as retry_get_url:
        onco_mapping = \
            process_functions.get_oncotree_code_mappings(json_oncotreeurl)
        # Bug fix: `called_once_with` is NOT a Mock assertion method -- it
        # silently auto-creates a child mock and never fails, so the original
        # line verified nothing.  `assert_called_once_with` actually checks
        # the call.
        retry_get_url.assert_called_once_with(json_oncotreeurl)
        assert onco_mapping == expected_onco_mapping
Example #2
0
    def _process(self, patientCountsDf, oncotreeLink):
        """Add CENTER and PRIMARY_CODE columns to the patient counts frame.

        PRIMARY_CODE for each row is the ONCOTREE_PRIMARY_NODE looked up in
        the oncotree mapping, keyed by the upper-cased ONCOTREE_CODE.
        """
        oncotree_mapping_dict = \
            process_functions.get_oncotree_code_mappings(oncotreeLink)
        patientCountsDf['CENTER'] = self.center
        primary_codes = []
        for code in patientCountsDf.ONCOTREE_CODE:
            node = oncotree_mapping_dict[code.upper()]
            primary_codes.append(node['ONCOTREE_PRIMARY_NODE'])
        patientCountsDf['PRIMARY_CODE'] = primary_codes
        return patientCountsDf
Example #3
0
    def _validate(self, patCountsDf, oncotreeLink):
        """Validate a patient counts file.

        Checks that ONCOTREE_CODE values are unique and map to the oncotree,
        and that NUM_PATIENTS_PD1_PDL1 holds only integers.

        Returns:
            tuple of (error string, warning string)
        """
        total_error = ""
        warning = ""

        # One-column frame of every known oncotree code, used for isin checks.
        oncotree_mapping_dict = \
            process_functions.get_oncotree_code_mappings(oncotreeLink)
        oncotree_mapping = pd.DataFrame()
        oncotree_mapping['ONCOTREE_CODE'] = oncotree_mapping_dict.keys()

        if process_functions.checkColExist(patCountsDf, "ONCOTREE_CODE"):
            if patCountsDf['ONCOTREE_CODE'].duplicated().sum() > 0:
                total_error += (
                    "Patient Counts: "
                    "Must not have any duplicated ONCOTREE CODES.\n")
            mapped = patCountsDf['ONCOTREE_CODE'].isin(
                oncotree_mapping['ONCOTREE_CODE'])
            if not mapped.all():
                unmapped_oncotrees = patCountsDf['ONCOTREE_CODE'][~mapped]
                total_error += (
                    "Patient Counts: Please double check that all your "
                    "ONCOTREE CODES exist in the mapping. You have {} codes "
                    "that don't map. These are the codes that "
                    "don't map: {}\n".format(len(unmapped_oncotrees), ",".join(
                        set(unmapped_oncotrees))))
        else:
            total_error += (
                "Patient Counts: File must have ONCOTREE_CODE column.\n")

        if process_functions.checkColExist(patCountsDf,
                                           "NUM_PATIENTS_PD1_PDL1"):
            counts_are_ints = all(
                isinstance(count, int)
                for count in patCountsDf['NUM_PATIENTS_PD1_PDL1'])
            if not counts_are_ints:
                total_error += (
                    "Patient Counts: Must not have any null values, "
                    "and must be all integers.\n")
        else:
            total_error += ("Patient Counts: File must have "
                            "NUM_PATIENTS_PD1_PDL1 column.\n")
        return total_error, warning
def main():
    """Parse a clinical file with an ONCOTREE_CODE column and add/update the
    CANCER_TYPE and CANCER_TYPE_DETAILED columns in place with values from an
    oncotree instance.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-o',
                        '--oncotree-url',
                        action='store',
                        dest='oncotree_url',
                        required=True,
                        help='The url of the raw oncotree text file')
    parser.add_argument('-c',
                        '--clinical-file',
                        action='store',
                        dest='clinical_file',
                        required=True,
                        help='Path to the clinical file')
    parser.add_argument('-j',
                        '--json',
                        action='store_true',
                        dest='json',
                        help='If oncotree url is json format')

    args = parser.parse_args()

    oncotree_url = args.oncotree_url
    clinical_filename = args.clinical_file
    # NOTE: args.json (--json) is still accepted for backward compatibility
    # but was never consulted -- the JSON branch below is chosen by whether
    # get_oncotree_codes() returns an empty frame.  The original bound it to
    # a local named `json`, shadowing the builtin module name; that unused,
    # shadowing assignment has been removed.

    if not os.path.exists(clinical_filename):
        print('clinical file cannot be found ' + clinical_filename)
        sys.exit(2)

    oncotree = process.get_oncotree_codes(oncotree_url)
    if oncotree.empty:
        # Tab-delimited fetch came back empty: fall back to the JSON oncotree
        # mapping and the JSON-specific processor.
        oncotree = process.get_oncotree_code_mappings(oncotree_url)
        process_clinical_file_json(oncotree, clinical_filename)
    else:
        oncotree = get_oncotree(oncotree_url)
        # Header columns containing "level" name the oncotree hierarchy;
        # reversed -- presumably so the most specific level is tried first
        # (TODO confirm against process_clinical_file).
        spreadsheet_fields = [
            i for i in oncotree[0].split("\t") if "level" in i
        ]
        spreadsheet_fields.reverse()
        process_clinical_file(oncotree, clinical_filename, spreadsheet_fields)
    report_failed_matches()
Example #5
0
    def _validate(self, clinicaldf, oncotree_link):
        """
        This function validates the merged clinical file (patient + sample)
        to make sure it adheres to the clinical SOP.

        Args:
            clinicaldf: Merged clinical file with patient and sample
                        information
            oncotree_link: Link to oncotree

        Returns:
            Tuple of (error message string, warning message string)
        """
        # Errors/warnings accumulate in StringIO buffers to avoid quadratic
        # string concatenation across the many checks below.
        total_error = StringIO()
        warning = StringIO()

        clinicaldf.columns = [col.upper() for col in clinicaldf.columns]
        # CHECK: for empty rows
        empty_rows = clinicaldf.isnull().values.all(axis=1)
        if empty_rows.any():
            total_error.write("Clinical file(s): No empty rows allowed.\n")
            # Remove completely empty rows to speed up processing
            clinicaldf = clinicaldf[~empty_rows]

        # Blank out remaining NaNs so the "" comparisons below behave
        # uniformly.
        clinicaldf = clinicaldf.fillna("")

        oncotree_mapping_dict = process_functions.get_oncotree_code_mappings(
            oncotree_link
        )
        oncotree_mapping = pd.DataFrame(
            {"ONCOTREE_CODE": list(oncotree_mapping_dict.keys())}
        )

        # Synapse tables holding the allowed coded values for each
        # demographic/sample field (hard-coded Synapse ids).
        sampletype_mapping = process_functions.getGenieMapping(self.syn, "syn7434273")

        ethnicity_mapping = process_functions.getGenieMapping(self.syn, "syn7434242")

        race_mapping = process_functions.getGenieMapping(self.syn, "syn7434236")

        sex_mapping = process_functions.getGenieMapping(self.syn, "syn7434222")

        # CHECK: SAMPLE_ID
        sample_id = "SAMPLE_ID"
        haveSampleColumn = process_functions.checkColExist(clinicaldf, sample_id)

        if not haveSampleColumn:
            total_error.write("Sample Clinical File: Must have SAMPLE_ID column.\n")
        else:
            if sum(clinicaldf[sample_id].duplicated()) > 0:
                total_error.write(
                    "Sample Clinical File: No duplicated SAMPLE_ID "
                    "allowed.\nIf there are no duplicated "
                    "SAMPLE_IDs, and both sample and patient files are "
                    "uploaded, then please check to make sure no duplicated "
                    "PATIENT_IDs exist in the patient clinical file.\n"
                )
        # CHECK: PATIENT_ID
        patientId = "PATIENT_ID"
        # #CHECK: PATIENT_ID IN SAMPLE FILE
        havePatientColumn = process_functions.checkColExist(clinicaldf, patientId)

        if not havePatientColumn:
            total_error.write("Patient Clinical File: Must have PATIENT_ID column.\n")

        # CHECK: within the sample file that the sample ids match
        # the patient ids
        if haveSampleColumn and havePatientColumn:
            # Make sure sample and patient ids are string cols
            clinicaldf[sample_id] = clinicaldf[sample_id].astype(str)
            clinicaldf[patientId] = clinicaldf[patientId].astype(str)
            # Substring containment: each PATIENT_ID must appear inside its
            # row's SAMPLE_ID (e.g. SAGE-1 inside SAGE-1-2).
            if not all(
                [
                    patient in sample
                    for sample, patient in zip(
                        clinicaldf[sample_id], clinicaldf[patientId]
                    )
                ]
            ):

                total_error.write(
                    "Sample Clinical File: PATIENT_ID's much be contained in "
                    "the SAMPLE_ID's (ex. SAGE-1 <-> SAGE-1-2)\n"
                )
            # #CHECK: All samples must have associated patient data
            # (GENIE requires patient data)
            if not all(clinicaldf[patientId] != ""):
                total_error.write(
                    "Patient Clinical File: All samples must have associated "
                    "patient information and no null patient ids allowed. "
                    "These samples are missing patient data: {}\n".format(
                        ", ".join(
                            clinicaldf[sample_id][clinicaldf[patientId] == ""].unique()
                        )
                    )
                )

            # CHECK: All patients should have associated sample data
            if not all(clinicaldf[sample_id] != ""):
                # ## MAKE WARNING FOR NOW###
                warning.write(
                    "Sample Clinical File: All patients must have associated "
                    "sample information. These patients are missing sample "
                    "data: {}\n".format(
                        ", ".join(
                            clinicaldf[patientId][clinicaldf[sample_id] == ""].unique()
                        )
                    )
                )

        # CHECK: AGE_AT_SEQ_REPORT
        age = "AGE_AT_SEQ_REPORT"
        haveColumn = process_functions.checkColExist(clinicaldf, age)
        if haveColumn:
            # Deal with HIPAA converted rows from DFCI
            # First for loop can't int(text) because there
            # are instances that have <3435
            age_seq_report_df = clinicaldf[
                ~clinicaldf[age].isin(["Unknown", ">32485", "<6570"])
            ]

            # age_seq_report_df[age] = \
            #     remove_greaterthan_lessthan_str(age_seq_report_df[age])

            if not all([process_functions.checkInt(i) for i in age_seq_report_df[age]]):
                total_error.write(
                    "Sample Clinical File: Please double check your "
                    "AGE_AT_SEQ_REPORT. It must be an integer, 'Unknown', "
                    "'>32485', '<6570'.\n"
                )
            else:
                age_seq_report_df[age] = age_seq_report_df[age].astype(int)
                # Ages are expected in days; a median under 100 strongly
                # suggests the center reported years.
                median_age = age_seq_report_df[age].median()
                if median_age < 100:
                    total_error.write(
                        "Sample Clinical File: Please double check your "
                        "AGE_AT_SEQ_REPORT. You may be reporting this value "
                        "in YEARS, please report in DAYS.\n"
                    )
        else:
            total_error.write(
                "Sample Clinical File: Must have AGE_AT_SEQ_REPORT column.\n"
            )

        # CHECK: ONCOTREE_CODE
        haveColumn = process_functions.checkColExist(clinicaldf, "ONCOTREE_CODE")
        maleOncoCodes = ["TESTIS", "PROSTATE", "PENIS"]
        womenOncoCodes = ["CERVIX", "VULVA", "UTERUS", "OVARY"]
        if haveColumn:
            # Make oncotree codes uppercase (SpCC/SPCC)
            clinicaldf["ONCOTREE_CODE"] = (
                clinicaldf["ONCOTREE_CODE"].astype(str).str.upper()
            )

            # "UNKNOWN" is allowed and excluded from the mapping check.
            oncotree_codes = clinicaldf["ONCOTREE_CODE"][
                clinicaldf["ONCOTREE_CODE"] != "UNKNOWN"
            ]

            if not all(oncotree_codes.isin(oncotree_mapping["ONCOTREE_CODE"])):
                unmapped_oncotrees = oncotree_codes[
                    ~oncotree_codes.isin(oncotree_mapping["ONCOTREE_CODE"])
                ]
                total_error.write(
                    "Sample Clinical File: Please double check that all your "
                    "ONCOTREE CODES exist in the mapping. You have {} samples "
                    "that don't map. These are the codes that "
                    "don't map: {}\n".format(
                        len(unmapped_oncotrees),
                        ",".join(set(unmapped_oncotrees)),
                    )
                )
            # Should add the SEX mismatch into the dashboard file
            if (
                process_functions.checkColExist(clinicaldf, "SEX")
                and "oncotree_mapping_dict" in locals()
                and havePatientColumn
                and haveSampleColumn
            ):

                wrongCodeSamples = []
                # This is to check if oncotree codes match the sex,
                # returns list of samples that have conflicting codes and sex
                for code, patient, sample in zip(
                    clinicaldf["ONCOTREE_CODE"],
                    clinicaldf["PATIENT_ID"],
                    clinicaldf["SAMPLE_ID"],
                ):

                    if (
                        oncotree_mapping_dict.get(code) is not None
                        and sum(clinicaldf["PATIENT_ID"] == patient) > 0
                    ):

                        primaryCode = oncotree_mapping_dict[code][
                            "ONCOTREE_PRIMARY_NODE"
                        ]

                        sex = clinicaldf["SEX"][
                            clinicaldf["PATIENT_ID"] == patient
                        ].values[0]
                        # SEX is coded numerically; the comparisons below
                        # treat 1.0 as male and 2.0 as female.
                        sex = float("nan") if sex == "" else float(sex)
                        if (
                            oncotree_mapping_dict[code]["ONCOTREE_PRIMARY_NODE"]
                            in maleOncoCodes
                            and sex != 1.0
                        ):

                            wrongCodeSamples.append(sample)
                        if (
                            oncotree_mapping_dict[code]["ONCOTREE_PRIMARY_NODE"]
                            in womenOncoCodes
                            and sex != 2.0
                        ):

                            wrongCodeSamples.append(sample)
                if len(wrongCodeSamples) > 0:
                    warning.write(
                        "Sample Clinical File: Some SAMPLE_IDs have "
                        "conflicting SEX and ONCOTREE_CODES: {}\n".format(
                            ",".join(wrongCodeSamples)
                        )
                    )
        else:
            total_error.write("Sample Clinical File: Must have ONCOTREE_CODE column.\n")

        # CHECK: SAMPLE_TYPE (warnings from this check are intentionally
        # discarded; only the error is recorded)
        warn, error = process_functions.check_col_and_values(
            clinicaldf,
            "SAMPLE_TYPE",
            sampletype_mapping["CODE"].tolist(),
            "Sample Clinical File",
            required=True,
        )
        total_error.write(error)

        # CHECK: SEQ_ASSAY_ID
        haveColumn = process_functions.checkColExist(clinicaldf, "SEQ_ASSAY_ID")
        if haveColumn:
            if not all([i != "" for i in clinicaldf["SEQ_ASSAY_ID"]]):
                total_error.write(
                    "Sample Clinical File: Please double check your "
                    "SEQ_ASSAY_ID columns, there are empty rows.\n"
                )
            # must remove empty seq assay ids first
            # Checking if seq assay ids start with the center name
            empty_seq_idx = clinicaldf.SEQ_ASSAY_ID != ""
            seqassay_ids = clinicaldf.SEQ_ASSAY_ID[empty_seq_idx]
            uniq_seqassay_ids = seqassay_ids.unique()
            invalid_seqassay = []
            for seqassay in uniq_seqassay_ids:
                # SEQ Ids are all capitalized now, so no need to check
                # for differences in case
                if not seqassay.upper().startswith(self.center):
                    invalid_seqassay.append(seqassay)
            if invalid_seqassay:
                total_error.write(
                    "Sample Clinical File: Please make sure your "
                    "SEQ_ASSAY_IDs start with your center "
                    "abbreviation: {}.\n".format(", ".join(invalid_seqassay))
                )
        else:
            total_error.write("Sample Clinical File: Must have SEQ_ASSAY_ID column.\n")

        # CHECK: SEQ_DATE must be quarter-start month abbreviations or
        # 'Release'
        haveColumn = process_functions.checkColExist(clinicaldf, "SEQ_DATE")
        seq_date_error = (
            "Sample Clinical File: SEQ_DATE must be one of five values- "
            "For Jan-March: use Jan-YEAR. "
            "For Apr-June: use Apr-YEAR. "
            "For July-Sep: use Jul-YEAR. "
            "For Oct-Dec: use Oct-YEAR. (ie. Apr-2017) "
            "For values that don't have SEQ_DATES that "
            "you want released use 'release'.\n"
        )

        if haveColumn:
            # Normalize case, e.g. 'apr-2017'/'APR-2017' -> 'Apr-2017'
            clinicaldf["SEQ_DATE"] = [
                i.title() for i in clinicaldf["SEQ_DATE"].astype(str)
            ]

            seqdate = clinicaldf["SEQ_DATE"][clinicaldf["SEQ_DATE"] != "Release"]
            if sum(clinicaldf["SEQ_DATE"] == "") > 0:
                total_error.write(
                    "Sample Clinical File: Samples without SEQ_DATEs will "
                    "NOT be released.\n"
                )
            try:
                if not seqdate.empty:
                    # strptime raises ValueError for anything not 'Mon-YYYY'
                    seqdate.apply(
                        lambda date: datetime.datetime.strptime(date, "%b-%Y")
                    )
                    if not seqdate.str.startswith(("Jan", "Apr", "Jul", "Oct")).all():
                        total_error.write(seq_date_error)
            except ValueError:
                total_error.write(seq_date_error)
        else:
            total_error.write("Sample Clinical File: Must have SEQ_DATE column.\n")

        # CHECK: BIRTH_YEAR
        error = _check_year(
            clinicaldf=clinicaldf,
            year_col="BIRTH_YEAR",
            filename="Patient Clinical File",
            allowed_string_values=["Unknown", ">89", "<18"],
        )
        total_error.write(error)

        # CHECK: YEAR DEATH
        error = _check_year(
            clinicaldf=clinicaldf,
            year_col="YEAR_DEATH",
            filename="Patient Clinical File",
            allowed_string_values=[
                "Unknown",
                "Not Collected",
                "Not Applicable",
                "Not Released",
                ">89",
                "<18",
            ],
        )
        total_error.write(error)

        # CHECK: YEAR CONTACT
        error = _check_year(
            clinicaldf=clinicaldf,
            year_col="YEAR_CONTACT",
            filename="Patient Clinical File",
            allowed_string_values=[
                "Unknown",
                "Not Collected",
                "Not Released",
                ">89",
                "<18",
            ],
        )
        total_error.write(error)

        # CHECK: INT CONTACT
        haveColumn = process_functions.checkColExist(clinicaldf, "INT_CONTACT")
        if haveColumn:
            if not all(
                [
                    process_functions.checkInt(i)
                    for i in clinicaldf.INT_CONTACT
                    if i
                    not in [
                        ">32485",
                        "<6570",
                        "Unknown",
                        "Not Collected",
                        "Not Released",
                    ]
                ]
            ):

                total_error.write(
                    "Patient Clinical File: Please double check your "
                    "INT_CONTACT column, it must be an integer, '>32485', "
                    "'<6570', 'Unknown', 'Not Released' or 'Not Collected'.\n"
                )
        else:
            total_error.write("Patient Clinical File: Must have INT_CONTACT column.\n")

        # INT DOD
        haveColumn = process_functions.checkColExist(clinicaldf, "INT_DOD")
        if haveColumn:
            if not all(
                [
                    process_functions.checkInt(i)
                    for i in clinicaldf.INT_DOD
                    if i
                    not in [
                        ">32485",
                        "<6570",
                        "Unknown",
                        "Not Collected",
                        "Not Applicable",
                        "Not Released",
                    ]
                ]
            ):

                total_error.write(
                    "Patient Clinical File: Please double check your INT_DOD "
                    "column, it must be an integer, '>32485', '<6570', "
                    "'Unknown', 'Not Collected', 'Not Released' or "
                    "'Not Applicable'.\n"
                )
        else:
            total_error.write("Patient Clinical File: Must have INT_DOD column.\n")

        haveColumn = process_functions.checkColExist(clinicaldf, "DEAD")
        if haveColumn:
            # Need to have check_bool function
            if not all(
                [
                    str(i).upper() in ["TRUE", "FALSE"]
                    for i in clinicaldf.DEAD
                    if i not in ["Unknown", "Not Collected", "Not Released"]
                ]
            ):
                total_error.write(
                    "Patient Clinical File: Please double check your "
                    "DEAD column, it must be True, False, 'Unknown', "
                    "'Not Released' or 'Not Collected'.\n"
                )
        else:
            total_error.write("Patient Clinical File: Must have DEAD column.\n")
        # CHECK: contact vital status value consistency
        contact_error = _check_int_year_consistency(
            clinicaldf=clinicaldf,
            cols=["YEAR_CONTACT", "INT_CONTACT"],
            string_vals=["Not Collected", "Unknown", "Not Released"],
        )
        total_error.write(contact_error)

        # CHECK: death vital status value consistency
        death_error = _check_int_year_consistency(
            clinicaldf=clinicaldf,
            cols=["YEAR_DEATH", "INT_DOD"],
            string_vals=[
                "Not Collected",
                "Unknown",
                "Not Applicable",
                "Not Released",
            ],
        )
        total_error.write(death_error)
        death_error = _check_int_dead_consistency(clinicaldf=clinicaldf)
        total_error.write(death_error)

        # CHECK: SAMPLE_CLASS is optional attribute
        have_column = process_functions.checkColExist(clinicaldf, "SAMPLE_CLASS")
        if have_column:
            sample_class_vals = pd.Series(clinicaldf["SAMPLE_CLASS"].unique().tolist())
            if not sample_class_vals.isin(["Tumor", "cfDNA"]).all():
                total_error.write(
                    "Sample Clinical File: SAMPLE_CLASS column must "
                    "be 'Tumor', or 'cfDNA'\n"
                )

        # CHECK: PRIMARY_RACE
        warn, error = process_functions.check_col_and_values(
            clinicaldf,
            "PRIMARY_RACE",
            race_mapping["CODE"].tolist(),
            "Patient Clinical File",
        )
        warning.write(warn)
        total_error.write(error)

        # CHECK: SECONDARY_RACE
        warn, error = process_functions.check_col_and_values(
            clinicaldf,
            "SECONDARY_RACE",
            race_mapping["CODE"].tolist(),
            "Patient Clinical File",
        )
        warning.write(warn)
        total_error.write(error)

        # CHECK: TERTIARY_RACE
        warn, error = process_functions.check_col_and_values(
            clinicaldf,
            "TERTIARY_RACE",
            race_mapping["CODE"].tolist(),
            "Patient Clinical File",
        )
        warning.write(warn)
        total_error.write(error)

        # CHECK: SEX
        warn, error = process_functions.check_col_and_values(
            clinicaldf,
            "SEX",
            sex_mapping["CODE"].tolist(),
            "Patient Clinical File",
            required=True,
        )
        warning.write(warn)
        total_error.write(error)

        # CHECK: ETHNICITY
        warn, error = process_functions.check_col_and_values(
            clinicaldf,
            "ETHNICITY",
            ethnicity_mapping["CODE"].tolist(),
            "Patient Clinical File",
        )
        warning.write(warn)
        total_error.write(error)

        return total_error.getvalue(), warning.getvalue()
Example #6
0
    def process_steps(
        self,
        clinicalDf,
        databaseToSynIdMappingDf,
        newPath,
        parentId,
        oncotree_link,
        clinicalTemplate,
        sample,
        patient,
        patientCols,
        sampleCols,
    ):
        """Process clinical file, redact PHI values, upload to clinical
        database.

        Args:
            clinicalDf: Raw clinical dataframe to process.
            databaseToSynIdMappingDf: Frame mapping 'Database' names to
                Synapse 'Id' values; must contain 'patient' and 'sample' rows.
            newPath: Path the processed file is written to (tab-delimited).
            parentId: Synapse parent used when uploading missing data.
            oncotree_link: Link used to fetch the oncotree code mapping.
            clinicalTemplate: Template passed through to self._process.
            sample: If truthy, split out and upload the sample table.
            patient: If truthy, split out and upload the patient table.
            patientCols: Column names belonging to the patient table.
            sampleCols: Column names belonging to the sample table.

        Returns:
            newPath
        """
        patientdb_idx = databaseToSynIdMappingDf["Database"] == "patient"
        # NOTE(review): `[0]` is a label-based lookup on the boolean-filtered
        # Series -- it assumes the matching row carries index label 0 (i.e.
        # the mapping frame has a default RangeIndex); confirm upstream.
        patient_synid = databaseToSynIdMappingDf.Id[patientdb_idx][0]
        sampledb_idx = databaseToSynIdMappingDf["Database"] == "sample"
        sample_synid = databaseToSynIdMappingDf.Id[sampledb_idx][0]

        newClinicalDf = self._process(clinicalDf, clinicalTemplate)
        newClinicalDf = redact_phi(newClinicalDf)

        if patient:
            cols = newClinicalDf.columns[newClinicalDf.columns.isin(patientCols)]
            patientClinical = newClinicalDf[cols].drop_duplicates("PATIENT_ID")
            self.uploadMissingData(
                patientClinical, "PATIENT_ID", patient_synid, parentId
            )

            process_functions.updateData(
                self.syn,
                patient_synid,
                patientClinical,
                self.center,
                col=cols.tolist(),
                toDelete=True,
            )
        if sample:
            cols = newClinicalDf.columns[newClinicalDf.columns.isin(sampleCols)]
            if sum(newClinicalDf["SAMPLE_ID"].duplicated()) > 0:
                logger.error(
                    "There are duplicated samples, " "and the duplicates are removed"
                )
            sampleClinical = newClinicalDf[cols].drop_duplicates("SAMPLE_ID")
            # Exclude all clinical samples with wrong oncotree codes
            oncotree_mapping = pd.DataFrame()
            oncotree_mapping_dict = process_functions.get_oncotree_code_mappings(
                oncotree_link
            )
            # Add in unknown key for oncotree code
            oncotree_mapping_dict["UNKNOWN"] = {}
            oncotree_mapping["ONCOTREE_CODE"] = list(oncotree_mapping_dict.keys())
            # Make oncotree codes uppercase (SpCC/SPCC)
            # NOTE(review): sampleClinical is a slice of newClinicalDf; this
            # chained assignment may trigger SettingWithCopyWarning.
            sampleClinical["ONCOTREE_CODE"] = (
                sampleClinical["ONCOTREE_CODE"].astype(str).str.upper()
            )
            sampleClinical = sampleClinical[
                sampleClinical["ONCOTREE_CODE"].isin(oncotree_mapping["ONCOTREE_CODE"])
            ]
            self.uploadMissingData(sampleClinical, "SAMPLE_ID", sample_synid, parentId)
            # ,retractedSampleSynId)
            process_functions.updateData(
                self.syn,
                sample_synid,
                sampleClinical,
                self.center,
                col=cols.tolist(),
                toDelete=True,
            )

        newClinicalDf.to_csv(newPath, sep="\t", index=False)
        return newPath
Example #7
0
    def run(self):
        """
        This function runs the redcap export to export all files
        """
        if not os.path.exists(self._SPONSORED_PROJECT):
            os.mkdir(self._SPONSORED_PROJECT)
        else:
            filelists = os.listdir(self._SPONSORED_PROJECT)
            for file in filelists:
                if file != "case_lists":
                    os.remove(os.path.join(self._SPONSORED_PROJECT, file))
        # Create full mapping table to get the values of the data model
        mapping = self.syn.tableQuery(
            "select genie_field_name,instrument from {} where "
            "{} is true and phi is false".format(
                self._DATA_ELEMENT_SYN_ID, self._SPONSORED_PROJECT.lower()))
        mappingDf = mapping.asDataFrame()
        newMappingDf = pd.DataFrame()
        for field, instrument in zip(mappingDf.genie_field_name,
                                     mappingDf.instrument):
            # Do not want to append the # values
            if "#" in field:
                # find fields with # and replace with however many times
                # it should loop through
                newfields = [
                    field.replace("#", str(count))
                    for count in list(range(1, self._NUM_COUNTS + 1))
                ]
                newDataFrame = pd.DataFrame({
                    "genie_field_name":
                    newfields,
                    "instrument": [instrument] * len(newfields)
                })
            else:
                newDataFrame = pd.DataFrame(
                    {
                        "genie_field_name": field,
                        "instrument": instrument
                    },
                    index=[0])
            newMappingDf = newMappingDf.append(newDataFrame, sort=False)

        # If there are ever missing fields, they must be added in
        # or else the script will fail
        # missingFields= ['her_status_sample','sample_seq_yn']
        # missingFieldType = ['sample_information']*2
        # newMappingDf = newMappingDf.append(pd.DataFrame({
        #     "genie_field_name": missingFields,
        #     "instrument": missingFieldType}))

        # Extract patient/sample/treatment columns
        patientCols = extractColumns(newMappingDf, [
            "patient_information", "treatment_information_general",
            "diagnosis_information"
        ], [
            'errors_patient_info_yn', 'patient_info_errors',
            'errors_dx_info_yn', 'dx_info_errors', 'so_yn'
        ])
        sampleCols = extractColumns(newMappingDf, ["sample_information"], [
            "test_sample", "fgfr4_variant", "errors_sample_info_yn",
            "sample_info_errors"
        ])
        treatmentCols = extractColumns(newMappingDf,
                                       ["treatment_information_detailed"], [])

        unlabelledEnt = self.syn.get(self._UNLABELLED_SYN_ID)
        labelledEnt = self.syn.get(self._LABELLED_SYN_ID)
        unlabeledDf = pd.read_csv(unlabelledEnt.path)
        labeledDf = pd.read_csv(labelledEnt.path)
        # Add on CENTER column for all three file formats
        patientCols.append("redcap_data_access_group")
        sampleCols.append("redcap_data_access_group")
        treatmentCols.append("redcap_data_access_group")

        labeledDf.columns = unlabeledDf.columns
        labeledDf['redcap_data_access_group'][
            labeledDf['redcap_data_access_group'] == "hop"] = "JHU"
        labeledDf['redcap_data_access_group'] = \
            labeledDf['redcap_data_access_group'].apply(lambda x: x.upper())

        patientDf = labeledDf[patientCols]
        patientRows = labeledDf.redcap_repeat_instrument.isnull()
        patientDf = patientDf[patientRows]

        sampleDf = labeledDf[sampleCols]
        sampleRows = labeledDf.redcap_repeat_instrument == "Sample Information"
        sampleDf = sampleDf[sampleRows]
        # Red cap header to cbio header Table mapping
        redCapToCbioMapping = self.syn.tableQuery(
            "SELECT * FROM %s" % self._REDCAP_TO_CBIOMAPPING_SYNID)
        redCapToCbioMappingDf = redCapToCbioMapping.asDataFrame()

        # Get all the samples/patients that should be uploaded to SP projects
        # Hard coded clinical database
        genie_clinicalDb = self.syn.tableQuery(
            'select SAMPLE_ID, PATIENT_ID, ONCOTREE_CODE, SEQ_ASSAY_ID '
            'from syn7517674')
        genie_clinicalDf = genie_clinicalDb.asDataFrame()
        # Hard coded clinicalSP database
        # nonGenie_clinicalDb = self.syn.tableQuery(
        #     'SELECT * FROM syn11492579')
        # nonGenie_clinicalDf = nonGenie_clinicalDb.asDataFrame()
        # genie_clinicalDf = genie_clinicalDf.append(nonGenie_clinicalDf)

        # Only patients and samples that exist in the
        # sponsored project uploads are going to be pulled into the SP project
        finalPatientDf = self.configureClinicalDf(patientDf,
                                                  redCapToCbioMappingDf)
        patient_date_col = [
            col for col in finalPatientDf.columns if col.endswith("INT")
        ]
        patient_date_col.append("OS_MONTHS")
        final_patientdf_datesdays = finalPatientDf.copy()
        finalPatientDf[patient_date_col] = \
            finalPatientDf[patient_date_col].applymap(change_days_to_months)
        subsetPatientDf = finalPatientDf[finalPatientDf['PATIENT_ID'].isin(
            genie_clinicalDf['PATIENT_ID'])]
        del subsetPatientDf['SP']
        # Remove CENTER and ONCOTREE_CODE from patient because you
        # cannot have these columns in both sample and patient Df,
        # it will fail validation for cbioportal
        del subsetPatientDf['CENTER']
        del subsetPatientDf['ONCOTREE_CODE']

        patientPath = self.writeClinicalFile(subsetPatientDf,
                                             redCapToCbioMappingDf, "patient")

        finalSampleDf = self.configureClinicalDf(sampleDf,
                                                 redCapToCbioMappingDf)

        sample_date_cols = ['SAMPLE_DATE_INT', 'AGE_AT_SEQ_REPORT']
        final_sampledf_datesdays = finalSampleDf.copy()
        finalSampleDf[sample_date_cols] = \
            finalSampleDf[sample_date_cols].applymap(change_days_to_months)
        # Fill in ONCOTREE_CODE
        finalSampleDf['ONCOTREE_CODE'] = [
            genie_clinicalDf['ONCOTREE_CODE'][genie_clinicalDf['SAMPLE_ID'] ==
                                              sample].values[0] if
            sum(genie_clinicalDf['SAMPLE_ID'] == sample) > 0 else float('nan')
            for sample in finalSampleDf['SAMPLE_ID']
        ]
        # Fill in SEQ_ASSAY_ID
        finalSampleDf['SEQ_ASSAY_ID'] = [
            genie_clinicalDf['SEQ_ASSAY_ID'][genie_clinicalDf['SAMPLE_ID'] ==
                                             sample].values[0] if
            sum(genie_clinicalDf['SAMPLE_ID'] == sample) > 0 else float('nan')
            for sample in finalSampleDf['SAMPLE_ID']
        ]

        subsetSampleDf = finalSampleDf[finalSampleDf['SAMPLE_ID'].isin(
            genie_clinicalDf['SAMPLE_ID'])]
        del subsetSampleDf['SP']

        samplePath = self.writeClinicalFile(subsetSampleDf,
                                            redCapToCbioMappingDf, "sample")

        # Remove oncotree code here, because no longer need it
        mergedClinicalDf = subsetSampleDf.merge(subsetPatientDf,
                                                on="PATIENT_ID",
                                                how="outer")

        if mergedClinicalDf.get("SAMPLE_ID") is not None:
            print("Samples not in GENIE clinical databases (SP and normal)")
            notFoundSamples = mergedClinicalDf[
                'SAMPLE_ID'][~mergedClinicalDf['SAMPLE_ID'].
                             isin(genie_clinicalDf['SAMPLE_ID'])]
            if not notFoundSamples.empty:
                print(notFoundSamples[~notFoundSamples.isnull()])
                notFoundSamples.to_csv("notfoundsamples.csv", header=False)
                if not self.staging:
                    self.syn.store(
                        synapseclient.File(
                            "notfoundsamples.csv",
                            parent=self._SP_REDCAP_EXPORTS_SYNID))

        # Hard coded most up to date oncotree version
        oncotreeLink = self.syn.get("syn13890902").externalURL
        # Use the old oncotree link for now
        oncotreeLink = 'http://oncotree.mskcc.org/api/tumorTypes/tree?version=oncotree_2017_06_21'
        oncotreeDict = \
            process_functions.get_oncotree_code_mappings(oncotreeLink)
        mergedClinicalDf['CANCER_TYPE'] = [
            oncotreeDict[code.upper()].get("CANCER_TYPE", float('nan'))
            for code in mergedClinicalDf['ONCOTREE_CODE']
        ]
        mergedClinicalDf['CANCER_TYPE_DETAILED'] = [
            oncotreeDict[code.upper()].get("CANCER_TYPE_DETAILED",
                                           float('nan'))
            for code in mergedClinicalDf['ONCOTREE_CODE']
        ]
        mergedClinicalDf['ONCOTREE_PRIMARY_NODE'] = [
            oncotreeDict[code.upper()].get("ONCOTREE_PRIMARY_NODE",
                                           float('nan'))
            for code in mergedClinicalDf['ONCOTREE_CODE']
        ]
        mergedClinicalDf['ONCOTREE_SECONDARY_NODE'] = [
            oncotreeDict[code.upper()].get("ONCOTREE_SECONDARY_NODE",
                                           float('nan'))
            for code in mergedClinicalDf['ONCOTREE_CODE']
        ]

        mergedClinicalDf.to_csv("%s/data_clinical.txt" %
                                self._SPONSORED_PROJECT,
                                index=False,
                                sep="\t")

        if not self.staging:
            process_functions.updateData(self.syn,
                                         "syn17010637",
                                         finalPatientDf,
                                         self._SPONSORED_PROJECT,
                                         filterByColumn="SP",
                                         toDelete=True)

            patientFileEnt = File(patientPath, parent=self._SP_SYN_ID)
            patientEnt = self.syn.store(patientFileEnt,
                                        used=labelledEnt.id,
                                        executed=self._GITHUB_REPO)

            process_functions.updateData(self.syn,
                                         "syn17010638",
                                         finalSampleDf,
                                         self._SPONSORED_PROJECT,
                                         filterByColumn="SP",
                                         toDelete=True)

            sampleFileEnt = File(samplePath, parent=self._SP_SYN_ID)
            sampleEnt = self.syn.store(sampleFileEnt,
                                       used=labelledEnt.id,
                                       executed=self._GITHUB_REPO)

        treatmentDf = labeledDf[treatmentCols]
        treatmentRows = labeledDf.redcap_repeat_instrument == \
            "Treatment Information Detailed"
        treatmentDf = treatmentDf[treatmentRows]
        finalTimelineDf = self.makeTimeLineDf(treatmentDf,
                                              final_patientdf_datesdays)
        finalTimelineDf.PATIENT_ID = finalTimelineDf.apply(
            lambda x: process_functions.checkGenieId(x['PATIENT_ID'], x[
                'CENTER']),
            axis=1)
        if not self.staging:
            process_functions.updateData(self.syn,
                                         "syn17011214",
                                         finalTimelineDf,
                                         self._SPONSORED_PROJECT,
                                         filterByColumn="SP",
                                         toDelete=True)

        # METASTATIC DIAGNOSIS (append to timeline)
        metaDiagnosisDf = self.createMetaDiagnosisDf(finalTimelineDf)
        # Maintain ordering of timeline
        ordering = finalTimelineDf.columns.tolist()
        # Two extra timeline columns from specimen file
        ordering.extend(["SAMPLE_ID", "SAMPLE_NOTES"])
        finalTimelineDf = finalTimelineDf.append(metaDiagnosisDf, sort=False)

        # Create specimen file to append to timeline file too
        specimenDf = self.createSpecimenDf(final_sampledf_datesdays,
                                           final_patientdf_datesdays)
        specimenDf = specimenDf[specimenDf['SAMPLE_ID'].isin(
            genie_clinicalDf['SAMPLE_ID'])]
        # dates = ['START_DATE', 'STOP_DATE', 'LINE_START']
        finalTimelineDf = finalTimelineDf.append(specimenDf, sort=False)
        # No need to convert timeline dates to months
        # finalTimelineDf[dates] = \
        #     finalTimelineDf[dates].applymap(change_days_to_months)
        finalTimelineDf = finalTimelineDf[ordering]
        finalTimelineDf = finalTimelineDf[finalTimelineDf['PATIENT_ID'].isin(
            genie_clinicalDf['PATIENT_ID'])]
        finalTimelineDf['AGENT'][finalTimelineDf['AGENT'].isnull()] = "Unknown"
        timelineText = finalTimelineDf.to_csv(index=False, sep="\t")
        timelineText = replace0(timelineText)
        timeline_path = "%s/data_timeline.txt" % self._SPONSORED_PROJECT
        with open(timeline_path, 'w') as timelineFile:
            timelineFile.write(timelineText)
        if not self.staging:
            fileEnt = File(timeline_path, parent=self._SP_SYN_ID)
            self.syn.store(fileEnt,
                           used=labelledEnt.id,
                           executed=self._GITHUB_REPO)

        # Get database to synapse id mapping table so no need to
        # hardcode synapse ids
        databaseToSynIdMapping = \
            self.syn.tableQuery('SELECT * FROM syn10967259')
        databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()

        centerMafFileViewSynId = databaseToSynIdMappingDf['Id'][
            databaseToSynIdMappingDf['Database'] == "centerMafView"][0]
        centerMafSynIds = self.syn.tableQuery(
            "select id from {} where name like '%mutation%'".format(
                centerMafFileViewSynId))
        centerMafSynIdsDf = centerMafSynIds.asDataFrame()
        # This value must be set outside here because the first maf file might
        # Not be part of the centers
        index = 0
        mafpath = "{}/data_mutations_extended.txt".format(
            self._SPONSORED_PROJECT)
        for mafSynId in centerMafSynIdsDf.id:
            mafEnt = self.syn.get(mafSynId, downloadFile=False)
            mafcenter = mafEnt.name.split("_")[3]
            if mafcenter in finalSampleDf['CENTER'].tolist():
                mafEnt = self.syn.get(mafSynId)
                print("running", mafEnt.name)
                with open(mafEnt.path, "r") as mafFile:
                    header = mafFile.readline()
                    headers = header.replace("\n", "").split("\t")
                    if index == 0:
                        with open(mafpath, 'w') as f:
                            f.write(header)
                    index += 1
                    for row in mafFile:
                        rowArray = row.replace("\n", "").split("\t")
                        center = rowArray[headers.index('Center')]
                        newMergedRow = configureMafRow(
                            rowArray, headers, finalSampleDf['SAMPLE_ID'])
                        if newMergedRow is not None:
                            with open(mafpath, 'a') as f:
                                f.write(newMergedRow)
        # No longer need to pulling from non genie db
        fileEnt = File(mafpath, parent=self._SP_SYN_ID)
        if not self.staging:
            self.syn.store(fileEnt,
                           used=centerMafSynIdsDf.id.tolist(),
                           executed=self._GITHUB_REPO)

        CNA_PATH = "%s/data_CNA.txt" % self._SPONSORED_PROJECT
        CNA_CENTER_PATH = self._SPONSORED_PROJECT + "/data_CNA_%s.txt"
        centerCNASynIds = self.syn.tableQuery(
            "select id from {} where name like 'data_CNA%'".format(
                centerMafFileViewSynId))
        centerCNASynIdsDf = centerCNASynIds.asDataFrame()
        # Grab all unique symbols and form cnaTemplate
        allSymbols = set()

        for cnaSynId in centerCNASynIdsDf.id:
            cnaEnt = self.syn.get(cnaSynId)
            with open(cnaEnt.path, "r") as cnaFile:
                # Read first line first to get all the samples
                cnaFile.readline()
                # Get all hugo symbols
                allSymbols = allSymbols.union(
                    set(line.split("\t")[0] for line in cnaFile))
        cnaTemplate = pd.DataFrame({"Hugo_Symbol": list(allSymbols)})
        cnaTemplate.sort_values("Hugo_Symbol", inplace=True)
        cnaTemplate.to_csv(CNA_PATH, sep="\t", index=False)

        withMergedHugoSymbol = pd.Series("Hugo_Symbol")
        withMergedHugoSymbol = \
            withMergedHugoSymbol.append(pd.Series(finalSampleDf['SAMPLE_ID']))
        cnaSamples = []

        for cnaSynId in centerCNASynIdsDf.id:
            cnaEnt = self.syn.get(cnaSynId)
            center = cnaEnt.name.replace("data_CNA_", "").replace(".txt", "")
            print(cnaEnt.path)
            # if center in CENTER_MAPPING_DF.center.tolist():
            centerCNA = pd.read_csv(cnaEnt.path, sep="\t")
            merged = cnaTemplate.merge(centerCNA,
                                       on="Hugo_Symbol",
                                       how="outer")
            merged.sort_values("Hugo_Symbol", inplace=True)

            # This is to remove more samples for the final cna file
            merged = merged[merged.columns[merged.columns.isin(
                withMergedHugoSymbol)]]

            cnaText = process_functions.removePandasDfFloat(merged)
            # Must do this replace twice because \t\t\t ->
            # \tNA\t\t -> \tNA\tNA\t
            cnaText = cnaText.replace("\t\t", "\tNA\t").replace(
                "\t\t", "\tNA\t").replace('\t\n', "\tNA\n")

            with open(CNA_CENTER_PATH % center, "w") as cnaFile:
                cnaFile.write(cnaText)
            cnaSamples.extend(merged.columns[1:].tolist())

            # Join CNA file
            joinCommand = ["join", CNA_PATH, CNA_CENTER_PATH % center]
            output = subprocess.check_output(joinCommand)
            with open(CNA_PATH, "w") as cnaFile:
                cnaFile.write(output.decode("utf-8").replace(" ", "\t"))

        fileEnt = File(CNA_PATH, parent=self._SP_SYN_ID)
        if not self.staging:
            self.syn.store(fileEnt,
                           used=centerCNASynIdsDf.id.tolist(),
                           executed=self._GITHUB_REPO)

        self.createGeneMatrixDf(finalSampleDf, cnaSamples, labelledEnt)

        fusion = self.syn.tableQuery("SELECT * FROM syn7893268 where "
                                     "TUMOR_SAMPLE_BARCODE in ('{}')".format(
                                         "','".join(
                                             finalSampleDf['SAMPLE_ID'])))
        fusions_df = fusion.asDataFrame()

        if not fusions_df.empty:
            fusions_df = fusions_df.rename(
                columns={
                    'HUGO_SYMBOL': 'Hugo_Symbol',
                    'ENTREZ_GENE_ID': 'Entrez_Gene_Id',
                    'CENTER': 'Center',
                    'TUMOR_SAMPLE_BARCODE': 'Tumor_Sample_Barcode',
                    'FUSION': 'Fusion',
                    'DNA_SUPPORT': 'DNA_support',
                    'RNA_SUPPORT': 'RNA_support',
                    'METHOD': 'Method',
                    'FRAME': 'Frame',
                    'COMMENTS': 'Comments'
                })
            fusions_df.Entrez_Gene_Id[fusions_df.Entrez_Gene_Id ==
                                      0] = pd.np.nan
            fusionText = fusions_df.to_csv(sep="\t", index=False)
            fusionText = replace0(fusionText)
            fusion_path = "%s/data_fusions.txt" % self._SPONSORED_PROJECT
            with open(fusion_path, "w") as fusionFile:
                fusionFile.write(fusionText)
            fileEnt = File(fusion_path, parent=self._SP_SYN_ID)
            if not self.staging:
                self.syn.store(fileEnt,
                               used=fusion.tableId,
                               executed=self._GITHUB_REPO)

        seg = self.syn.tableQuery(
            "SELECT ID, CHROM, LOCSTART, LOCEND, NUMMARK, SEGMEAN "
            "FROM syn7893341 where ID in ('{}')".format("','".join(
                finalSampleDf['SAMPLE_ID'])))
        seg_df = seg.asDataFrame()
        if not seg_df.empty:
            seg_df.rename(columns={
                "CHROM": "chrom",
                "LOCSTART": "loc.start",
                "LOCEND": "loc.end",
                "NUMMARK": "num.mark",
                "SEGMEAN": "seg.mean"
            },
                          inplace=True)
            segText = replace0(seg_df.to_csv(sep="\t", index=False))
            segpath = "{}/genie_{}_data_cna_hg19.seg".format(
                self._SPONSORED_PROJECT, self._SPONSORED_PROJECT.lower())
            with open(segpath, 'w') as segFile:
                segFile.write(segText)
            fileEnt = File(segpath, parent=self._SP_SYN_ID)
            if not self.staging:
                self.syn.store(fileEnt,
                               used=seg.tableId,
                               executed=self._GITHUB_REPO)

        # Create case lists
        if not os.path.exists(self._CASE_LIST_PATH):
            os.mkdir(self._CASE_LIST_PATH)
        else:
            caselists = os.listdir(self._CASE_LIST_PATH)
            for caselist in caselists:
                os.remove(os.path.join(self._CASE_LIST_PATH, caselist))

        # Write out cases sequenced so people can tell
        # which samples were sequenced
        create_case_lists.main(
            "%s/data_clinical.txt" % self._SPONSORED_PROJECT,
            "%s/data_gene_matrix.txt" % self._SPONSORED_PROJECT,
            self._CASE_LIST_PATH,
            "genie_{}".format(self._SPONSORED_PROJECT.lower()))

        caseListFiles = os.listdir(self._CASE_LIST_PATH)
        for casePath in caseListFiles:
            casePath = os.path.join(self._CASE_LIST_PATH, casePath)
            fileEnt = File(casePath, parent=self._CASE_LIST_SYN_ID)
            if not self.staging:
                self.syn.store(fileEnt,
                               used=[patientEnt.id, sampleEnt.id],
                               executed=self._GITHUB_REPO)

        seq_assays = "','".join(set(finalSampleDf['SEQ_ASSAY_ID']))
        bed = self.syn.tableQuery(
            "SELECT Hugo_Symbol, SEQ_ASSAY_ID FROM syn8457748 where "
            "SEQ_ASSAY_ID in ('{}') and "
            "Feature_Type = 'exon' and "
            "Hugo_Symbol is not null and "
            "includeInPanel is true".format(seq_assays))
        beddf = bed.asDataFrame()
        bed = self.syn.tableQuery(
            "SELECT Hugo_Symbol, SEQ_ASSAY_ID FROM syn11516678 where "
            "SEQ_ASSAY_ID in ('{}') and "
            "Feature_Type = 'exon' and "
            "Hugo_Symbol is not null and "
            "includeInPanel is true".format(seq_assays))
        non_genie_beddf = bed.asDataFrame()
        beddf = beddf.append(non_genie_beddf)
        seq_assay_groups = beddf.groupby('SEQ_ASSAY_ID')
        for seq_assay_id, seqdf in seq_assay_groups:
            unique_genes = seqdf.Hugo_Symbol.unique()
            gene_panel_text = ("stable_id: {seq_assay_id}\n"
                               "description: {seq_assay_id}, "
                               "Number of Genes - {num_genes}\n"
                               "gene_list:\t{genelist}".format(
                                   seq_assay_id=seq_assay_id,
                                   num_genes=len(unique_genes),
                                   genelist="\t".join(unique_genes)))
            gene_panel_name = "data_gene_panel_" + seq_assay_id + ".txt"
            gene_panel_path = os.path.join(self._SPONSORED_PROJECT,
                                           gene_panel_name)
            with open(gene_panel_path, "w+") as f:
                f.write(gene_panel_text)
            fileEnt = File(gene_panel_path, parent=self._SP_SYN_ID)
            if not self.staging:
                self.syn.store(fileEnt, executed=self._GITHUB_REPO)

        # Make sure to re download all the metadata files again
        self.reviseMetadataFiles()

        cmd = [
            'python',
            os.path.join(self.cbioPath,
                         "core/src/main/scripts/importer/validateData.py"),
            "-s", self._SPONSORED_PROJECT, "-n"
        ]
        subprocess.call(cmd)
Exemple #8
0
    def _validate(self, clinicalDF, oncotreeLink):
        """
        This function validates the clinical file to make sure it adhere
        to the clinical SOP.

        Args:
            clinicalDF: Merged clinical file with patient and sample
                        information
            oncotreeLink: Link to oncotree

        Returns:
            Error message
        """
        total_error = ""
        warning = ""

        clinicalDF.columns = [col.upper() for col in clinicalDF.columns]
        clinicalDF = clinicalDF.fillna("")

        # oncotree_mapping = process_functions.get_oncotree_codes(oncotreeLink)
        # if oncotree_mapping.empty:
        oncotree_mapping = pd.DataFrame()
        oncotree_mapping_dict = \
            process_functions.get_oncotree_code_mappings(oncotreeLink)
        oncotree_mapping['ONCOTREE_CODE'] = oncotree_mapping_dict.keys()

        sampleType_mapping = \
            process_functions.getGenieMapping(self.syn, "syn7434273")

        ethnicity_mapping = \
            process_functions.getGenieMapping(self.syn, "syn7434242")

        race_mapping = \
            process_functions.getGenieMapping(self.syn, "syn7434236")

        sex_mapping = \
            process_functions.getGenieMapping(self.syn, "syn7434222")

        # CHECK: SAMPLE_ID
        sampleId = 'SAMPLE_ID'
        haveSampleColumn = \
            process_functions.checkColExist(clinicalDF, sampleId)

        if not haveSampleColumn:
            total_error += \
                "Sample Clinical File: Must have SAMPLE_ID column.\n"
        else:
            if sum(clinicalDF[sampleId].duplicated()) > 0:
                total_error += (
                    "Sample Clinical File: No duplicated SAMPLE_ID "
                    "allowed.\nIf there are no duplicated "
                    "SAMPLE_IDs, and both sample and patient files are "
                    "uploaded, then please check to make sure no duplicated "
                    "PATIENT_IDs exist in the patient clinical file.\n")

        # CHECK: PATIENT_ID
        patientId = "PATIENT_ID"
        # #CHECK: PATIENT_ID IN SAMPLE FILE
        havePatientColumn = \
            process_functions.checkColExist(clinicalDF, patientId)

        if not havePatientColumn:
            total_error += \
                "Patient Clinical File: Must have PATIENT_ID column.\n"

        # CHECK: within the sample file that the sample ids match
        # the patient ids
        if haveSampleColumn and havePatientColumn:
            # Make sure sample and patient ids are string cols
            clinicalDF[sampleId] = clinicalDF[sampleId].astype(str)
            clinicalDF[patientId] = clinicalDF[patientId].astype(str)
            if not all([
                    patient in sample for sample, patient in zip(
                        clinicalDF[sampleId], clinicalDF[patientId])
            ]):

                total_error += (
                    "Sample Clinical File: PATIENT_ID's much be contained in "
                    "the SAMPLE_ID's (ex. SAGE-1 <-> SAGE-1-2)\n")
            # #CHECK: All samples must have associated patient data
            # (GENIE requires patient data)
            if not all(clinicalDF[patientId] != ""):
                total_error += (
                    "Patient Clinical File: All samples must have associated "
                    "patient information and no null patient ids allowed. "
                    "These samples are missing patient data: {}\n".format(
                        ", ".join(
                            clinicalDF[sampleId][clinicalDF[patientId] == ""]))
                )
            # CHECK: All patients should have associated sample data
            if not all(clinicalDF[sampleId] != ""):
                # ## MAKE WARNING FOR NOW###
                warning += (
                    "Sample Clinical File: All patients must have associated "
                    "sample information. These patients are missing sample "
                    "data: {}\n".format(", ".join(
                        clinicalDF[patientId][clinicalDF[sampleId] == ""])))

        # CHECK: AGE_AT_SEQ_REPORT
        age = "AGE_AT_SEQ_REPORT"
        haveColumn = process_functions.checkColExist(clinicalDF, age)
        if haveColumn:
            # Deal with HIPAA converted rows from DFCI
            # First for loop can't int(text) because there
            # are instances that have <3435
            age_seq_report_df = \
                clinicalDF[~clinicalDF[age].isin(['Unknown'])]

            age_seq_report_df[age] = \
                remove_greaterthan_lessthan_str(age_seq_report_df[age])

            if not all([
                    process_functions.checkInt(i)
                    for i in age_seq_report_df[age]
            ]):
                total_error += (
                    "Sample Clinical File: Please double check your "
                    "AGE_AT_SEQ_REPORT. It must be an integer or 'Unknown'.\n")
            else:
                age_seq_report_df[age] = age_seq_report_df[age].astype(int)
                median_age = pd.np.median(age_seq_report_df[age])
                if median_age < 100:
                    total_error += (
                        "Sample Clinical File: Please double check your "
                        "AGE_AT_SEQ_REPORT. You may be reporting this value "
                        "in YEARS, please report in DAYS.\n")
        else:
            total_error += \
                "Sample Clinical File: Must have AGE_AT_SEQ_REPORT column.\n"

        # CHECK: ONCOTREE_CODE
        haveColumn = \
            process_functions.checkColExist(clinicalDF, "ONCOTREE_CODE")
        maleOncoCodes = ["TESTIS", "PROSTATE", "PENIS"]
        womenOncoCodes = ["CERVIX", "VULVA", "UTERUS", "OVARY"]
        if haveColumn:
            # Make oncotree codes uppercase (SpCC/SPCC)
            clinicalDF['ONCOTREE_CODE'] = \
                clinicalDF['ONCOTREE_CODE'].astype(str).str.upper()

            oncotree_codes = clinicalDF['ONCOTREE_CODE'][
                clinicalDF['ONCOTREE_CODE'] != "UNKNOWN"]

            if not all(oncotree_codes.isin(oncotree_mapping['ONCOTREE_CODE'])):
                unmapped_oncotrees = oncotree_codes[
                    ~oncotree_codes.isin(oncotree_mapping['ONCOTREE_CODE'])]
                total_error += (
                    "Sample Clinical File: Please double check that all your "
                    "ONCOTREE CODES exist in the mapping. You have {} samples "
                    "that don't map. These are the codes that "
                    "don't map: {}\n".format(len(unmapped_oncotrees), ",".join(
                        set(unmapped_oncotrees))))

            if process_functions.checkColExist(clinicalDF, "SEX") and \
               'oncotree_mapping_dict' in locals() and \
               havePatientColumn and \
               haveSampleColumn:

                wrongCodeSamples = []
                # This is to check if oncotree codes match the sex,
                # returns list of samples that have conflicting codes and sex
                for code, patient, sample in zip(clinicalDF['ONCOTREE_CODE'],
                                                 clinicalDF['PATIENT_ID'],
                                                 clinicalDF['SAMPLE_ID']):

                    if oncotree_mapping_dict.get(code) is not None and \
                       sum(clinicalDF['PATIENT_ID'] == patient) > 0:

                        primaryCode = oncotree_mapping_dict[code][
                            'ONCOTREE_PRIMARY_NODE']

                        sex = clinicalDF['SEX'][clinicalDF['PATIENT_ID'] ==
                                                patient].values[0]
                        sex = float('nan') if sex == '' else float(sex)
                        if oncotree_mapping_dict[code][
                                'ONCOTREE_PRIMARY_NODE'] in maleOncoCodes and \
                           sex != 1.0:

                            wrongCodeSamples.append(sample)
                        if oncotree_mapping_dict[code][
                                'ONCOTREE_PRIMARY_NODE'] in womenOncoCodes and\
                           sex != 2.0:

                            wrongCodeSamples.append(sample)
                if len(wrongCodeSamples) > 0:
                    warning += (
                        "Sample Clinical File: Some SAMPLE_IDs have "
                        "conflicting SEX and ONCOTREE_CODES: {}\n".format(
                            ",".join(wrongCodeSamples)))
        else:
            total_error += \
                "Sample Clinical File: Must have ONCOTREE_CODE column.\n"

        warn, error = process_functions.check_col_and_values(
            clinicalDF,
            "SAMPLE_TYPE",
            sampleType_mapping['CODE'].tolist(),
            "Sample Clinical File",
            required=True)
        total_error += error

        # CHECK: SEQ_ASSAY_ID
        haveColumn = \
            process_functions.checkColExist(clinicalDF, "SEQ_ASSAY_ID")
        if haveColumn:
            if not all([i != "" for i in clinicalDF['SEQ_ASSAY_ID']]):
                total_error += (
                    "Sample Clinical File: Please double check your "
                    "SEQ_ASSAY_ID columns, there are empty rows.\n")
            # must remove empty seq assay ids first
            # Checking if seq assay ids start with the center name
            seqAssayIds = \
                clinicalDF.SEQ_ASSAY_ID[clinicalDF.SEQ_ASSAY_ID != ""]
            allSeqAssays = seqAssayIds.unique()
            notNormalized = []
            not_caps = []
            for seqassay in allSeqAssays:
                # SEQ Ids are all capitalized now, so no need to check
                # for differences in case
                if not seqassay.upper().startswith(self.center):
                    not_caps.append(seqassay)
            if len(not_caps) > 0:
                total_error += ("Sample Clinical File: Please make sure your "
                                "SEQ_ASSAY_IDs start with your center "
                                "abbreviation: {}.\n".format(
                                    ", ".join(not_caps)))
        else:
            total_error += \
                "Sample Clinical File: Must have SEQ_ASSAY_ID column.\n"

        haveColumn = process_functions.checkColExist(clinicalDF, "SEQ_DATE")
        seq_date_error = (
            "Sample Clinical File: SEQ_DATE must be one of five values- "
            "For Jan-March: use Jan-YEAR. "
            "For Apr-June: use Apr-YEAR. "
            "For July-Sep: use Jul-YEAR. "
            "For Oct-Dec: use Oct-YEAR. (ie. Apr-2017) "
            "For values that don't have SEQ_DATES that "
            "you want released use 'release'.\n")

        if haveColumn:
            clinicalDF['SEQ_DATE'] = [
                i.title() for i in clinicalDF['SEQ_DATE'].astype(str)
            ]

            seqDate = clinicalDF['SEQ_DATE'][
                clinicalDF['SEQ_DATE'] != 'Release']
            if sum(clinicalDF['SEQ_DATE'] == '') > 0:
                total_error += (
                    "Sample Clinical File: Samples without SEQ_DATEs will "
                    "NOT be released.\n")
            try:
                if not seqDate.empty:
                    dates = seqDate.apply(
                        lambda date: datetime.datetime.strptime(date, '%b-%Y'))
                    # REMOVE JUN LATER
                    if not all([
                            i.startswith(("Jul", "Jan", "Oct", "Apr"))
                            for i in seqDate
                    ]):
                        total_error += seq_date_error
            except ValueError:
                total_error += seq_date_error
        else:
            total_error += "Sample Clinical File: Must have SEQ_DATE column.\n"

        # CHECK: BIRTH_YEAR
        birth_year = "BIRTH_YEAR"
        haveColumn = process_functions.checkColExist(clinicalDF, birth_year)
        if haveColumn:
            birth_year_df = \
                clinicalDF[~clinicalDF[birth_year].isin(['Unknown'])]
            # Deal with HIPAA converted rows from DFCI
            # First for loop can't int(text) because there are
            # instances that have <YYYY
            birth_year_df[birth_year] = \
                remove_greaterthan_lessthan_str(birth_year_df[birth_year])

            try:
                years = birth_year_df[birth_year].apply(
                    lambda x: datetime.datetime.strptime(str(int(
                        x)), '%Y').year > datetime.datetime.utcnow().year)

                assert not years.any()
            except Exception:
                total_error += (
                    "Patient Clinical File: Please double check your "
                    "BIRTH_YEAR column, it must be an integer in YYYY format "
                    "> {year} or 'Unknown'.\n".format(
                        year=datetime.datetime.utcnow().year))
        else:
            total_error += \
                "Patient Clinical File: Must have BIRTH_YEAR column.\n"

        # CHECK: VITAL_STATUS
        # YEAR DEATH
        haveColumn = process_functions.checkColExist(clinicalDF, "YEAR_DEATH")
        if haveColumn:
            notNullYears = clinicalDF.YEAR_DEATH[~clinicalDF.YEAR_DEATH.isin(
                ['Unknown', 'Not Collected', 'Not Applicable'])]
            try:
                notNullYears.apply(
                    lambda x: datetime.datetime.strptime(str(int(x)), '%Y'))
            except Exception:
                total_error += (
                    "Patient Clinical File: Please double check your "
                    "YEAR_DEATH column, it must be an integer in YYYY format, "
                    "'Unknown', 'Not Applicable' or 'Not Collected'.\n")
        else:
            total_error += \
                "Patient Clinical File: Must have YEAR_DEATH column.\n"

        # YEAR CONTACT
        haveColumn = process_functions.checkColExist(clinicalDF,
                                                     "YEAR_CONTACT")
        if haveColumn:
            notNullYears = clinicalDF.YEAR_CONTACT[
                ~clinicalDF.YEAR_CONTACT.isin(['Unknown', 'Not Collected'])]
            try:
                notNullYears.apply(
                    lambda x: datetime.datetime.strptime(str(int(x)), '%Y'))
            except Exception:
                total_error += (
                    "Patient Clinical File: Please double check your "
                    "YEAR_CONTACT column, it must be an integer in YYYY "
                    "format, 'Unknown' or 'Not Collected'.\n")
        else:
            total_error += \
                "Patient Clinical File: Must have YEAR_CONTACT column.\n"

        # INT CONTACT
        haveColumn = process_functions.checkColExist(clinicalDF, "INT_CONTACT")
        if haveColumn:
            if not all([
                    process_functions.checkInt(i)
                    for i in clinicalDF.INT_CONTACT if i not in
                ['>32485', '<6570', 'Unknown', 'Not Collected']
            ]):

                total_error += (
                    "Patient Clinical File: Please double check your "
                    "INT_CONTACT column, it must be an integer, '>32485', "
                    "'<6570', 'Unknown' or 'Not Collected'.\n")
        else:
            total_error += \
                "Patient Clinical File: Must have INT_CONTACT column.\n"

        # INT DOD
        haveColumn = process_functions.checkColExist(clinicalDF, "INT_DOD")
        if haveColumn:
            if not all([
                    process_functions.checkInt(i)
                    for i in clinicalDF.INT_DOD if i not in [
                        '>32485', '<6570', 'Unknown', 'Not Collected',
                        'Not Applicable'
                    ]
            ]):

                total_error += (
                    "Patient Clinical File: Please double check your INT_DOD "
                    "column, it must be an integer, '>32485', '<6570', "
                    "'Unknown', 'Not Collected' or 'Not Applicable'.\n")
        else:
            total_error += \
                "Patient Clinical File: Must have INT_DOD column.\n"

        haveColumn = process_functions.checkColExist(clinicalDF, "DEAD")
        if haveColumn:
            # Need to have check_bool function
            if not all([
                    str(i).upper() in ['TRUE', 'FALSE']
                    for i in clinicalDF.DEAD
                    if i not in ['Unknown', 'Not Collected']
            ]):
                total_error += (
                    "Patient Clinical File: Please double check your "
                    "DEAD column, it must be True, False, 'Unknown' or "
                    "'Not Collected'.\n")
        else:
            total_error += \
                "Patient Clinical File: Must have DEAD column.\n"

        # CHECK: PRIMARY_RACE
        warn, error = process_functions.check_col_and_values(
            clinicalDF, "PRIMARY_RACE", race_mapping['CODE'].tolist(),
            "Patient Clinical File")
        warning += warn
        total_error += error

        # CHECK: SECONDARY_RACE
        warn, error = process_functions.check_col_and_values(
            clinicalDF, "SECONDARY_RACE", race_mapping['CODE'].tolist(),
            "Patient Clinical File")
        warning += warn
        total_error += error

        # CHECK: TERTIARY_RACE
        warn, error = process_functions.check_col_and_values(
            clinicalDF, "TERTIARY_RACE", race_mapping['CODE'].tolist(),
            "Patient Clinical File")
        warning += warn
        total_error += error

        # CHECK: SEX
        warn, error = process_functions.check_col_and_values(
            clinicalDF,
            "SEX",
            sex_mapping['CODE'].tolist(),
            "Patient Clinical File",
            required=True)
        warning += warn
        total_error += error

        # CHECK: ETHNICITY
        warn, error = process_functions.check_col_and_values(
            clinicalDF, "ETHNICITY", ethnicity_mapping['CODE'].tolist(),
            "Patient Clinical File")
        warning += warn
        total_error += error

        return (total_error, warning)
Exemple #9
0
    def process_steps(self, filePath, databaseToSynIdMappingDf, newPath,
                      parentId, oncotreeLink):
        """Process a clinical file and push it to the patient/sample tables.

        Args:
            filePath: Path to the clinical file (tab delimited; '#' lines
                are treated as comments).
            databaseToSynIdMappingDf: Dataframe mapping database names to
                Synapse table ids ('Database' and 'Id' columns).
            newPath: Path the processed clinical file is written to.
            parentId: Synapse id of the parent used by uploadMissingData.
            oncotreeLink: Oncotree URL used to obtain valid ONCOTREE_CODEs.

        Returns:
            newPath, the path of the written processed file.
        """
        # Use positional .values[0] rather than label-based [0]: the
        # boolean-filtered Series keeps the original index labels, so
        # label 0 is not guaranteed to exist.  This also matches how the
        # mapping dataframe is read elsewhere in this module.
        patientSynId = databaseToSynIdMappingDf.Id[
            databaseToSynIdMappingDf['Database'] == "patient"].values[0]
        sampleSynId = databaseToSynIdMappingDf.Id[
            databaseToSynIdMappingDf['Database'] == "sample"].values[0]

        clinicalDf = pd.read_csv(filePath, sep="\t", comment="#")

        patient = False
        sample = False
        # These synapse ids for the clinical tier release scope is
        # hardcoded because it never changes
        patientColsTable = self.syn.tableQuery(
            'select fieldName from syn8545211 where patient is '
            'True and inClinicalDb is True')
        patientCols = patientColsTable.asDataFrame()['fieldName'].tolist()
        sampleColsTable = self.syn.tableQuery(
            'select fieldName from syn8545211 where sample is True '
            'and inClinicalDb is True')
        sampleCols = sampleColsTable.asDataFrame()['fieldName'].tolist()

        # Template choice is driven by the file name; a file that is
        # neither 'patient' nor 'sample' carries both column sets.
        if "patient" in filePath.lower():
            clinicalTemplate = pd.DataFrame(columns=patientCols)
            patient = True
        elif "sample" in filePath.lower():
            clinicalTemplate = pd.DataFrame(columns=sampleCols)
            sample = True
        else:
            clinicalTemplate = pd.DataFrame(columns=set(patientCols +
                                                        sampleCols))
            sample = True
            patient = True

        newClinicalDf = self._process(clinicalDf, clinicalTemplate)

        if patient:
            patientClinical = newClinicalDf[patientCols].drop_duplicates(
                "PATIENT_ID")
            self.uploadMissingData(patientClinical, "PATIENT_ID", patientSynId,
                                   parentId)
            # retractedPatientSynId)
            process_functions.updateData(self.syn,
                                         patientSynId,
                                         patientClinical,
                                         self.center,
                                         col=patientCols,
                                         toDelete=True)
        if sample:
            if newClinicalDf["SAMPLE_ID"].duplicated().any():
                logger.error("There are duplicated samples, "
                             "and the duplicates are removed")
            sampleClinical = newClinicalDf[sampleCols].drop_duplicates(
                "SAMPLE_ID")
            # Exclude all clinical samples with wrong oncotree codes
            oncotree_mapping_dict = \
                process_functions.get_oncotree_code_mappings(oncotreeLink)
            # Add in unknown key for oncotree code
            oncotree_mapping_dict['UNKNOWN'] = {}
            # Make oncotree codes uppercase (SpCC/SPCC)
            sampleClinical['ONCOTREE_CODE'] = sampleClinical[
                'ONCOTREE_CODE'].astype(str).str.upper()
            # Membership test directly against the mapping's keys; no need
            # to round-trip them through an intermediate DataFrame column.
            sampleClinical = sampleClinical[
                sampleClinical['ONCOTREE_CODE'].isin(
                    oncotree_mapping_dict.keys())]
            self.uploadMissingData(sampleClinical, "SAMPLE_ID", sampleSynId,
                                   parentId)
            # ,retractedSampleSynId)
            process_functions.updateData(self.syn,
                                         sampleSynId,
                                         sampleClinical,
                                         self.center,
                                         col=sampleCols,
                                         toDelete=True)

        newClinicalDf.to_csv(newPath, sep="\t", index=False)
        return (newPath)
def _count_codes_per_center(clinicaldf, code_column):
    """Build a per-center count table of the values in ``code_column``.

    Args:
        clinicaldf: clinical dataframe with a 'CENTER' column and the
            requested code column.
        code_column: name of the column holding the oncotree codes to count.

    Returns:
        Dataframe indexed by code with one integer count column per center,
        a 'Total' column, and an 'Oncotree_Code' column mirroring the index.
    """
    distributiondf = pd.DataFrame(
        columns=set(clinicaldf["CENTER"]),
        index=set(clinicaldf[code_column]))
    for center in distributiondf.columns:
        code_counts = clinicaldf[code_column][clinicaldf["CENTER"] ==
                                              center].value_counts()
        distributiondf[center] = code_counts
    distributiondf = distributiondf.fillna(0)
    distributiondf = distributiondf.applymap(int)
    # Vectorized row sum over the center columns ('Total' not yet present).
    distributiondf["Total"] = distributiondf.sum(axis=1)
    distributiondf["Oncotree_Code"] = distributiondf.index
    return distributiondf


def _update_distribution_table(syn, clinicaldf, distributiondf, table_synid):
    """Sync a code-distribution Synapse table with freshly computed counts.

    Args:
        syn: synapse object
        clinicaldf: clinical dataframe (supplies the center column names)
        distributiondf: new counts from _count_codes_per_center
        table_synid: Synapse id of the distribution table to update
    """
    existing = syn.tableQuery("SELECT %s FROM %s" % (
        "Oncotree_Code," + ",".join(clinicaldf["CENTER"].unique()) + ",Total",
        table_synid,
    ))
    process_functions.updateDatabase(
        syn,
        existing.asDataFrame(),
        distributiondf,
        table_synid,
        ["Oncotree_Code"],
        to_delete=True,
    )


def update_oncotree_code_tables(syn, database_mappingdf):
    """
    Updates database statistics of oncotree codes
    and primary oncotree codes

    Args:
        syn: synapse object
        database_mappingdf: mapping between synapse ids and database
    """
    oncotree_distribution_synid = database_mappingdf["Id"][
        database_mappingdf["Database"] == "oncotree"].values[0]

    clinical = syn.tableQuery("select * from syn7517674")
    clinicaldf = clinical.asDataFrame()

    # DISTRIBUTION OF ONCOTREE CODE TABLE UPDATE
    oncotree_code_distributiondf = _count_codes_per_center(
        clinicaldf, "ONCOTREE_CODE")
    _update_distribution_table(syn, clinicaldf, oncotree_code_distributiondf,
                               oncotree_distribution_synid)

    # DISTRIBUTION OF PRIMARY CODE TABLE UPDATE
    oncotree_link_synid = database_mappingdf["Id"][
        database_mappingdf["Database"] == "oncotreeLink"].values[0]
    primary_code_synId = database_mappingdf["Id"][
        database_mappingdf["Database"] == "primaryCode"].values[0]

    # Can also use most up to date oncotree code,
    # because these tables are updated from the database
    oncotree_link_ent = syn.get(oncotree_link_synid)
    oncotree_link = oncotree_link_ent.externalURL
    oncotree_mapping = process_functions.get_oncotree_code_mappings(
        oncotree_link)

    # Codes absent from the current oncotree are labelled as deprecated.
    clinicaldf["PRIMARY_CODES"] = [
        oncotree_mapping[i.upper()]["ONCOTREE_PRIMARY_NODE"]
        if i.upper() in oncotree_mapping else "DEPRECATED_CODE"
        for i in clinicaldf.ONCOTREE_CODE
    ]

    # ### DISTRIBUTION OF PRIMARY ONCOTREE CODE TABLE UPDATE
    primary_code_distributiondf = _count_codes_per_center(
        clinicaldf, "PRIMARY_CODES")
    _update_distribution_table(syn, clinicaldf, primary_code_distributiondf,
                               primary_code_synId)