Esempio n. 1
0
    def _validate(self, vitalStatusDf):
        """Validate the columns of a vital status dataframe.

        Every column listed below must be present; a missing column adds a
        "Must have ... column" error.  When present, the values are checked
        as follows:

        - PATIENT_ID: no null values
        - YEAR_DEATH, YEAR_CONTACT: non-null values must parse as a
          ``%Y`` year after integer conversion
        - INT_CONTACT, INT_DOD: non-null values must pass
          ``process_functions.checkInt`` or be '>32485' / '<6570'
        - DEAD: non-null values must be ``bool`` instances

        Args:
            vitalStatusDf: Vital status dataframe

        Returns:
            tuple: (error message, warning message).  No warnings are
            generated by this validator, so the second item is always "".
        """
        total_error = ""
        warning = ""

        # PATIENT ID
        if process_functions.checkColExist(vitalStatusDf, "PATIENT_ID"):
            if vitalStatusDf.PATIENT_ID.isnull().any():
                total_error += (
                    "Vital status file: Please double check your "
                    "PATIENT_ID column. No null values allowed.\n")
        else:
            total_error += "Vital status file: Must have PATIENT_ID column.\n"

        # YEAR DEATH, YEAR CONTACT
        for year_col in ("YEAR_DEATH", "YEAR_CONTACT"):
            if process_functions.checkColExist(vitalStatusDf, year_col):
                years = vitalStatusDf[year_col]
                notNullYears = years[~years.isnull()]
                try:
                    notNullYears.apply(
                        lambda x: datetime.datetime.strptime(
                            str(int(x)), '%Y'))
                # Only the failures int()/strptime can produce for bad data;
                # a bare except would also hide KeyboardInterrupt/SystemExit
                except (ValueError, TypeError, OverflowError):
                    total_error += (
                        "Vital status file: Please double check your {} "
                        "column, it must be an integer in YYYY format or an "
                        "empty string.\n").format(year_col)
            else:
                total_error += (
                    "Vital status file: Must have {} column.\n"
                ).format(year_col)

        # INT CONTACT, INT DOD
        allowed_interval_strings = ['>32485', '<6570']
        for interval_col in ("INT_CONTACT", "INT_DOD"):
            if process_functions.checkColExist(vitalStatusDf, interval_col):
                intervals_ok = all(
                    process_functions.checkInt(i)
                    for i in vitalStatusDf[interval_col]
                    if not pd.isnull(i) and i not in allowed_interval_strings)
                if not intervals_ok:
                    total_error += (
                        "Vital status file: Please double check your {} "
                        "column, it must be an integer, an empty string, "
                        ">32485, or <6570.\n").format(interval_col)
            else:
                total_error += (
                    "Vital status file: Must have {} column.\n"
                ).format(interval_col)

        # DEAD
        if process_functions.checkColExist(vitalStatusDf, "DEAD"):
            if not all(isinstance(i, bool)
                       for i in vitalStatusDf.DEAD if not pd.isnull(i)):
                total_error += (
                    "Vital status file: Please double check your DEAD "
                    "column, it must be a boolean value or an empty "
                    "string.\n")
        else:
            total_error += "Vital status file: Must have DEAD column.\n"

        return(total_error, warning)
Esempio n. 2
0
    def update_clinical(self, row):
        """Return a normalized copy of one row of the clinical file.

        The input row is left untouched.  On the copy: PATIENT_ID and
        SAMPLE_ID go through ``process_functions.checkGenieId``; BIRTH_YEAR,
        AGE_AT_SEQ_REPORT, YEAR_CONTACT and YEAR_DEATH are cast to ``int``
        when ``process_functions.checkInt`` accepts them; SEQ_ASSAY_ID has
        underscores replaced by dashes and is uppercased; SEQ_DATE is
        title-cased and SEQ_YEAR is derived from it; finally every string
        value has surrounding spaces stripped.
        """
        # Work on a copy so the caller's row is not overwritten
        record = row.copy()

        def normalize_genie_id(column):
            # Rewrite an identifier column through checkGenieId when present
            current = record.get(column)
            if current is None:
                return
            record[column] = process_functions.checkGenieId(
                current, self.center
            )

        def cast_to_int(column):
            # Convert a present value to int only when checkInt accepts it
            current = record.get(column)
            if current is not None and process_functions.checkInt(current):
                record[column] = int(current)

        normalize_genie_id("PATIENT_ID")
        cast_to_int("BIRTH_YEAR")
        normalize_genie_id("SAMPLE_ID")
        cast_to_int("AGE_AT_SEQ_REPORT")

        # SEQ ASSAY ID: dashes instead of underscores, always uppercase
        assay_id = record.get("SEQ_ASSAY_ID")
        if assay_id is not None:
            record["SEQ_ASSAY_ID"] = assay_id.replace("_", "-").upper()

        # SEQ DATE: title-case, then take the year after the dash
        seq_date = record.get("SEQ_DATE")
        if seq_date is not None:
            seq_date = seq_date.title()
            record["SEQ_DATE"] = seq_date
            if str(seq_date) == "Release":
                record["SEQ_YEAR"] = float("nan")
            else:
                record["SEQ_YEAR"] = int(str(seq_date).split("-")[1])

        cast_to_int("YEAR_CONTACT")
        cast_to_int("YEAR_DEATH")

        # Strip surrounding spaces from every string value
        for column in record.keys():
            current = record[column]
            if isinstance(current, str):
                record[column] = current.strip(" ")
        return record
Esempio n. 3
0
    def _validate(self, assay_info_df):
        '''
        Validates the values of assay information file

        Checks performed, in order:
            - SEQ_ASSAY_ID must exist and every unique value must start
              with self.center
            - is_paired_end, library_selection, library_strategy, platform
              and instrument_model are checked with
              process_functions.check_col_and_values (required=True); the
              allowed values of all but is_paired_end come from the
              "read_group" GDC data dictionary
            - variant_classifications is checked against a fixed list
              (na_allowed=True)
            - read_length must exist; non-null values must pass checkInt
            - number_of_genes must exist; every value must pass checkInt
            - gene_padding is optional (a warning is added when missing);
              non-null values must pass checkInt

        Args:
            assay_info_df: assay information dataframe

        Returns:
            tuple: error and warning
        '''

        total_error = ""
        warning = ""

        if process_functions.checkColExist(assay_info_df, "SEQ_ASSAY_ID"):
            all_seq_assays = assay_info_df.SEQ_ASSAY_ID.unique()
            if not all(
                    assay.startswith(self.center)
                    for assay in all_seq_assays):
                total_error += \
                    "Assay_information.yaml: Please make sure your all your" +\
                    " SEQ_ASSAY_IDs start with your center abbreviation.\n"
        else:
            total_error += \
                "Assay_information.yaml: Must have SEQ_ASSAY_ID column.\n"

        read_group_dict = process_functions.get_gdc_data_dictionary(
            "read_group")
        read_group_headers = read_group_dict['properties']

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            'is_paired_end', [True, False],
            filename="Assay_information.yaml",
            required=True)
        warning += warn
        total_error += error

        # Columns whose allowed values are taken as-is from the dictionary
        for gdc_col in ('library_selection', 'library_strategy', 'platform'):
            warn, error = process_functions.check_col_and_values(
                assay_info_df,
                gdc_col,
                read_group_headers[gdc_col]['enum'],
                filename="Assay_information.yaml",
                required=True)
            warning += warn
            total_error += error

        # Copy the enum before adding None: appending to the list in place
        # would modify the dictionary returned by get_gdc_data_dictionary,
        # which accumulates extra None entries if that object is shared or
        # cached between calls.
        instrument_model = list(read_group_headers['instrument_model']['enum'])
        instrument_model.append(None)
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            'instrument_model',
            instrument_model,
            filename="Assay_information.yaml",
            required=True)
        warning += warn
        total_error += error

        variant_classes = \
            ['Splice_Site', 'Nonsense_Mutation', 'Frame_Shift_Del',
             'Frame_Shift_Ins', 'Nonstop_Mutation', 'Translation_Start_Site',
             'In_Frame_Ins', 'In_Frame_Del', 'Missense_Mutation',
             'Intron', 'Splice_Region', 'Silent', 'RNA', "5'UTR", "3'UTR",
             'IGR', "5'Flank", "3'Flank", None]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            'variant_classifications',
            variant_classes,
            filename="Assay_information.yaml",
            na_allowed=True)
        warning += warn
        total_error += error

        if process_functions.checkColExist(assay_info_df, "read_length"):
            # Null values are allowed and skipped
            if not all(
                    process_functions.checkInt(i)
                    for i in assay_info_df["read_length"]
                    if i is not None and not pd.isnull(i)):
                total_error += \
                    ("Assay_information.yaml: "
                     "Please double check your read_length.  "
                     "It must be an integer or null.\n")
        else:
            total_error += \
                ("Assay_information.yaml: "
                 "Must have read_length column.\n")

        if process_functions.checkColExist(assay_info_df, "number_of_genes"):
            # No null filtering here: every value is passed to checkInt
            if not all(
                    process_functions.checkInt(i)
                    for i in assay_info_df["number_of_genes"]):
                total_error += \
                    ("Assay_information.yaml: "
                     "Please double check your number_of_genes. "
                     "It must be an integer.\n")
        else:
            total_error += \
                ("Assay_information.yaml: "
                 "Must have number_of_genes column.\n")

        if process_functions.checkColExist(assay_info_df, "gene_padding"):
            # Null values are allowed and skipped
            if not all(
                    process_functions.checkInt(i)
                    for i in assay_info_df["gene_padding"]
                    if i is not None and not pd.isnull(i)):
                total_error += \
                    ("Assay_information.yaml: "
                     "Please double check your gene_padding. "
                     "It must be an integer or blank.\n")
        else:
            # Missing gene_padding is only a warning
            warning += \
                ("Assay_information.yaml: "
                 "gene_padding is by default 10 if not specified.\n")

        return (total_error, warning)
Esempio n. 4
0
    def _validate(self, clinicaldf, oncotree_link):
        """
        This function validates the clinical file to make sure it adhere
        to the clinical SOP.

        Column names are uppercased on the passed dataframe, completely
        empty rows are reported and dropped, and remaining nulls are
        replaced by "" before the per-column checks run.  Errors and
        warnings are accumulated in the order the checks are performed.

        Args:
            clinicaldf: Merged clinical file with patient and sample
                        information
            oncotree_link: Link to oncotree

        Returns:
            tuple: (error message, warning message)
        """
        total_error = StringIO()
        warning = StringIO()

        clinicaldf.columns = [col.upper() for col in clinicaldf.columns]
        # CHECK: for empty rows
        empty_rows = clinicaldf.isnull().values.all(axis=1)
        if empty_rows.any():
            total_error.write("Clinical file(s): No empty rows allowed.\n")
            # Remove completely empty rows to speed up processing
            clinicaldf = clinicaldf[~empty_rows]

        # From here on a missing value is the empty string
        clinicaldf = clinicaldf.fillna("")

        oncotree_mapping_dict = process_functions.get_oncotree_code_mappings(
            oncotree_link
        )
        oncotree_mapping = pd.DataFrame(
            {"ONCOTREE_CODE": list(oncotree_mapping_dict.keys())}
        )

        sampletype_mapping = process_functions.getGenieMapping(self.syn, "syn7434273")

        ethnicity_mapping = process_functions.getGenieMapping(self.syn, "syn7434242")

        race_mapping = process_functions.getGenieMapping(self.syn, "syn7434236")

        sex_mapping = process_functions.getGenieMapping(self.syn, "syn7434222")

        # CHECK: SAMPLE_ID
        sample_id = "SAMPLE_ID"
        haveSampleColumn = process_functions.checkColExist(clinicaldf, sample_id)

        if not haveSampleColumn:
            total_error.write("Sample Clinical File: Must have SAMPLE_ID column.\n")
        else:
            if sum(clinicaldf[sample_id].duplicated()) > 0:
                total_error.write(
                    "Sample Clinical File: No duplicated SAMPLE_ID "
                    "allowed.\nIf there are no duplicated "
                    "SAMPLE_IDs, and both sample and patient files are "
                    "uploaded, then please check to make sure no duplicated "
                    "PATIENT_IDs exist in the patient clinical file.\n"
                )
        # CHECK: PATIENT_ID
        patientId = "PATIENT_ID"
        # #CHECK: PATIENT_ID IN SAMPLE FILE
        havePatientColumn = process_functions.checkColExist(clinicaldf, patientId)

        if not havePatientColumn:
            total_error.write("Patient Clinical File: Must have PATIENT_ID column.\n")

        # CHECK: within the sample file that the sample ids match
        # the patient ids
        if haveSampleColumn and havePatientColumn:
            # Make sure sample and patient ids are string cols
            clinicaldf[sample_id] = clinicaldf[sample_id].astype(str)
            clinicaldf[patientId] = clinicaldf[patientId].astype(str)
            if not all(
                [
                    patient in sample
                    for sample, patient in zip(
                        clinicaldf[sample_id], clinicaldf[patientId]
                    )
                ]
            ):

                total_error.write(
                    "Sample Clinical File: PATIENT_ID's much be contained in "
                    "the SAMPLE_ID's (ex. SAGE-1 <-> SAGE-1-2)\n"
                )
            # #CHECK: All samples must have associated patient data
            # (GENIE requires patient data)
            if not all(clinicaldf[patientId] != ""):
                total_error.write(
                    "Patient Clinical File: All samples must have associated "
                    "patient information and no null patient ids allowed. "
                    "These samples are missing patient data: {}\n".format(
                        ", ".join(
                            clinicaldf[sample_id][clinicaldf[patientId] == ""].unique()
                        )
                    )
                )

            # CHECK: All patients should have associated sample data
            if not all(clinicaldf[sample_id] != ""):
                # ## MAKE WARNING FOR NOW###
                warning.write(
                    "Sample Clinical File: All patients must have associated "
                    "sample information. These patients are missing sample "
                    "data: {}\n".format(
                        ", ".join(
                            clinicaldf[patientId][clinicaldf[sample_id] == ""].unique()
                        )
                    )
                )

        # CHECK: AGE_AT_SEQ_REPORT
        age = "AGE_AT_SEQ_REPORT"
        haveColumn = process_functions.checkColExist(clinicaldf, age)
        if haveColumn:
            # Deal with HIPAA converted rows from DFCI
            # First for loop can't int(text) because there
            # are instances that have <3435
            age_seq_report_df = clinicaldf[
                ~clinicaldf[age].isin(["Unknown", ">32485", "<6570"])
            ]
            age_values = age_seq_report_df[age]

            if not all([process_functions.checkInt(i) for i in age_values]):
                total_error.write(
                    "Sample Clinical File: Please double check your "
                    "AGE_AT_SEQ_REPORT. It must be an integer, 'Unknown', "
                    "'>32485', '<6570'.\n"
                )
            else:
                # Convert on a standalone Series rather than assigning back
                # into the filtered slice, which is a chained assignment
                # (SettingWithCopyWarning) and was never read again anyway.
                median_age = age_values.astype(int).median()
                # A median below 100 suggests the ages are in years
                if median_age < 100:
                    total_error.write(
                        "Sample Clinical File: Please double check your "
                        "AGE_AT_SEQ_REPORT. You may be reporting this value "
                        "in YEARS, please report in DAYS.\n"
                    )
        else:
            total_error.write(
                "Sample Clinical File: Must have AGE_AT_SEQ_REPORT column.\n"
            )

        # CHECK: ONCOTREE_CODE
        haveColumn = process_functions.checkColExist(clinicaldf, "ONCOTREE_CODE")
        maleOncoCodes = ["TESTIS", "PROSTATE", "PENIS"]
        womenOncoCodes = ["CERVIX", "VULVA", "UTERUS", "OVARY"]
        if haveColumn:
            # Make oncotree codes uppercase (SpCC/SPCC)
            clinicaldf["ONCOTREE_CODE"] = (
                clinicaldf["ONCOTREE_CODE"].astype(str).str.upper()
            )

            oncotree_codes = clinicaldf["ONCOTREE_CODE"][
                clinicaldf["ONCOTREE_CODE"] != "UNKNOWN"
            ]

            if not all(oncotree_codes.isin(oncotree_mapping["ONCOTREE_CODE"])):
                unmapped_oncotrees = oncotree_codes[
                    ~oncotree_codes.isin(oncotree_mapping["ONCOTREE_CODE"])
                ]
                total_error.write(
                    "Sample Clinical File: Please double check that all your "
                    "ONCOTREE CODES exist in the mapping. You have {} samples "
                    "that don't map. These are the codes that "
                    "don't map: {}\n".format(
                        len(unmapped_oncotrees),
                        ",".join(set(unmapped_oncotrees)),
                    )
                )
            # Should add the SEX mismatch into the dashboard file
            if (
                process_functions.checkColExist(clinicaldf, "SEX")
                and havePatientColumn
                and haveSampleColumn
            ):
                # SEX of the first row seen for each patient, built in one
                # pass so the loop below does not rescan the whole
                # PATIENT_ID column for every sample.
                first_sex_by_patient = {}
                for patient, patient_sex in zip(
                    clinicaldf["PATIENT_ID"], clinicaldf["SEX"]
                ):
                    first_sex_by_patient.setdefault(patient, patient_sex)

                wrongCodeSamples = []
                # This is to check if oncotree codes match the sex,
                # returns list of samples that have conflicting codes and sex
                for code, patient, sample in zip(
                    clinicaldf["ONCOTREE_CODE"],
                    clinicaldf["PATIENT_ID"],
                    clinicaldf["SAMPLE_ID"],
                ):
                    code_info = oncotree_mapping_dict.get(code)
                    # Codes without a mapping were already reported above
                    if code_info is None:
                        continue

                    primaryCode = code_info["ONCOTREE_PRIMARY_NODE"]

                    sex = first_sex_by_patient[patient]
                    # NOTE(review): float() raises ValueError for a
                    # non-numeric SEX value; behavior kept as-is.
                    sex = float("nan") if sex == "" else float(sex)
                    if primaryCode in maleOncoCodes and sex != 1.0:
                        wrongCodeSamples.append(sample)
                    if primaryCode in womenOncoCodes and sex != 2.0:
                        wrongCodeSamples.append(sample)
                if len(wrongCodeSamples) > 0:
                    warning.write(
                        "Sample Clinical File: Some SAMPLE_IDs have "
                        "conflicting SEX and ONCOTREE_CODES: {}\n".format(
                            ",".join(wrongCodeSamples)
                        )
                    )
        else:
            total_error.write("Sample Clinical File: Must have ONCOTREE_CODE column.\n")

        # CHECK: SAMPLE_TYPE (only the error part of the result is used)
        warn, error = process_functions.check_col_and_values(
            clinicaldf,
            "SAMPLE_TYPE",
            sampletype_mapping["CODE"].tolist(),
            "Sample Clinical File",
            required=True,
        )
        total_error.write(error)

        # CHECK: SEQ_ASSAY_ID
        haveColumn = process_functions.checkColExist(clinicaldf, "SEQ_ASSAY_ID")
        if haveColumn:
            if not all([i != "" for i in clinicaldf["SEQ_ASSAY_ID"]]):
                total_error.write(
                    "Sample Clinical File: Please double check your "
                    "SEQ_ASSAY_ID columns, there are empty rows.\n"
                )
            # must remove empty seq assay ids first
            # Checking if seq assay ids start with the center name
            non_empty_seq_idx = clinicaldf.SEQ_ASSAY_ID != ""
            seqassay_ids = clinicaldf.SEQ_ASSAY_ID[non_empty_seq_idx]
            uniq_seqassay_ids = seqassay_ids.unique()
            invalid_seqassay = []
            for seqassay in uniq_seqassay_ids:
                # Compare case-insensitively on the assay id side
                if not seqassay.upper().startswith(self.center):
                    invalid_seqassay.append(seqassay)
            if invalid_seqassay:
                total_error.write(
                    "Sample Clinical File: Please make sure your "
                    "SEQ_ASSAY_IDs start with your center "
                    "abbreviation: {}.\n".format(", ".join(invalid_seqassay))
                )
        else:
            total_error.write("Sample Clinical File: Must have SEQ_ASSAY_ID column.\n")

        # CHECK: SEQ_DATE
        haveColumn = process_functions.checkColExist(clinicaldf, "SEQ_DATE")
        seq_date_error = (
            "Sample Clinical File: SEQ_DATE must be one of five values- "
            "For Jan-March: use Jan-YEAR. "
            "For Apr-June: use Apr-YEAR. "
            "For July-Sep: use Jul-YEAR. "
            "For Oct-Dec: use Oct-YEAR. (ie. Apr-2017) "
            "For values that don't have SEQ_DATES that "
            "you want released use 'release'.\n"
        )

        if haveColumn:
            clinicaldf["SEQ_DATE"] = [
                i.title() for i in clinicaldf["SEQ_DATE"].astype(str)
            ]

            seqdate = clinicaldf["SEQ_DATE"][clinicaldf["SEQ_DATE"] != "Release"]
            if sum(clinicaldf["SEQ_DATE"] == "") > 0:
                total_error.write(
                    "Sample Clinical File: Samples without SEQ_DATEs will "
                    "NOT be released.\n"
                )
            try:
                if not seqdate.empty:
                    # strptime raises ValueError on anything not "%b-%Y"
                    seqdate.apply(
                        lambda date: datetime.datetime.strptime(date, "%b-%Y")
                    )
                    if not seqdate.str.startswith(("Jan", "Apr", "Jul", "Oct")).all():
                        total_error.write(seq_date_error)
            except ValueError:
                total_error.write(seq_date_error)
        else:
            total_error.write("Sample Clinical File: Must have SEQ_DATE column.\n")

        # CHECK: BIRTH_YEAR
        error = _check_year(
            clinicaldf=clinicaldf,
            year_col="BIRTH_YEAR",
            filename="Patient Clinical File",
            allowed_string_values=["Unknown", ">89", "<18"],
        )
        total_error.write(error)

        # CHECK: YEAR DEATH
        error = _check_year(
            clinicaldf=clinicaldf,
            year_col="YEAR_DEATH",
            filename="Patient Clinical File",
            allowed_string_values=[
                "Unknown",
                "Not Collected",
                "Not Applicable",
                "Not Released",
                ">89",
                "<18",
            ],
        )
        total_error.write(error)

        # CHECK: YEAR CONTACT
        error = _check_year(
            clinicaldf=clinicaldf,
            year_col="YEAR_CONTACT",
            filename="Patient Clinical File",
            allowed_string_values=[
                "Unknown",
                "Not Collected",
                "Not Released",
                ">89",
                "<18",
            ],
        )
        total_error.write(error)

        # CHECK: INT CONTACT
        haveColumn = process_functions.checkColExist(clinicaldf, "INT_CONTACT")
        if haveColumn:
            if not all(
                [
                    process_functions.checkInt(i)
                    for i in clinicaldf.INT_CONTACT
                    if i
                    not in [
                        ">32485",
                        "<6570",
                        "Unknown",
                        "Not Collected",
                        "Not Released",
                    ]
                ]
            ):

                total_error.write(
                    "Patient Clinical File: Please double check your "
                    "INT_CONTACT column, it must be an integer, '>32485', "
                    "'<6570', 'Unknown', 'Not Released' or 'Not Collected'.\n"
                )
        else:
            total_error.write("Patient Clinical File: Must have INT_CONTACT column.\n")

        # INT DOD
        haveColumn = process_functions.checkColExist(clinicaldf, "INT_DOD")
        if haveColumn:
            if not all(
                [
                    process_functions.checkInt(i)
                    for i in clinicaldf.INT_DOD
                    if i
                    not in [
                        ">32485",
                        "<6570",
                        "Unknown",
                        "Not Collected",
                        "Not Applicable",
                        "Not Released",
                    ]
                ]
            ):

                total_error.write(
                    "Patient Clinical File: Please double check your INT_DOD "
                    "column, it must be an integer, '>32485', '<6570', "
                    "'Unknown', 'Not Collected', 'Not Released' or "
                    "'Not Applicable'.\n"
                )
        else:
            total_error.write("Patient Clinical File: Must have INT_DOD column.\n")

        # CHECK: DEAD
        haveColumn = process_functions.checkColExist(clinicaldf, "DEAD")
        if haveColumn:
            # Need to have check_bool function
            if not all(
                [
                    str(i).upper() in ["TRUE", "FALSE"]
                    for i in clinicaldf.DEAD
                    if i not in ["Unknown", "Not Collected", "Not Released"]
                ]
            ):
                total_error.write(
                    "Patient Clinical File: Please double check your "
                    "DEAD column, it must be True, False, 'Unknown', "
                    "'Not Released' or 'Not Collected'.\n"
                )
        else:
            total_error.write("Patient Clinical File: Must have DEAD column.\n")
        # CHECK: contact vital status value consistency
        contact_error = _check_int_year_consistency(
            clinicaldf=clinicaldf,
            cols=["YEAR_CONTACT", "INT_CONTACT"],
            string_vals=["Not Collected", "Unknown", "Not Released"],
        )
        total_error.write(contact_error)

        # CHECK: death vital status value consistency
        death_error = _check_int_year_consistency(
            clinicaldf=clinicaldf,
            cols=["YEAR_DEATH", "INT_DOD"],
            string_vals=[
                "Not Collected",
                "Unknown",
                "Not Applicable",
                "Not Released",
            ],
        )
        total_error.write(death_error)
        death_error = _check_int_dead_consistency(clinicaldf=clinicaldf)
        total_error.write(death_error)

        # CHECK: SAMPLE_CLASS is optional attribute
        have_column = process_functions.checkColExist(clinicaldf, "SAMPLE_CLASS")
        if have_column:
            sample_class_vals = pd.Series(clinicaldf["SAMPLE_CLASS"].unique().tolist())
            if not sample_class_vals.isin(["Tumor", "cfDNA"]).all():
                total_error.write(
                    "Sample Clinical File: SAMPLE_CLASS column must "
                    "be 'Tumor', or 'cfDNA'\n"
                )

        # CHECK: PRIMARY_RACE
        warn, error = process_functions.check_col_and_values(
            clinicaldf,
            "PRIMARY_RACE",
            race_mapping["CODE"].tolist(),
            "Patient Clinical File",
        )
        warning.write(warn)
        total_error.write(error)

        # CHECK: SECONDARY_RACE
        warn, error = process_functions.check_col_and_values(
            clinicaldf,
            "SECONDARY_RACE",
            race_mapping["CODE"].tolist(),
            "Patient Clinical File",
        )
        warning.write(warn)
        total_error.write(error)

        # CHECK: TERTIARY_RACE
        warn, error = process_functions.check_col_and_values(
            clinicaldf,
            "TERTIARY_RACE",
            race_mapping["CODE"].tolist(),
            "Patient Clinical File",
        )
        warning.write(warn)
        total_error.write(error)

        # CHECK: SEX
        warn, error = process_functions.check_col_and_values(
            clinicaldf,
            "SEX",
            sex_mapping["CODE"].tolist(),
            "Patient Clinical File",
            required=True,
        )
        warning.write(warn)
        total_error.write(error)

        # CHECK: ETHNICITY
        warn, error = process_functions.check_col_and_values(
            clinicaldf,
            "ETHNICITY",
            ethnicity_mapping["CODE"].tolist(),
            "Patient Clinical File",
        )
        warning.write(warn)
        total_error.write(error)

        return total_error.getvalue(), warning.getvalue()
Esempio n. 5
0
    def update_clinical(self, x, sex_mapping, race_mapping, ethnicity_mapping,
                        sample_type):
        """Transform the values of one row of the clinical file.

        The row is modified in place and also returned.  Coded columns are
        translated with ``process_functions.getCODE``, identifiers are
        passed through ``process_functions.checkGenieId``, integer-like
        values are cast to ``int``, and several columns are filled with
        "Not Collected" when absent or None.

        Args:
            x: Row of the clinical file (mapping-like, supports ``get``,
               item assignment and ``keys``)
            sex_mapping: Mapping passed to getCODE for SEX
            race_mapping: Mapping passed to getCODE for the race columns
            ethnicity_mapping: Mapping passed to getCODE for ETHNICITY
            sample_type: Mapping passed to getCODE for SAMPLE_TYPE

        Returns:
            The same row object with transformed values
        """
        # PATIENT ID
        if x.get("PATIENT_ID") is not None:
            x['PATIENT_ID'] = process_functions.checkGenieId(
                x['PATIENT_ID'], self.center)
        # RACE
        for race_col in ('PRIMARY_RACE', 'SECONDARY_RACE', 'TERTIARY_RACE'):
            if x.get(race_col) is not None:
                x[race_col] = process_functions.getCODE(
                    race_mapping, x[race_col])
            else:
                x[race_col] = "Not Collected"
        # ETHNICITY
        if x.get('ETHNICITY') is not None:
            x['ETHNICITY'] = process_functions.getCODE(ethnicity_mapping,
                                                       x['ETHNICITY'])
        else:
            x['ETHNICITY'] = "Not Collected"
        # BIRTH YEAR
        if x.get("BIRTH_YEAR") is not None:
            # BIRTH YEAR (Check if integer)
            if process_functions.checkInt(x['BIRTH_YEAR']):
                x['BIRTH_YEAR'] = int(x['BIRTH_YEAR'])
        # SEX
        if x.get("SEX") is not None:
            x['SEX'] = process_functions.getCODE(sex_mapping, x['SEX'])
        # SAMPLE ID
        if x.get('SAMPLE_ID') is not None:
            x['SAMPLE_ID'] = process_functions.checkGenieId(
                x['SAMPLE_ID'], self.center)
        # AGE AT SEQ REPORT
        if x.get('AGE_AT_SEQ_REPORT') is not None:
            if process_functions.checkInt(x['AGE_AT_SEQ_REPORT']):
                x['AGE_AT_SEQ_REPORT'] = int(x['AGE_AT_SEQ_REPORT'])

        # SEQ ASSAY ID
        if x.get('SEQ_ASSAY_ID') is not None:
            x['SEQ_ASSAY_ID'] = x['SEQ_ASSAY_ID'].replace('_', '-')
            # standardize all SEQ_ASSAY_ID with uppercase
            x['SEQ_ASSAY_ID'] = x['SEQ_ASSAY_ID'].upper()

        # SAMPLE_TYPE
        if x.get('SAMPLE_TYPE') is not None:
            sampleType = x['SAMPLE_TYPE']
            x['SAMPLE_TYPE'] = process_functions.getCODE(
                sample_type, sampleType)
            # Detailed value comes from the same lookup with
            # useDescription=True
            x['SAMPLE_TYPE_DETAILED'] = process_functions.getCODE(
                sample_type, sampleType, useDescription=True)

        # SEQ DATE: title-case, then take the year after the dash
        if x.get('SEQ_DATE') is not None:
            x['SEQ_DATE'] = x['SEQ_DATE'].title()
            # float("nan") instead of pd.np.nan: the pandas.np alias was
            # deprecated in pandas 1.0 and removed in pandas 2.0
            x['SEQ_YEAR'] = \
                int(str(x['SEQ_DATE']).split("-")[1]) \
                if str(x['SEQ_DATE']) != "Release" else float("nan")

        if x.get('YEAR_CONTACT') is None:
            x['YEAR_CONTACT'] = 'Not Collected'
        else:
            if process_functions.checkInt(x['YEAR_CONTACT']):
                x['YEAR_CONTACT'] = int(x['YEAR_CONTACT'])

        if x.get('YEAR_DEATH') is None:
            x['YEAR_DEATH'] = 'Not Collected'
        else:
            if process_functions.checkInt(x['YEAR_DEATH']):
                x['YEAR_DEATH'] = int(x['YEAR_DEATH'])

        if x.get('INT_CONTACT') is None:
            x['INT_CONTACT'] = 'Not Collected'

        if x.get('INT_DOD') is None:
            x['INT_DOD'] = 'Not Collected'

        if x.get('DEAD') is None:
            x['DEAD'] = 'Not Collected'

        # Strip surrounding spaces from every string value
        for i in x.keys():
            if isinstance(x[i], str):
                x[i] = x[i].strip(" ")
        return (x)
Esempio n. 6
0
    def _validate(self, clinicalDF, oncotreeLink):
        """
        Validate the merged clinical file against the clinical SOP.

        The column names of the passed dataframe are upper-cased in place;
        every other change is made on a null-filled copy.

        Args:
            clinicalDF: Merged clinical file with patient and sample
                        information
            oncotreeLink: Link to oncotree

        Returns:
            tuple: (total_error, warning) message strings. Each one is an
                   empty string when nothing was found.
        """
        total_error = ""
        warning = ""

        clinicalDF.columns = [col.upper() for col in clinicalDF.columns]
        clinicalDF = clinicalDF.fillna("")

        oncotree_mapping_dict = \
            process_functions.get_oncotree_code_mappings(oncotreeLink)
        oncotree_mapping = pd.DataFrame()
        oncotree_mapping['ONCOTREE_CODE'] = list(oncotree_mapping_dict.keys())

        sampleType_mapping = \
            process_functions.getGenieMapping(self.syn, "syn7434273")

        ethnicity_mapping = \
            process_functions.getGenieMapping(self.syn, "syn7434242")

        race_mapping = \
            process_functions.getGenieMapping(self.syn, "syn7434236")

        sex_mapping = \
            process_functions.getGenieMapping(self.syn, "syn7434222")

        # CHECK: SAMPLE_ID
        sampleId = 'SAMPLE_ID'
        haveSampleColumn = \
            process_functions.checkColExist(clinicalDF, sampleId)

        if not haveSampleColumn:
            total_error += \
                "Sample Clinical File: Must have SAMPLE_ID column.\n"
        elif clinicalDF[sampleId].duplicated().any():
            total_error += (
                "Sample Clinical File: No duplicated SAMPLE_ID "
                "allowed.\nIf there are no duplicated "
                "SAMPLE_IDs, and both sample and patient files are "
                "uploaded, then please check to make sure no duplicated "
                "PATIENT_IDs exist in the patient clinical file.\n")

        # CHECK: PATIENT_ID
        patientId = "PATIENT_ID"
        havePatientColumn = \
            process_functions.checkColExist(clinicalDF, patientId)

        if not havePatientColumn:
            total_error += \
                "Patient Clinical File: Must have PATIENT_ID column.\n"

        # CHECK: within the sample file that the sample ids match
        # the patient ids
        if haveSampleColumn and havePatientColumn:
            # Make sure sample and patient ids are string cols
            clinicalDF[sampleId] = clinicalDF[sampleId].astype(str)
            clinicalDF[patientId] = clinicalDF[patientId].astype(str)
            if not all([
                    patient in sample for sample, patient in zip(
                        clinicalDF[sampleId], clinicalDF[patientId])
            ]):
                total_error += (
                    "Sample Clinical File: PATIENT_ID's much be contained in "
                    "the SAMPLE_ID's (ex. SAGE-1 <-> SAGE-1-2)\n")

            # CHECK: All samples must have associated patient data
            # (GENIE requires patient data)
            missing_patient = clinicalDF[patientId] == ""
            if missing_patient.any():
                total_error += (
                    "Patient Clinical File: All samples must have associated "
                    "patient information and no null patient ids allowed. "
                    "These samples are missing patient data: {}\n".format(
                        ", ".join(clinicalDF[sampleId][missing_patient]))
                )

            # CHECK: All patients should have associated sample data
            # (reported as a warning for now)
            missing_sample = clinicalDF[sampleId] == ""
            if missing_sample.any():
                warning += (
                    "Sample Clinical File: All patients must have associated "
                    "sample information. These patients are missing sample "
                    "data: {}\n".format(", ".join(
                        clinicalDF[patientId][missing_sample])))

        # CHECK: AGE_AT_SEQ_REPORT
        age = "AGE_AT_SEQ_REPORT"
        haveColumn = process_functions.checkColExist(clinicalDF, age)
        if haveColumn:
            # Deal with HIPAA converted rows from DFCI: values such as
            # "<3435" cannot be passed to int() directly.
            # Explicit copy so the assignment below never targets a view
            # of clinicalDF (avoids SettingWithCopyWarning).
            age_seq_report_df = \
                clinicalDF[~clinicalDF[age].isin(['Unknown'])].copy()

            age_seq_report_df[age] = \
                remove_greaterthan_lessthan_str(age_seq_report_df[age])

            if not all([
                    process_functions.checkInt(i)
                    for i in age_seq_report_df[age]
            ]):
                total_error += (
                    "Sample Clinical File: Please double check your "
                    "AGE_AT_SEQ_REPORT. It must be an integer or 'Unknown'.\n")
            else:
                age_seq_report_df[age] = age_seq_report_df[age].astype(int)
                # Series.median() instead of the removed ``pd.np`` alias
                median_age = age_seq_report_df[age].median()
                if median_age < 100:
                    total_error += (
                        "Sample Clinical File: Please double check your "
                        "AGE_AT_SEQ_REPORT. You may be reporting this value "
                        "in YEARS, please report in DAYS.\n")
        else:
            total_error += \
                "Sample Clinical File: Must have AGE_AT_SEQ_REPORT column.\n"

        # CHECK: ONCOTREE_CODE
        haveColumn = \
            process_functions.checkColExist(clinicalDF, "ONCOTREE_CODE")
        maleOncoCodes = ["TESTIS", "PROSTATE", "PENIS"]
        womenOncoCodes = ["CERVIX", "VULVA", "UTERUS", "OVARY"]
        if haveColumn:
            # Make oncotree codes uppercase (SpCC/SPCC)
            clinicalDF['ONCOTREE_CODE'] = \
                clinicalDF['ONCOTREE_CODE'].astype(str).str.upper()

            oncotree_codes = clinicalDF['ONCOTREE_CODE'][
                clinicalDF['ONCOTREE_CODE'] != "UNKNOWN"]

            is_mapped = oncotree_codes.isin(oncotree_mapping['ONCOTREE_CODE'])
            if not is_mapped.all():
                unmapped_oncotrees = oncotree_codes[~is_mapped]
                total_error += (
                    "Sample Clinical File: Please double check that all your "
                    "ONCOTREE CODES exist in the mapping. You have {} samples "
                    "that don't map. These are the codes that "
                    "don't map: {}\n".format(len(unmapped_oncotrees), ",".join(
                        set(unmapped_oncotrees))))

            if process_functions.checkColExist(clinicalDF, "SEX") and \
               havePatientColumn and \
               haveSampleColumn:

                # SEX of the first row seen for each patient, built once
                # instead of scanning the whole frame for every sample
                sex_by_patient = {}
                for patient, raw_sex in zip(clinicalDF['PATIENT_ID'],
                                            clinicalDF['SEX']):
                    sex_by_patient.setdefault(patient, raw_sex)

                # Collect samples whose oncotree code conflicts with the
                # patient's sex
                wrongCodeSamples = []
                for code, patient, sample in zip(clinicalDF['ONCOTREE_CODE'],
                                                 clinicalDF['PATIENT_ID'],
                                                 clinicalDF['SAMPLE_ID']):
                    code_info = oncotree_mapping_dict.get(code)
                    if code_info is None:
                        continue
                    primary_node = code_info['ONCOTREE_PRIMARY_NODE']

                    # Blank or non-numeric SEX is treated as unknown (NaN)
                    # rather than crashing the validator; invalid SEX
                    # values are reported by the SEX check further down.
                    try:
                        sex = float(sex_by_patient[patient])
                    except (TypeError, ValueError):
                        sex = float('nan')

                    if primary_node in maleOncoCodes and sex != 1.0:
                        wrongCodeSamples.append(sample)
                    if primary_node in womenOncoCodes and sex != 2.0:
                        wrongCodeSamples.append(sample)

                if len(wrongCodeSamples) > 0:
                    warning += (
                        "Sample Clinical File: Some SAMPLE_IDs have "
                        "conflicting SEX and ONCOTREE_CODES: {}\n".format(
                            ",".join(wrongCodeSamples)))
        else:
            total_error += \
                "Sample Clinical File: Must have ONCOTREE_CODE column.\n"

        # CHECK: SAMPLE_TYPE
        warn, error = process_functions.check_col_and_values(
            clinicalDF,
            "SAMPLE_TYPE",
            sampleType_mapping['CODE'].tolist(),
            "Sample Clinical File",
            required=True)
        # Keep the returned warning, as every other column check here does
        warning += warn
        total_error += error

        # CHECK: SEQ_ASSAY_ID
        haveColumn = \
            process_functions.checkColExist(clinicalDF, "SEQ_ASSAY_ID")
        if haveColumn:
            if not all([i != "" for i in clinicalDF['SEQ_ASSAY_ID']]):
                total_error += (
                    "Sample Clinical File: Please double check your "
                    "SEQ_ASSAY_ID columns, there are empty rows.\n")
            # Empty seq assay ids are dropped before checking that the
            # remaining ones start with the center name
            seqAssayIds = \
                clinicalDF.SEQ_ASSAY_ID[clinicalDF.SEQ_ASSAY_ID != ""]
            # Comparison is done on the upper-cased id, so case differences
            # are not reported
            not_caps = [
                seqassay for seqassay in seqAssayIds.unique()
                if not seqassay.upper().startswith(self.center)
            ]
            if len(not_caps) > 0:
                total_error += ("Sample Clinical File: Please make sure your "
                                "SEQ_ASSAY_IDs start with your center "
                                "abbreviation: {}.\n".format(
                                    ", ".join(not_caps)))
        else:
            total_error += \
                "Sample Clinical File: Must have SEQ_ASSAY_ID column.\n"

        # CHECK: SEQ_DATE
        haveColumn = process_functions.checkColExist(clinicalDF, "SEQ_DATE")
        seq_date_error = (
            "Sample Clinical File: SEQ_DATE must be one of five values- "
            "For Jan-March: use Jan-YEAR. "
            "For Apr-June: use Apr-YEAR. "
            "For July-Sep: use Jul-YEAR. "
            "For Oct-Dec: use Oct-YEAR. (ie. Apr-2017) "
            "For values that don't have SEQ_DATES that "
            "you want released use 'release'.\n")

        if haveColumn:
            clinicalDF['SEQ_DATE'] = [
                i.title() for i in clinicalDF['SEQ_DATE'].astype(str)
            ]

            seqDate = clinicalDF['SEQ_DATE'][
                clinicalDF['SEQ_DATE'] != 'Release']
            if (clinicalDF['SEQ_DATE'] == '').any():
                total_error += (
                    "Sample Clinical File: Samples without SEQ_DATEs will "
                    "NOT be released.\n")
            try:
                if not seqDate.empty:
                    # Parsed only for validation: strptime raises
                    # ValueError on anything that is not "<Mon>-<YEAR>"
                    seqDate.apply(
                        lambda date: datetime.datetime.strptime(date, '%b-%Y'))
                    if not all([
                            i.startswith(("Jul", "Jan", "Oct", "Apr"))
                            for i in seqDate
                    ]):
                        total_error += seq_date_error
            except ValueError:
                total_error += seq_date_error
        else:
            total_error += "Sample Clinical File: Must have SEQ_DATE column.\n"

        # CHECK: BIRTH_YEAR
        birth_year = "BIRTH_YEAR"
        haveColumn = process_functions.checkColExist(clinicalDF, birth_year)
        if haveColumn:
            # Explicit copy so the assignment below never targets a view
            birth_year_df = \
                clinicalDF[~clinicalDF[birth_year].isin(['Unknown'])].copy()
            # Deal with HIPAA converted rows from DFCI: values such as
            # "<YYYY" cannot be passed to int() directly
            birth_year_df[birth_year] = \
                remove_greaterthan_lessthan_str(birth_year_df[birth_year])

            current_year = datetime.datetime.utcnow().year
            try:
                in_future = birth_year_df[birth_year].apply(
                    lambda x: datetime.datetime.strptime(
                        str(int(x)), '%Y').year > current_year)
                # Explicit check instead of ``assert``, which is removed
                # when Python runs with -O
                birth_year_ok = not in_future.any()
            except Exception:
                # Any conversion/parsing failure means the column is invalid
                birth_year_ok = False

            if not birth_year_ok:
                # Years later than the current year are rejected, so the
                # message states the upper bound
                total_error += (
                    "Patient Clinical File: Please double check your "
                    "BIRTH_YEAR column, it must be an integer in YYYY format "
                    "<= {year} or 'Unknown'.\n".format(year=current_year))
        else:
            total_error += \
                "Patient Clinical File: Must have BIRTH_YEAR column.\n"

        # CHECK: VITAL_STATUS
        # YEAR DEATH
        haveColumn = process_functions.checkColExist(clinicalDF, "YEAR_DEATH")
        if haveColumn:
            notNullYears = clinicalDF.YEAR_DEATH[~clinicalDF.YEAR_DEATH.isin(
                ['Unknown', 'Not Collected', 'Not Applicable'])]
            try:
                notNullYears.apply(
                    lambda x: datetime.datetime.strptime(str(int(x)), '%Y'))
            except Exception:
                total_error += (
                    "Patient Clinical File: Please double check your "
                    "YEAR_DEATH column, it must be an integer in YYYY format, "
                    "'Unknown', 'Not Applicable' or 'Not Collected'.\n")
        else:
            total_error += \
                "Patient Clinical File: Must have YEAR_DEATH column.\n"

        # YEAR CONTACT
        haveColumn = process_functions.checkColExist(clinicalDF,
                                                     "YEAR_CONTACT")
        if haveColumn:
            notNullYears = clinicalDF.YEAR_CONTACT[
                ~clinicalDF.YEAR_CONTACT.isin(['Unknown', 'Not Collected'])]
            try:
                notNullYears.apply(
                    lambda x: datetime.datetime.strptime(str(int(x)), '%Y'))
            except Exception:
                total_error += (
                    "Patient Clinical File: Please double check your "
                    "YEAR_CONTACT column, it must be an integer in YYYY "
                    "format, 'Unknown' or 'Not Collected'.\n")
        else:
            total_error += \
                "Patient Clinical File: Must have YEAR_CONTACT column.\n"

        # INT CONTACT
        haveColumn = process_functions.checkColExist(clinicalDF, "INT_CONTACT")
        if haveColumn:
            if not all([
                    process_functions.checkInt(i)
                    for i in clinicalDF.INT_CONTACT if i not in
                ['>32485', '<6570', 'Unknown', 'Not Collected']
            ]):
                total_error += (
                    "Patient Clinical File: Please double check your "
                    "INT_CONTACT column, it must be an integer, '>32485', "
                    "'<6570', 'Unknown' or 'Not Collected'.\n")
        else:
            total_error += \
                "Patient Clinical File: Must have INT_CONTACT column.\n"

        # INT DOD
        haveColumn = process_functions.checkColExist(clinicalDF, "INT_DOD")
        if haveColumn:
            if not all([
                    process_functions.checkInt(i)
                    for i in clinicalDF.INT_DOD if i not in [
                        '>32485', '<6570', 'Unknown', 'Not Collected',
                        'Not Applicable'
                    ]
            ]):
                total_error += (
                    "Patient Clinical File: Please double check your INT_DOD "
                    "column, it must be an integer, '>32485', '<6570', "
                    "'Unknown', 'Not Collected' or 'Not Applicable'.\n")
        else:
            total_error += \
                "Patient Clinical File: Must have INT_DOD column.\n"

        # DEAD
        haveColumn = process_functions.checkColExist(clinicalDF, "DEAD")
        if haveColumn:
            # Need to have check_bool function
            if not all([
                    str(i).upper() in ['TRUE', 'FALSE']
                    for i in clinicalDF.DEAD
                    if i not in ['Unknown', 'Not Collected']
            ]):
                total_error += (
                    "Patient Clinical File: Please double check your "
                    "DEAD column, it must be True, False, 'Unknown' or "
                    "'Not Collected'.\n")
        else:
            total_error += \
                "Patient Clinical File: Must have DEAD column.\n"

        # CHECK: PRIMARY_RACE, SECONDARY_RACE, TERTIARY_RACE
        race_codes = race_mapping['CODE'].tolist()
        for race_col in ("PRIMARY_RACE", "SECONDARY_RACE", "TERTIARY_RACE"):
            warn, error = process_functions.check_col_and_values(
                clinicalDF, race_col, race_codes, "Patient Clinical File")
            warning += warn
            total_error += error

        # CHECK: SEX
        warn, error = process_functions.check_col_and_values(
            clinicalDF,
            "SEX",
            sex_mapping['CODE'].tolist(),
            "Patient Clinical File",
            required=True)
        warning += warn
        total_error += error

        # CHECK: ETHNICITY
        warn, error = process_functions.check_col_and_values(
            clinicalDF, "ETHNICITY", ethnicity_mapping['CODE'].tolist(),
            "Patient Clinical File")
        warning += warn
        total_error += error

        return (total_error, warning)
Esempio n. 7
0
    def _validate(self, assay_info_df, project_id):
        """
        Validates the values of assay information file

        Args:
            assay_info_df: assay information dataframe
            project_id: identifier passed to
                        ``process_functions.get_synid_database_mappingdf``
                        to look up the database mapping

        Returns:
            tuple: error and warning
        """

        total_error = ""
        warning = ""

        if process_functions.checkColExist(assay_info_df, "SEQ_ASSAY_ID"):
            # Normalize ids the same way on both sides of the comparison:
            # underscores become dashes, then uppercase
            all_seq_assays = (assay_info_df.SEQ_ASSAY_ID.replace(
                {
                    "_": "-"
                }, regex=True).str.upper().unique())
            if not all(
                [assay.startswith(self.center) for assay in all_seq_assays]):
                total_error += (
                    "Assay_information.yaml: Please make sure all your "
                    "SEQ_ASSAY_IDs start with your center abbreviation.\n")
            db_to_syn_map_df = process_functions.get_synid_database_mappingdf(
                self.syn, project_id)
            sample_synid = process_functions.getDatabaseSynId(
                self.syn, "sample", databaseToSynIdMappingDf=db_to_syn_map_df)
            uniq_seq_df = process_functions.get_syntabledf(
                self.syn,
                f"select distinct(SEQ_ASSAY_ID) as seq from {sample_synid} "
                f"where CENTER = '{self.center}'",
            )
            # These are all the SEQ_ASSAY_IDs that are in the clinical database
            # but not in the assay_information file
            missing_seqs = uniq_seq_df["seq"][
                ~uniq_seq_df["seq"].replace({
                    "_": "-"
                }, regex=True).str.upper().isin(all_seq_assays)]
            if not missing_seqs.empty:
                missing_seqs_str = ", ".join(missing_seqs)
                total_error += (
                    "Assay_information.yaml: You are missing SEQ_ASSAY_IDs: "
                    f"{missing_seqs_str}\n")

        else:
            total_error += "Assay_information.yaml: Must have SEQ_ASSAY_ID column.\n"

        read_group_dict = process_functions.get_gdc_data_dictionary(
            "read_group")
        read_group_headers = read_group_dict["properties"]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "is_paired_end",
            [True, False],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "library_selection",
            read_group_headers["library_selection"]["enum"],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "library_strategy",
            read_group_headers["library_strategy"]["enum"],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "platform",
            read_group_headers["platform"]["enum"],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        # Extend a copy: extending the "enum" list itself would modify the
        # dictionary returned by get_gdc_data_dictionary, and the extra
        # entries would pile up if that object is reused between calls.
        instrument_model = list(read_group_headers["instrument_model"]["enum"])
        instrument_model.extend(["Illumina NovaSeq 6000", None])
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "instrument_model",
            instrument_model,
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        # target_capture_kit only has to exist; its values are not compared
        # against the data dictionary
        if not process_functions.checkColExist(assay_info_df,
                                               "target_capture_kit"):
            total_error += ("Assay_information.yaml: "
                            "Must have target_capture_kit column.\n")

        variant_classes = [
            "Splice_Site",
            "Nonsense_Mutation",
            "Frame_Shift_Del",
            "Frame_Shift_Ins",
            "Nonstop_Mutation",
            "Translation_Start_Site",
            "In_Frame_Ins",
            "In_Frame_Del",
            "Missense_Mutation",
            "Intron",
            "Splice_Region",
            "Silent",
            "RNA",
            "5'UTR",
            "3'UTR",
            "IGR",
            "5'Flank",
            "3'Flank",
            None,
        ]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "variant_classifications",
            variant_classes,
            filename="Assay_information.yaml",
            na_allowed=True,
            sep=";",
        )
        warning += warn
        total_error += error

        if process_functions.checkColExist(assay_info_df, "read_length"):
            # Null entries are allowed and skipped
            if not all([
                    process_functions.checkInt(i)
                    for i in assay_info_df["read_length"]
                    if i is not None and not pd.isnull(i)
            ]):
                total_error += ("Assay_information.yaml: "
                                "Please double check your read_length.  "
                                "It must be an integer or null.\n")
        else:
            total_error += "Assay_information.yaml: " "Must have read_length column.\n"

        if process_functions.checkColExist(assay_info_df, "number_of_genes"):
            if not all([
                    process_functions.checkInt(i)
                    for i in assay_info_df["number_of_genes"]
            ]):
                total_error += ("Assay_information.yaml: "
                                "Please double check your number_of_genes. "
                                "It must be an integer.\n")
        else:
            total_error += ("Assay_information.yaml: "
                            "Must have number_of_genes column.\n")

        if process_functions.checkColExist(assay_info_df, "gene_padding"):
            # Null entries are allowed and skipped
            if not all([
                    process_functions.checkInt(i)
                    for i in assay_info_df["gene_padding"]
                    if i is not None and not pd.isnull(i)
            ]):
                total_error += ("Assay_information.yaml: "
                                "Please double check your gene_padding. "
                                "It must be an integer or blank.\n")
        else:
            # A missing gene_padding column is only a warning
            warning += ("Assay_information.yaml: "
                        "gene_padding is by default 10 if not specified.\n")

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "calling_strategy",
            ["tumor_only", "tumor_normal", "plasma_normal"],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        if process_functions.checkColExist(assay_info_df,
                                           "specimen_tumor_cellularity"):
            # Non-string values (numbers, nulls) fail the format check
            # instead of raising AttributeError on .startswith
            if not all([
                    isinstance(i, str)
                    and i.startswith(">") and i.endswith("%")
                    for i in assay_info_df["specimen_tumor_cellularity"]
            ]):
                total_error += (
                    "Assay_information.yaml: "
                    "Please double check your specimen_tumor_cellularity. "
                    "It must in this format >(num)%. ie. >10%\n")
        else:
            total_error += ("Assay_information.yaml: "
                            "Must have specimen_tumor_cellularity column.\n")

        alteration_types = [
            "snv",
            "small_indels",
            "gene_level_cna",
            "intragenic_cna",
            "structural_variants",
        ]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "alteration_types",
            alteration_types,
            filename="Assay_information.yaml",
            required=True,
            sep=";",
        )
        warning += warn
        total_error += error

        preservation_technique = ["FFPE", "fresh_frozen", "NA"]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "preservation_technique",
            preservation_technique,
            filename="Assay_information.yaml",
            required=True,
            sep=";",
        )
        warning += warn
        total_error += error

        coverage = ["hotspot_regions", "coding_exons", "introns", "promoters"]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "coverage",
            coverage,
            filename="Assay_information.yaml",
            required=True,
            sep=";",
        )
        warning += warn
        total_error += error

        return total_error, warning