def _validate(self, assay_info_df):
    '''
    Validates the values of assay information file

    Args:
        assay_info_df: assay information dataframe

    Returns:
        tuple: error and warning
    '''
    total_error = ""
    warning = ""

    if process_functions.checkColExist(assay_info_df, "SEQ_ASSAY_ID"):
        all_seq_assays = assay_info_df.SEQ_ASSAY_ID.unique()
        if not all(
                [assay.startswith(self.center) for assay in all_seq_assays]):
            total_error += \
                "Assay_information.yaml: Please make sure all your" +\
                " SEQ_ASSAY_IDs start with your center abbreviation.\n"
    else:
        total_error += \
            "Assay_information.yaml: Must have SEQ_ASSAY_ID column.\n"

    read_group_dict = process_functions.get_gdc_data_dictionary(
        "read_group")
    read_group_headers = read_group_dict['properties']

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        'is_paired_end',
        [True, False],
        filename="Assay_information.yaml",
        required=True)
    warning += warn
    total_error += error

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        'library_selection',
        read_group_headers['library_selection']['enum'],
        filename="Assay_information.yaml",
        required=True)
    warning += warn
    total_error += error

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        'library_strategy',
        read_group_headers['library_strategy']['enum'],
        filename="Assay_information.yaml",
        required=True)
    warning += warn
    total_error += error

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        'platform',
        read_group_headers['platform']['enum'],
        filename="Assay_information.yaml",
        required=True)
    warning += warn
    total_error += error

    instrument_model = read_group_headers['instrument_model']['enum']
    instrument_model.append(None)
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        'instrument_model',
        instrument_model,
        filename="Assay_information.yaml",
        required=True)
    warning += warn
    total_error += error

    variant_classes = \
        ['Splice_Site', 'Nonsense_Mutation', 'Frame_Shift_Del',
         'Frame_Shift_Ins', 'Nonstop_Mutation', 'Translation_Start_Site',
         'In_Frame_Ins', 'In_Frame_Del', 'Missense_Mutation', 'Intron',
         'Splice_Region', 'Silent', 'RNA', "5'UTR", "3'UTR", 'IGR',
         "5'Flank", "3'Flank", None]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        'variant_classifications',
        variant_classes,
        filename="Assay_information.yaml",
        na_allowed=True)
    warning += warn
    total_error += error

    # if not process_functions.checkColExist(
    #         assay_info_df, "target_capture_kit"):
    #     total_error += ("Assay_information.yaml: "
    #                     "Must have target_capture_kit column.\n")

    if process_functions.checkColExist(assay_info_df, "read_length"):
        if not all([
                process_functions.checkInt(i)
                for i in assay_info_df["read_length"]
                if i is not None and not pd.isnull(i)]):
            total_error += \
                ("Assay_information.yaml: "
                 "Please double check your read_length. "
                 "It must be an integer or null.\n")
    else:
        total_error += \
            ("Assay_information.yaml: "
             "Must have read_length column.\n")

    if process_functions.checkColExist(assay_info_df, "number_of_genes"):
        if not all([
                process_functions.checkInt(i)
                for i in assay_info_df["number_of_genes"]]):
            total_error += \
                ("Assay_information.yaml: "
                 "Please double check your number_of_genes. "
                 "It must be an integer.\n")
    else:
        total_error += \
            ("Assay_information.yaml: "
             "Must have number_of_genes column.\n")

    if process_functions.checkColExist(assay_info_df, "gene_padding"):
        if not all([
                process_functions.checkInt(i)
                for i in assay_info_df["gene_padding"]
                if i is not None and not pd.isnull(i)]):
            total_error += \
                ("Assay_information.yaml: "
                 "Please double check your gene_padding. "
                 "It must be an integer or blank.\n")
    else:
        warning += \
            ("Assay_information.yaml: "
             "gene_padding is by default 10 if not specified.\n")

    return (total_error, warning)
def _validate(self, clinicaldf, oncotree_link):
    """
    This function validates the clinical file to make sure it adheres
    to the clinical SOP.

    Args:
        clinicaldf: Merged clinical file with patient and sample
                    information
        oncotree_link: Link to oncotree

    Returns:
        Error message
    """
    total_error = StringIO()
    warning = StringIO()

    clinicaldf.columns = [col.upper() for col in clinicaldf.columns]

    # CHECK: for empty rows
    empty_rows = clinicaldf.isnull().values.all(axis=1)
    if empty_rows.any():
        total_error.write("Clinical file(s): No empty rows allowed.\n")
        # Remove completely empty rows to speed up processing
        clinicaldf = clinicaldf[~empty_rows]

    clinicaldf = clinicaldf.fillna("")

    oncotree_mapping_dict = process_functions.get_oncotree_code_mappings(
        oncotree_link
    )
    oncotree_mapping = pd.DataFrame(
        {"ONCOTREE_CODE": list(oncotree_mapping_dict.keys())}
    )

    sampletype_mapping = process_functions.getGenieMapping(self.syn, "syn7434273")
    ethnicity_mapping = process_functions.getGenieMapping(self.syn, "syn7434242")
    race_mapping = process_functions.getGenieMapping(self.syn, "syn7434236")
    sex_mapping = process_functions.getGenieMapping(self.syn, "syn7434222")

    # CHECK: SAMPLE_ID
    sample_id = "SAMPLE_ID"
    haveSampleColumn = process_functions.checkColExist(clinicaldf, sample_id)

    if not haveSampleColumn:
        total_error.write("Sample Clinical File: Must have SAMPLE_ID column.\n")
    else:
        if sum(clinicaldf[sample_id].duplicated()) > 0:
            total_error.write(
                "Sample Clinical File: No duplicated SAMPLE_ID "
                "allowed.\nIf there are no duplicated "
                "SAMPLE_IDs, and both sample and patient files are "
                "uploaded, then please check to make sure no duplicated "
                "PATIENT_IDs exist in the patient clinical file.\n"
            )

    # CHECK: PATIENT_ID
    patientId = "PATIENT_ID"
    # #CHECK: PATIENT_ID IN SAMPLE FILE
    havePatientColumn = process_functions.checkColExist(clinicaldf, patientId)

    if not havePatientColumn:
        total_error.write("Patient Clinical File: Must have PATIENT_ID column.\n")

    # CHECK: within the sample file that the sample ids match
    # the patient ids
    if haveSampleColumn and havePatientColumn:
        # Make sure sample and patient ids are string cols
        clinicaldf[sample_id] = clinicaldf[sample_id].astype(str)
        clinicaldf[patientId] = clinicaldf[patientId].astype(str)
        if not all(
            [
                patient in sample
                for sample, patient in zip(
                    clinicaldf[sample_id], clinicaldf[patientId]
                )
            ]
        ):
            total_error.write(
                "Sample Clinical File: PATIENT_ID's must be contained in "
                "the SAMPLE_ID's (ex. SAGE-1 <-> SAGE-1-2)\n"
            )
        # #CHECK: All samples must have associated patient data
        # (GENIE requires patient data)
        if not all(clinicaldf[patientId] != ""):
            total_error.write(
                "Patient Clinical File: All samples must have associated "
                "patient information and no null patient ids allowed. "
                "These samples are missing patient data: {}\n".format(
                    ", ".join(
                        clinicaldf[sample_id][clinicaldf[patientId] == ""].unique()
                    )
                )
            )
        # CHECK: All patients should have associated sample data
        if not all(clinicaldf[sample_id] != ""):
            # ## MAKE WARNING FOR NOW###
            warning.write(
                "Sample Clinical File: All patients must have associated "
                "sample information. These patients are missing sample "
                "data: {}\n".format(
                    ", ".join(
                        clinicaldf[patientId][clinicaldf[sample_id] == ""].unique()
                    )
                )
            )

    # CHECK: AGE_AT_SEQ_REPORT
    age = "AGE_AT_SEQ_REPORT"
    haveColumn = process_functions.checkColExist(clinicaldf, age)
    if haveColumn:
        # Deal with HIPAA converted rows from DFCI
        # First for loop can't int(text) because there
        # are instances that have <3435
        age_seq_report_df = clinicaldf[
            ~clinicaldf[age].isin(["Unknown", ">32485", "<6570"])
        ]

        # age_seq_report_df[age] = \
        #     remove_greaterthan_lessthan_str(age_seq_report_df[age])

        if not all([process_functions.checkInt(i) for i in age_seq_report_df[age]]):
            total_error.write(
                "Sample Clinical File: Please double check your "
                "AGE_AT_SEQ_REPORT. It must be an integer, 'Unknown', "
                "'>32485', '<6570'.\n"
            )
        else:
            age_seq_report_df[age] = age_seq_report_df[age].astype(int)
            median_age = age_seq_report_df[age].median()
            if median_age < 100:
                total_error.write(
                    "Sample Clinical File: Please double check your "
                    "AGE_AT_SEQ_REPORT. You may be reporting this value "
                    "in YEARS, please report in DAYS.\n"
                )
    else:
        total_error.write(
            "Sample Clinical File: Must have AGE_AT_SEQ_REPORT column.\n"
        )

    # CHECK: ONCOTREE_CODE
    haveColumn = process_functions.checkColExist(clinicaldf, "ONCOTREE_CODE")
    maleOncoCodes = ["TESTIS", "PROSTATE", "PENIS"]
    womenOncoCodes = ["CERVIX", "VULVA", "UTERUS", "OVARY"]
    if haveColumn:
        # Make oncotree codes uppercase (SpCC/SPCC)
        clinicaldf["ONCOTREE_CODE"] = (
            clinicaldf["ONCOTREE_CODE"].astype(str).str.upper()
        )

        oncotree_codes = clinicaldf["ONCOTREE_CODE"][
            clinicaldf["ONCOTREE_CODE"] != "UNKNOWN"
        ]

        if not all(oncotree_codes.isin(oncotree_mapping["ONCOTREE_CODE"])):
            unmapped_oncotrees = oncotree_codes[
                ~oncotree_codes.isin(oncotree_mapping["ONCOTREE_CODE"])
            ]
            total_error.write(
                "Sample Clinical File: Please double check that all your "
                "ONCOTREE CODES exist in the mapping. You have {} samples "
                "that don't map. These are the codes that "
                "don't map: {}\n".format(
                    len(unmapped_oncotrees),
                    ",".join(set(unmapped_oncotrees)),
                )
            )
        # Should add the SEX mismatch into the dashboard file
        if (
            process_functions.checkColExist(clinicaldf, "SEX")
            and "oncotree_mapping_dict" in locals()
            and havePatientColumn
            and haveSampleColumn
        ):
            wrongCodeSamples = []
            # This is to check if oncotree codes match the sex,
            # returns list of samples that have conflicting codes and sex
            for code, patient, sample in zip(
                clinicaldf["ONCOTREE_CODE"],
                clinicaldf["PATIENT_ID"],
                clinicaldf["SAMPLE_ID"],
            ):
                if (
                    oncotree_mapping_dict.get(code) is not None
                    and sum(clinicaldf["PATIENT_ID"] == patient) > 0
                ):
                    primaryCode = oncotree_mapping_dict[code][
                        "ONCOTREE_PRIMARY_NODE"
                    ]

                    sex = clinicaldf["SEX"][
                        clinicaldf["PATIENT_ID"] == patient
                    ].values[0]
                    sex = float("nan") if sex == "" else float(sex)
                    if (
                        oncotree_mapping_dict[code]["ONCOTREE_PRIMARY_NODE"]
                        in maleOncoCodes
                        and sex != 1.0
                    ):
                        wrongCodeSamples.append(sample)
                    if (
                        oncotree_mapping_dict[code]["ONCOTREE_PRIMARY_NODE"]
                        in womenOncoCodes
                        and sex != 2.0
                    ):
                        wrongCodeSamples.append(sample)
            if len(wrongCodeSamples) > 0:
                warning.write(
                    "Sample Clinical File: Some SAMPLE_IDs have "
                    "conflicting SEX and ONCOTREE_CODES: {}\n".format(
                        ",".join(wrongCodeSamples)
                    )
                )
    else:
        total_error.write("Sample Clinical File: Must have ONCOTREE_CODE column.\n")

    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "SAMPLE_TYPE",
        sampletype_mapping["CODE"].tolist(),
        "Sample Clinical File",
        required=True,
    )
    total_error.write(error)

    # CHECK: SEQ_ASSAY_ID
    haveColumn = process_functions.checkColExist(clinicaldf, "SEQ_ASSAY_ID")
    if haveColumn:
        if not all([i != "" for i in clinicaldf["SEQ_ASSAY_ID"]]):
            total_error.write(
                "Sample Clinical File: Please double check your "
                "SEQ_ASSAY_ID columns, there are empty rows.\n"
            )
        # must remove empty seq assay ids first
        # Checking if seq assay ids start with the center name
        empty_seq_idx = clinicaldf.SEQ_ASSAY_ID != ""
        seqassay_ids = clinicaldf.SEQ_ASSAY_ID[empty_seq_idx]
        uniq_seqassay_ids = seqassay_ids.unique()
        invalid_seqassay = []
        for seqassay in uniq_seqassay_ids:
            # SEQ Ids are all capitalized now, so no need to check
            # for differences in case
            if not seqassay.upper().startswith(self.center):
                invalid_seqassay.append(seqassay)
        if invalid_seqassay:
            total_error.write(
                "Sample Clinical File: Please make sure your "
                "SEQ_ASSAY_IDs start with your center "
                "abbreviation: {}.\n".format(", ".join(invalid_seqassay))
            )
    else:
        total_error.write("Sample Clinical File: Must have SEQ_ASSAY_ID column.\n")

    haveColumn = process_functions.checkColExist(clinicaldf, "SEQ_DATE")
    seq_date_error = (
        "Sample Clinical File: SEQ_DATE must be one of five values- "
        "For Jan-March: use Jan-YEAR. "
        "For Apr-June: use Apr-YEAR. "
        "For July-Sep: use Jul-YEAR. "
        "For Oct-Dec: use Oct-YEAR. (ie. Apr-2017) "
        "For values that don't have SEQ_DATES that "
        "you want released use 'release'.\n"
    )

    if haveColumn:
        clinicaldf["SEQ_DATE"] = [
            i.title() for i in clinicaldf["SEQ_DATE"].astype(str)
        ]
        seqdate = clinicaldf["SEQ_DATE"][clinicaldf["SEQ_DATE"] != "Release"]
        if sum(clinicaldf["SEQ_DATE"] == "") > 0:
            total_error.write(
                "Sample Clinical File: Samples without SEQ_DATEs will "
                "NOT be released.\n"
            )
        try:
            if not seqdate.empty:
                seqdate.apply(
                    lambda date: datetime.datetime.strptime(date, "%b-%Y")
                )
                if not seqdate.str.startswith(("Jan", "Apr", "Jul", "Oct")).all():
                    total_error.write(seq_date_error)
        except ValueError:
            total_error.write(seq_date_error)
    else:
        total_error.write("Sample Clinical File: Must have SEQ_DATE column.\n")

    # CHECK: BIRTH_YEAR
    error = _check_year(
        clinicaldf=clinicaldf,
        year_col="BIRTH_YEAR",
        filename="Patient Clinical File",
        allowed_string_values=["Unknown", ">89", "<18"],
    )
    total_error.write(error)

    # CHECK: YEAR DEATH
    error = _check_year(
        clinicaldf=clinicaldf,
        year_col="YEAR_DEATH",
        filename="Patient Clinical File",
        allowed_string_values=[
            "Unknown",
            "Not Collected",
            "Not Applicable",
            "Not Released",
            ">89",
            "<18",
        ],
    )
    total_error.write(error)

    # CHECK: YEAR CONTACT
    error = _check_year(
        clinicaldf=clinicaldf,
        year_col="YEAR_CONTACT",
        filename="Patient Clinical File",
        allowed_string_values=[
            "Unknown",
            "Not Collected",
            "Not Released",
            ">89",
            "<18",
        ],
    )
    total_error.write(error)

    # CHECK: INT CONTACT
    haveColumn = process_functions.checkColExist(clinicaldf, "INT_CONTACT")
    if haveColumn:
        if not all(
            [
                process_functions.checkInt(i)
                for i in clinicaldf.INT_CONTACT
                if i
                not in [
                    ">32485",
                    "<6570",
                    "Unknown",
                    "Not Collected",
                    "Not Released",
                ]
            ]
        ):
            total_error.write(
                "Patient Clinical File: Please double check your "
                "INT_CONTACT column, it must be an integer, '>32485', "
                "'<6570', 'Unknown', 'Not Released' or 'Not Collected'.\n"
            )
    else:
        total_error.write("Patient Clinical File: Must have INT_CONTACT column.\n")

    # INT DOD
    haveColumn = process_functions.checkColExist(clinicaldf, "INT_DOD")
    if haveColumn:
        if not all(
            [
                process_functions.checkInt(i)
                for i in clinicaldf.INT_DOD
                if i
                not in [
                    ">32485",
                    "<6570",
                    "Unknown",
                    "Not Collected",
                    "Not Applicable",
                    "Not Released",
                ]
            ]
        ):
            total_error.write(
                "Patient Clinical File: Please double check your INT_DOD "
                "column, it must be an integer, '>32485', '<6570', "
                "'Unknown', 'Not Collected', 'Not Released' or "
                "'Not Applicable'.\n"
            )
    else:
        total_error.write("Patient Clinical File: Must have INT_DOD column.\n")

    haveColumn = process_functions.checkColExist(clinicaldf, "DEAD")
    if haveColumn:
        # Need to have check_bool function
        if not all(
            [
                str(i).upper() in ["TRUE", "FALSE"]
                for i in clinicaldf.DEAD
                if i not in ["Unknown", "Not Collected", "Not Released"]
            ]
        ):
            total_error.write(
                "Patient Clinical File: Please double check your "
                "DEAD column, it must be True, False, 'Unknown', "
                "'Not Released' or 'Not Collected'.\n"
            )
    else:
        total_error.write("Patient Clinical File: Must have DEAD column.\n")

    # CHECK: contact vital status value consistency
    contact_error = _check_int_year_consistency(
        clinicaldf=clinicaldf,
        cols=["YEAR_CONTACT", "INT_CONTACT"],
        string_vals=["Not Collected", "Unknown", "Not Released"],
    )
    total_error.write(contact_error)

    # CHECK: death vital status value consistency
    death_error = _check_int_year_consistency(
        clinicaldf=clinicaldf,
        cols=["YEAR_DEATH", "INT_DOD"],
        string_vals=[
            "Not Collected",
            "Unknown",
            "Not Applicable",
            "Not Released",
        ],
    )
    total_error.write(death_error)

    death_error = _check_int_dead_consistency(clinicaldf=clinicaldf)
    total_error.write(death_error)

    # CHECK: SAMPLE_CLASS is optional attribute
    have_column = process_functions.checkColExist(clinicaldf, "SAMPLE_CLASS")
    if have_column:
        sample_class_vals = pd.Series(clinicaldf["SAMPLE_CLASS"].unique().tolist())
        if not sample_class_vals.isin(["Tumor", "cfDNA"]).all():
            total_error.write(
                "Sample Clinical File: SAMPLE_CLASS column must "
                "be 'Tumor', or 'cfDNA'\n"
            )

    # CHECK: PRIMARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "PRIMARY_RACE",
        race_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning.write(warn)
    total_error.write(error)

    # CHECK: SECONDARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "SECONDARY_RACE",
        race_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning.write(warn)
    total_error.write(error)

    # CHECK: TERTIARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "TERTIARY_RACE",
        race_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning.write(warn)
    total_error.write(error)

    # CHECK: SEX
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "SEX",
        sex_mapping["CODE"].tolist(),
        "Patient Clinical File",
        required=True,
    )
    warning.write(warn)
    total_error.write(error)

    # CHECK: ETHNICITY
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "ETHNICITY",
        ethnicity_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning.write(warn)
    total_error.write(error)

    return total_error.getvalue(), warning.getvalue()
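# A small, self-contained illustration of the SEQ_DATE rule enforced above:
# values must parse as %b-%Y and start with a quarter month (Jan/Apr/Jul/Oct),
# or be the literal 'Release'. The sample values below are invented, and the
# per-value helper is only a sketch of the series-level check in the validator.
import datetime
import pandas as pd


def is_valid_seq_date(value):
    try:
        datetime.datetime.strptime(value, "%b-%Y")
    except ValueError:
        return False
    return value.startswith(("Jan", "Apr", "Jul", "Oct"))


seq_dates = pd.Series(["Jan-2017", "Apr-2018", "Release", "Feb-2019", "2019-04"])
seqdate = seq_dates[seq_dates != "Release"]
# -> ['Feb-2019', '2019-04'] would trigger the seq_date_error message above.
print([v for v in seqdate if not is_valid_seq_date(v)])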
def _validate(self, clinicalDF, oncotreeLink):
    """
    This function validates the clinical file to make sure it adheres
    to the clinical SOP.

    Args:
        clinicalDF: Merged clinical file with patient and sample
                    information
        oncotreeLink: Link to oncotree

    Returns:
        Error message
    """
    total_error = ""
    warning = ""

    clinicalDF.columns = [col.upper() for col in clinicalDF.columns]
    clinicalDF = clinicalDF.fillna("")

    # oncotree_mapping = process_functions.get_oncotree_codes(oncotreeLink)
    # if oncotree_mapping.empty:
    oncotree_mapping = pd.DataFrame()
    oncotree_mapping_dict = \
        process_functions.get_oncotree_code_mappings(oncotreeLink)
    oncotree_mapping['ONCOTREE_CODE'] = oncotree_mapping_dict.keys()

    sampleType_mapping = \
        process_functions.getGenieMapping(self.syn, "syn7434273")

    ethnicity_mapping = \
        process_functions.getGenieMapping(self.syn, "syn7434242")

    race_mapping = \
        process_functions.getGenieMapping(self.syn, "syn7434236")

    sex_mapping = \
        process_functions.getGenieMapping(self.syn, "syn7434222")

    # CHECK: SAMPLE_ID
    sampleId = 'SAMPLE_ID'
    haveSampleColumn = \
        process_functions.checkColExist(clinicalDF, sampleId)

    if not haveSampleColumn:
        total_error += \
            "Sample Clinical File: Must have SAMPLE_ID column.\n"
    else:
        if sum(clinicalDF[sampleId].duplicated()) > 0:
            total_error += (
                "Sample Clinical File: No duplicated SAMPLE_ID "
                "allowed.\nIf there are no duplicated "
                "SAMPLE_IDs, and both sample and patient files are "
                "uploaded, then please check to make sure no duplicated "
                "PATIENT_IDs exist in the patient clinical file.\n")

    # CHECK: PATIENT_ID
    patientId = "PATIENT_ID"
    # #CHECK: PATIENT_ID IN SAMPLE FILE
    havePatientColumn = \
        process_functions.checkColExist(clinicalDF, patientId)

    if not havePatientColumn:
        total_error += \
            "Patient Clinical File: Must have PATIENT_ID column.\n"

    # CHECK: within the sample file that the sample ids match
    # the patient ids
    if haveSampleColumn and havePatientColumn:
        # Make sure sample and patient ids are string cols
        clinicalDF[sampleId] = clinicalDF[sampleId].astype(str)
        clinicalDF[patientId] = clinicalDF[patientId].astype(str)
        if not all([
                patient in sample
                for sample, patient in
                zip(clinicalDF[sampleId], clinicalDF[patientId])]):
            total_error += (
                "Sample Clinical File: PATIENT_ID's must be contained in "
                "the SAMPLE_ID's (ex. SAGE-1 <-> SAGE-1-2)\n")
        # #CHECK: All samples must have associated patient data
        # (GENIE requires patient data)
        if not all(clinicalDF[patientId] != ""):
            total_error += (
                "Patient Clinical File: All samples must have associated "
                "patient information and no null patient ids allowed. "
                "These samples are missing patient data: {}\n".format(
                    ", ".join(
                        clinicalDF[sampleId][clinicalDF[patientId] == ""])))
        # CHECK: All patients should have associated sample data
        if not all(clinicalDF[sampleId] != ""):
            # ## MAKE WARNING FOR NOW###
            warning += (
                "Sample Clinical File: All patients must have associated "
                "sample information. These patients are missing sample "
                "data: {}\n".format(", ".join(
                    clinicalDF[patientId][clinicalDF[sampleId] == ""])))

    # CHECK: AGE_AT_SEQ_REPORT
    age = "AGE_AT_SEQ_REPORT"
    haveColumn = process_functions.checkColExist(clinicalDF, age)
    if haveColumn:
        # Deal with HIPAA converted rows from DFCI
        # First for loop can't int(text) because there
        # are instances that have <3435
        age_seq_report_df = \
            clinicalDF[~clinicalDF[age].isin(['Unknown'])]
        age_seq_report_df[age] = \
            remove_greaterthan_lessthan_str(age_seq_report_df[age])

        if not all([
                process_functions.checkInt(i)
                for i in age_seq_report_df[age]]):
            total_error += (
                "Sample Clinical File: Please double check your "
                "AGE_AT_SEQ_REPORT. It must be an integer or 'Unknown'.\n")
        else:
            age_seq_report_df[age] = age_seq_report_df[age].astype(int)
            median_age = pd.np.median(age_seq_report_df[age])
            if median_age < 100:
                total_error += (
                    "Sample Clinical File: Please double check your "
                    "AGE_AT_SEQ_REPORT. You may be reporting this value "
                    "in YEARS, please report in DAYS.\n")
    else:
        total_error += \
            "Sample Clinical File: Must have AGE_AT_SEQ_REPORT column.\n"

    # CHECK: ONCOTREE_CODE
    haveColumn = \
        process_functions.checkColExist(clinicalDF, "ONCOTREE_CODE")
    maleOncoCodes = ["TESTIS", "PROSTATE", "PENIS"]
    womenOncoCodes = ["CERVIX", "VULVA", "UTERUS", "OVARY"]
    if haveColumn:
        # Make oncotree codes uppercase (SpCC/SPCC)
        clinicalDF['ONCOTREE_CODE'] = \
            clinicalDF['ONCOTREE_CODE'].astype(str).str.upper()

        oncotree_codes = clinicalDF['ONCOTREE_CODE'][
            clinicalDF['ONCOTREE_CODE'] != "UNKNOWN"]

        if not all(oncotree_codes.isin(oncotree_mapping['ONCOTREE_CODE'])):
            unmapped_oncotrees = oncotree_codes[
                ~oncotree_codes.isin(oncotree_mapping['ONCOTREE_CODE'])]
            total_error += (
                "Sample Clinical File: Please double check that all your "
                "ONCOTREE CODES exist in the mapping. You have {} samples "
                "that don't map. These are the codes that "
                "don't map: {}\n".format(
                    len(unmapped_oncotrees),
                    ",".join(set(unmapped_oncotrees))))

        if process_functions.checkColExist(clinicalDF, "SEX") and \
                'oncotree_mapping_dict' in locals() and \
                havePatientColumn and \
                haveSampleColumn:
            wrongCodeSamples = []
            # This is to check if oncotree codes match the sex,
            # returns list of samples that have conflicting codes and sex
            for code, patient, sample in zip(clinicalDF['ONCOTREE_CODE'],
                                             clinicalDF['PATIENT_ID'],
                                             clinicalDF['SAMPLE_ID']):
                if oncotree_mapping_dict.get(code) is not None and \
                        sum(clinicalDF['PATIENT_ID'] == patient) > 0:

                    primaryCode = oncotree_mapping_dict[code][
                        'ONCOTREE_PRIMARY_NODE']

                    sex = clinicalDF['SEX'][clinicalDF['PATIENT_ID'] ==
                                            patient].values[0]
                    sex = float('nan') if sex == '' else float(sex)
                    if oncotree_mapping_dict[code][
                            'ONCOTREE_PRIMARY_NODE'] in maleOncoCodes and \
                            sex != 1.0:
                        wrongCodeSamples.append(sample)
                    if oncotree_mapping_dict[code][
                            'ONCOTREE_PRIMARY_NODE'] in womenOncoCodes and\
                            sex != 2.0:
                        wrongCodeSamples.append(sample)
            if len(wrongCodeSamples) > 0:
                warning += (
                    "Sample Clinical File: Some SAMPLE_IDs have "
                    "conflicting SEX and ONCOTREE_CODES: {}\n".format(
                        ",".join(wrongCodeSamples)))
    else:
        total_error += \
            "Sample Clinical File: Must have ONCOTREE_CODE column.\n"

    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "SAMPLE_TYPE",
        sampleType_mapping['CODE'].tolist(),
        "Sample Clinical File",
        required=True)
    total_error += error

    # CHECK: SEQ_ASSAY_ID
    haveColumn = \
        process_functions.checkColExist(clinicalDF, "SEQ_ASSAY_ID")
    if haveColumn:
        if not all([i != "" for i in clinicalDF['SEQ_ASSAY_ID']]):
            total_error += (
                "Sample Clinical File: Please double check your "
                "SEQ_ASSAY_ID columns, there are empty rows.\n")
        # must remove empty seq assay ids first
        # Checking if seq assay ids start with the center name
        seqAssayIds = \
            clinicalDF.SEQ_ASSAY_ID[clinicalDF.SEQ_ASSAY_ID != ""]
        allSeqAssays = seqAssayIds.unique()
        notNormalized = []
        not_caps = []
        for seqassay in allSeqAssays:
            # SEQ Ids are all capitalized now, so no need to check
            # for differences in case
            if not seqassay.upper().startswith(self.center):
                not_caps.append(seqassay)
        if len(not_caps) > 0:
            total_error += ("Sample Clinical File: Please make sure your "
                            "SEQ_ASSAY_IDs start with your center "
                            "abbreviation: {}.\n".format(
                                ", ".join(not_caps)))
    else:
        total_error += \
            "Sample Clinical File: Must have SEQ_ASSAY_ID column.\n"

    haveColumn = process_functions.checkColExist(clinicalDF, "SEQ_DATE")
    seq_date_error = (
        "Sample Clinical File: SEQ_DATE must be one of five values- "
        "For Jan-March: use Jan-YEAR. "
        "For Apr-June: use Apr-YEAR. "
        "For July-Sep: use Jul-YEAR. "
        "For Oct-Dec: use Oct-YEAR. (ie. Apr-2017) "
        "For values that don't have SEQ_DATES that "
        "you want released use 'release'.\n")

    if haveColumn:
        clinicalDF['SEQ_DATE'] = [
            i.title() for i in clinicalDF['SEQ_DATE'].astype(str)
        ]
        seqDate = clinicalDF['SEQ_DATE'][
            clinicalDF['SEQ_DATE'] != 'Release']
        if sum(clinicalDF['SEQ_DATE'] == '') > 0:
            total_error += (
                "Sample Clinical File: Samples without SEQ_DATEs will "
                "NOT be released.\n")
        try:
            if not seqDate.empty:
                dates = seqDate.apply(
                    lambda date: datetime.datetime.strptime(date, '%b-%Y'))
                # REMOVE JUN LATER
                if not all([
                        i.startswith(("Jul", "Jan", "Oct", "Apr"))
                        for i in seqDate]):
                    total_error += seq_date_error
        except ValueError:
            total_error += seq_date_error
    else:
        total_error += "Sample Clinical File: Must have SEQ_DATE column.\n"

    # CHECK: BIRTH_YEAR
    birth_year = "BIRTH_YEAR"
    haveColumn = process_functions.checkColExist(clinicalDF, birth_year)
    if haveColumn:
        birth_year_df = \
            clinicalDF[~clinicalDF[birth_year].isin(['Unknown'])]
        # Deal with HIPAA converted rows from DFCI
        # First for loop can't int(text) because there are
        # instances that have <YYYY
        birth_year_df[birth_year] = \
            remove_greaterthan_lessthan_str(birth_year_df[birth_year])
        try:
            years = birth_year_df[birth_year].apply(
                lambda x: datetime.datetime.strptime(
                    str(int(x)), '%Y').year >
                datetime.datetime.utcnow().year)
            assert not years.any()
        except Exception:
            total_error += (
                "Patient Clinical File: Please double check your "
                "BIRTH_YEAR column, it must be an integer in YYYY format "
                "> {year} or 'Unknown'.\n".format(
                    year=datetime.datetime.utcnow().year))
    else:
        total_error += \
            "Patient Clinical File: Must have BIRTH_YEAR column.\n"

    # CHECK: VITAL_STATUS
    # YEAR DEATH
    haveColumn = process_functions.checkColExist(clinicalDF, "YEAR_DEATH")
    if haveColumn:
        notNullYears = clinicalDF.YEAR_DEATH[~clinicalDF.YEAR_DEATH.isin(
            ['Unknown', 'Not Collected', 'Not Applicable'])]
        try:
            notNullYears.apply(
                lambda x: datetime.datetime.strptime(str(int(x)), '%Y'))
        except Exception:
            total_error += (
                "Patient Clinical File: Please double check your "
                "YEAR_DEATH column, it must be an integer in YYYY format, "
                "'Unknown', 'Not Applicable' or 'Not Collected'.\n")
    else:
        total_error += \
            "Patient Clinical File: Must have YEAR_DEATH column.\n"

    # YEAR CONTACT
    haveColumn = process_functions.checkColExist(clinicalDF, "YEAR_CONTACT")
    if haveColumn:
        notNullYears = clinicalDF.YEAR_CONTACT[
            ~clinicalDF.YEAR_CONTACT.isin(['Unknown', 'Not Collected'])]
        try:
            notNullYears.apply(
                lambda x: datetime.datetime.strptime(str(int(x)), '%Y'))
        except Exception:
            total_error += (
                "Patient Clinical File: Please double check your "
                "YEAR_CONTACT column, it must be an integer in YYYY "
                "format, 'Unknown' or 'Not Collected'.\n")
    else:
        total_error += \
            "Patient Clinical File: Must have YEAR_CONTACT column.\n"

    # INT CONTACT
    haveColumn = process_functions.checkColExist(clinicalDF, "INT_CONTACT")
    if haveColumn:
        if not all([
                process_functions.checkInt(i)
                for i in clinicalDF.INT_CONTACT
                if i not in
                ['>32485', '<6570', 'Unknown', 'Not Collected']]):
            total_error += (
                "Patient Clinical File: Please double check your "
                "INT_CONTACT column, it must be an integer, '>32485', "
                "'<6570', 'Unknown' or 'Not Collected'.\n")
    else:
        total_error += \
            "Patient Clinical File: Must have INT_CONTACT column.\n"

    # INT DOD
    haveColumn = process_functions.checkColExist(clinicalDF, "INT_DOD")
    if haveColumn:
        if not all([
                process_functions.checkInt(i)
                for i in clinicalDF.INT_DOD
                if i not in [
                    '>32485', '<6570', 'Unknown',
                    'Not Collected', 'Not Applicable']]):
            total_error += (
                "Patient Clinical File: Please double check your INT_DOD "
                "column, it must be an integer, '>32485', '<6570', "
                "'Unknown', 'Not Collected' or 'Not Applicable'.\n")
    else:
        total_error += \
            "Patient Clinical File: Must have INT_DOD column.\n"

    haveColumn = process_functions.checkColExist(clinicalDF, "DEAD")
    if haveColumn:
        # Need to have check_bool function
        if not all([
                str(i).upper() in ['TRUE', 'FALSE']
                for i in clinicalDF.DEAD
                if i not in ['Unknown', 'Not Collected']]):
            total_error += (
                "Patient Clinical File: Please double check your "
                "DEAD column, it must be True, False, 'Unknown' or "
                "'Not Collected'.\n")
    else:
        total_error += \
            "Patient Clinical File: Must have DEAD column.\n"

    # CHECK: PRIMARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "PRIMARY_RACE",
        race_mapping['CODE'].tolist(),
        "Patient Clinical File")
    warning += warn
    total_error += error

    # CHECK: SECONDARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "SECONDARY_RACE",
        race_mapping['CODE'].tolist(),
        "Patient Clinical File")
    warning += warn
    total_error += error

    # CHECK: TERTIARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "TERTIARY_RACE",
        race_mapping['CODE'].tolist(),
        "Patient Clinical File")
    warning += warn
    total_error += error

    # CHECK: SEX
    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "SEX",
        sex_mapping['CODE'].tolist(),
        "Patient Clinical File",
        required=True)
    warning += warn
    total_error += error

    # CHECK: ETHNICITY
    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "ETHNICITY",
        ethnicity_mapping['CODE'].tolist(),
        "Patient Clinical File")
    warning += warn
    total_error += error

    return (total_error, warning)
def _validate(self, assay_info_df, project_id):
    """
    Validates the values of assay information file

    Args:
        assay_info_df: assay information dataframe

    Returns:
        tuple: error and warning
    """
    total_error = ""
    warning = ""

    if process_functions.checkColExist(assay_info_df, "SEQ_ASSAY_ID"):
        all_seq_assays = (
            assay_info_df.SEQ_ASSAY_ID.replace({"_": "-"}, regex=True)
            .str.upper()
            .unique()
        )
        if not all(
                [assay.startswith(self.center) for assay in all_seq_assays]):
            total_error += (
                "Assay_information.yaml: Please make sure all your "
                "SEQ_ASSAY_IDs start with your center abbreviation.\n")
        db_to_syn_map_df = process_functions.get_synid_database_mappingdf(
            self.syn, project_id)
        sample_synid = process_functions.getDatabaseSynId(
            self.syn, "sample", databaseToSynIdMappingDf=db_to_syn_map_df)
        uniq_seq_df = process_functions.get_syntabledf(
            self.syn,
            f"select distinct(SEQ_ASSAY_ID) as seq from {sample_synid} "
            f"where CENTER = '{self.center}'",
        )
        # These are all the SEQ_ASSAY_IDs that are in the clinical database
        # but not in the assay_information file
        missing_seqs = uniq_seq_df["seq"][
            ~uniq_seq_df["seq"]
            .replace({"_": "-"}, regex=True)
            .str.upper()
            .isin(all_seq_assays)
        ]
        missing_seqs_str = ", ".join(missing_seqs)
        if missing_seqs.to_list():
            total_error += (
                "Assay_information.yaml: You are missing SEQ_ASSAY_IDs: "
                f"{missing_seqs_str}\n")
    else:
        total_error += "Assay_information.yaml: Must have SEQ_ASSAY_ID column.\n"

    read_group_dict = process_functions.get_gdc_data_dictionary("read_group")
    read_group_headers = read_group_dict["properties"]

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "is_paired_end",
        [True, False],
        filename="Assay_information.yaml",
        required=True,
    )
    warning += warn
    total_error += error

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "library_selection",
        read_group_headers["library_selection"]["enum"],
        filename="Assay_information.yaml",
        required=True,
    )
    warning += warn
    total_error += error

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "library_strategy",
        read_group_headers["library_strategy"]["enum"],
        filename="Assay_information.yaml",
        required=True,
    )
    warning += warn
    total_error += error

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "platform",
        read_group_headers["platform"]["enum"],
        filename="Assay_information.yaml",
        required=True,
    )
    warning += warn
    total_error += error

    instrument_model = read_group_headers["instrument_model"]["enum"]
    instrument_model.extend(["Illumina NovaSeq 6000", None])
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "instrument_model",
        instrument_model,
        filename="Assay_information.yaml",
        required=True,
    )
    warning += warn
    total_error += error

    # target_capture_kit = read_group_headers['target_capture_kit']['enum']
    # warn, error = process_functions.check_col_and_values(
    #     assay_info_df,
    #     'target_capture_kit',
    #     target_capture_kit,
    #     filename="Assay_information.yaml",
    #     required=True)
    # warning += warn
    # total_error += error

    if not process_functions.checkColExist(assay_info_df, "target_capture_kit"):
        total_error += ("Assay_information.yaml: "
                        "Must have target_capture_kit column.\n")

    variant_classes = [
        "Splice_Site",
        "Nonsense_Mutation",
        "Frame_Shift_Del",
        "Frame_Shift_Ins",
        "Nonstop_Mutation",
        "Translation_Start_Site",
        "In_Frame_Ins",
        "In_Frame_Del",
        "Missense_Mutation",
        "Intron",
        "Splice_Region",
        "Silent",
        "RNA",
        "5'UTR",
        "3'UTR",
        "IGR",
        "5'Flank",
        "3'Flank",
        None,
    ]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "variant_classifications",
        variant_classes,
        filename="Assay_information.yaml",
        na_allowed=True,
        sep=";",
    )
    warning += warn
    total_error += error

    if process_functions.checkColExist(assay_info_df, "read_length"):
        if not all([
                process_functions.checkInt(i)
                for i in assay_info_df["read_length"]
                if i is not None and not pd.isnull(i)]):
            total_error += ("Assay_information.yaml: "
                            "Please double check your read_length. "
                            "It must be an integer or null.\n")
    else:
        total_error += ("Assay_information.yaml: "
                        "Must have read_length column.\n")

    if process_functions.checkColExist(assay_info_df, "number_of_genes"):
        if not all([
                process_functions.checkInt(i)
                for i in assay_info_df["number_of_genes"]]):
            total_error += ("Assay_information.yaml: "
                            "Please double check your number_of_genes. "
                            "It must be an integer.\n")
    else:
        total_error += ("Assay_information.yaml: "
                        "Must have number_of_genes column.\n")

    if process_functions.checkColExist(assay_info_df, "gene_padding"):
        if not all([
                process_functions.checkInt(i)
                for i in assay_info_df["gene_padding"]
                if i is not None and not pd.isnull(i)]):
            total_error += ("Assay_information.yaml: "
                            "Please double check your gene_padding. "
                            "It must be an integer or blank.\n")
    else:
        warning += ("Assay_information.yaml: "
                    "gene_padding is by default 10 if not specified.\n")

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "calling_strategy",
        ["tumor_only", "tumor_normal", "plasma_normal"],
        filename="Assay_information.yaml",
        required=True,
    )
    warning += warn
    total_error += error

    if process_functions.checkColExist(assay_info_df, "specimen_tumor_cellularity"):
        if not all([
                i.startswith(">") and i.endswith("%")
                for i in assay_info_df["specimen_tumor_cellularity"]]):
            total_error += (
                "Assay_information.yaml: "
                "Please double check your specimen_tumor_cellularity. "
                "It must be in this format >(num)%. ie. >10%\n")
    else:
        total_error += ("Assay_information.yaml: "
                        "Must have specimen_tumor_cellularity column.\n")

    alteration_types = [
        "snv",
        "small_indels",
        "gene_level_cna",
        "intragenic_cna",
        "structural_variants",
    ]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "alteration_types",
        alteration_types,
        filename="Assay_information.yaml",
        required=True,
        sep=";",
    )
    warning += warn
    total_error += error

    preservation_technique = ["FFPE", "fresh_frozen", "NA"]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "preservation_technique",
        preservation_technique,
        filename="Assay_information.yaml",
        required=True,
        sep=";",
    )
    warning += warn
    total_error += error

    coverage = ["hotspot_regions", "coding_exons", "introns", "promoters"]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "coverage",
        coverage,
        filename="Assay_information.yaml",
        required=True,
        sep=";",
    )
    warning += warn
    total_error += error

    return total_error, warning
def _validate(self, beddf):
    """
    Validate bed file

    Args:
        bed: Bed dataframe

    Returns:
        total_error: all the errors
        warning: all the warnings
    """
    total_error = ""
    warning = ""
    newcols = [
        "Chromosome",
        "Start_Position",
        "End_Position",
        "Hugo_Symbol",
        "includeInPanel",
    ]
    if len(beddf.columns) < len(newcols):
        total_error += (
            "BED file: Must at least have five columns in this "
            "order: {}. Make sure there are "
            "no headers.\n".format(", ".join(newcols)))
    else:
        newcols.extend(range(0, len(beddf.columns) - len(newcols)))
        beddf.columns = newcols
        to_validate_symbol = True
        if not all(beddf["Start_Position"].apply(
                lambda x: isinstance(x, int))):
            total_error += ("BED file: "
                            "The Start_Position column must only be "
                            "integers. Make sure there are no headers.\n")
            to_validate_symbol = False
        if not all(
                beddf["End_Position"].apply(lambda x: isinstance(x, int))):
            total_error += ("BED file: "
                            "The End_Position column must only be "
                            "integers. Make sure there are no headers.\n")
            to_validate_symbol = False

        LOGGER.info("VALIDATING GENE SYMBOLS")
        if any(beddf["Hugo_Symbol"].isnull()):
            total_error += "BED file: You cannot submit any null symbols.\n"
        beddf = beddf[~beddf["Hugo_Symbol"].isnull()]
        beddf["Hugo_Symbol"] = [
            str(hugo).split(";")[0].split("_")[0].split(":")[0]
            for hugo in beddf["Hugo_Symbol"]
        ]
        if (sum(beddf["Hugo_Symbol"] == "+") != 0
                or sum(beddf["Hugo_Symbol"] == "-") != 0):
            total_error += ("BED file: Fourth column must be the "
                            "Hugo_Symbol column, not the strand column\n")

        warn, error = process_functions.check_col_and_values(
            beddf,
            "includeInPanel",
            [True, False],
            filename="BED file",
            required=True,
        )
        warning += warn
        total_error += error

        if to_validate_symbol:
            gene_position_table = self.syn.tableQuery(
                "SELECT * FROM syn11806563")
            gene_positiondf = gene_position_table.asDataFrame()
            # The apply function of a DataFrame is called twice on the
            # first row (known pandas behavior)
            beddf = beddf.apply(
                lambda x: remap_symbols(x, gene_positiondf), axis=1)

            if any(beddf["Hugo_Symbol"].isnull()):
                warning += ("BED file: "
                            "Any gene names that can't be "
                            "remapped will be null.\n")

            if all(beddf["Hugo_Symbol"].isnull()):
                total_error += ("BED file: "
                                "You have no correct gene symbols. "
                                "Make sure your gene symbol column (4th "
                                "column) is formatted like so: SYMBOL"
                                "(;optionaltext). Optional text can be "
                                "semi-colon separated.\n")

    return (total_error, warning)
def _validate(self, bed):
    '''
    Validate bed file

    Args:
        bed: Bed dataframe

    Returns:
        total_error: all the errors
        warning: all the warnings
    '''
    total_error = ""
    warning = ""
    newCols = [
        "Chromosome", "Start_Position", "End_Position",
        "Hugo_Symbol", "includeInPanel"
    ]
    if len(bed.columns) < len(newCols):
        total_error += (
            "BED file: Must at least have five columns in this "
            "order: {}. Make sure there are "
            "no headers.\n".format(", ".join(newCols)))
    else:
        newCols.extend(range(0, len(bed.columns) - len(newCols)))
        bed.columns = newCols
        toValidateSymbol = True
        if not all(
                bed['Start_Position'].apply(lambda x: isinstance(x, int))):
            total_error += (
                "BED file: "
                "The Start_Position column must only be integers. "
                "Make sure there are no headers.\n")
            toValidateSymbol = False
        if not all(
                bed['End_Position'].apply(lambda x: isinstance(x, int))):
            total_error += (
                "BED file: "
                "The End_Position column must only be integers. "
                "Make sure there are no headers.\n")
            toValidateSymbol = False

        logger.info("VALIDATING GENE SYMBOLS")
        if any(bed['Hugo_Symbol'].isnull()):
            total_error += \
                "BED file: You cannot submit any null symbols.\n"
        bed = bed[~bed['Hugo_Symbol'].isnull()]
        bed["Hugo_Symbol"] = [
            str(hugo).split(";")[0].split("_")[0].split(":")[0]
            for hugo in bed["Hugo_Symbol"]
        ]
        if sum(bed['Hugo_Symbol'] == "+") != 0 or \
                sum(bed['Hugo_Symbol'] == "-") != 0:
            total_error += (
                "BED file: Fourth column must be the Hugo_Symbol column, "
                "not the strand column\n")

        warn, error = process_functions.check_col_and_values(
            bed,
            'includeInPanel',
            [True, False],
            filename="BED file",
            required=True)
        warning += warn
        total_error += error

        if toValidateSymbol:
            genePosition = self.syn.tableQuery('SELECT * FROM syn11806563')
            genePositionDf = genePosition.asDataFrame()
            bed = bed.apply(lambda x: validateSymbol(
                x, genePositionDf, returnMappedDf=True), axis=1)

            if any(bed['Hugo_Symbol'].isnull()):
                warning += ("BED file: "
                            "Any gene names that can't be "
                            "remapped will be null.\n")

            if all(bed['Hugo_Symbol'].isnull()):
                total_error += (
                    "BED file: "
                    "You have no correct gene symbols. "
                    "Make sure your gene symbol column (4th column) "
                    "is formatted like so: SYMBOL(;optionaltext). "
                    "Optional text can be semi-colon separated.\n")

    return (total_error, warning)
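# Standalone illustration of the Hugo_Symbol clean-up performed in both BED
# validators above: keep only the token before any ';', '_' or ':' so entries
# like "TP53;NM_000546" reduce to "TP53". The example symbols are invented.
import pandas as pd

bed_example = pd.DataFrame(
    {"Hugo_Symbol": ["TP53;NM_000546", "BRCA1_exon2", "EGFR:1", None]}
)
bed_example = bed_example[~bed_example["Hugo_Symbol"].isnull()]
bed_example["Hugo_Symbol"] = [
    str(hugo).split(";")[0].split("_")[0].split(":")[0]
    for hugo in bed_example["Hugo_Symbol"]
]
# -> ['TP53', 'BRCA1', 'EGFR']
print(bed_example["Hugo_Symbol"].tolist())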