def _process(self, df):
    """
    Process assay information.

    - Standardizes SEQ_ASSAY_ID (upper-cased, "_" replaced with "-")
    - Defaults gene_padding to 10, both for a missing column and for
      missing values, and casts the column to int
    - Adds an all-NaN variant_classifications column when it is absent
    - Sets the CENTER column to ``self.center``

    Args:
        df: Assay information dataframe (modified in place)

    Returns:
        dataframe: Processed dataframe
    """
    df['SEQ_ASSAY_ID'] = [
        assay.upper().replace('_', '-') for assay in df['SEQ_ASSAY_ID']
    ]
    if process_functions.checkColExist(df, "gene_padding"):
        df['gene_padding'] = df['gene_padding'].fillna(10)
        df['gene_padding'] = df['gene_padding'].astype(int)
    else:
        df['gene_padding'] = 10

    if not process_functions.checkColExist(df, "variant_classifications"):
        # Use the builtin NaN: the `pd.np` alias was deprecated in
        # pandas 1.0 and removed in pandas 2.0.
        df['variant_classifications'] = float('nan')

    df['CENTER'] = self.center
    return df
def _validate(self, clinicalDF):
    """
    Validate a non-GENIE clinical dataframe.

    Column names are upper-cased in place before checking.
    SAMPLE_ID and PATIENT_ID must exist and contain no nulls (errors).
    SEQ_ASSAY_ID must exist (error); null values in it only warn.

    Args:
        clinicalDF: Clinical dataframe (its columns are renamed in place)

    Returns:
        tuple: (error message, warning message)
    """
    clinicalDF.columns = [col.upper() for col in clinicalDF.columns]
    error_parts = []
    warning_parts = []

    # CHECK: SAMPLE_ID
    if not process_functions.checkColExist(clinicalDF, 'SAMPLE_ID'):
        error_parts.append(
            "nonGENIE_data_clinical.txt: File must have SAMPLE_ID column.\n"
        )
    elif clinicalDF['SAMPLE_ID'].isnull().any():
        error_parts.append(
            "nonGENIE_data_clinical.txt: "
            "There can't be any blank values for SAMPLE_ID\n"
        )

    # CHECK: SEQ_ASSAY_ID (blank values only produce a warning)
    if not process_functions.checkColExist(clinicalDF, "SEQ_ASSAY_ID"):
        error_parts.append(
            "nonGENIE_data_clinical.txt: File must have SEQ_ASSAY_ID column.\n"
        )
    elif clinicalDF['SEQ_ASSAY_ID'].isnull().any():
        warning_parts.append(
            "nonGENIE_data_clinical.txt: Please double check your "
            "SEQ_ASSAY_ID columns, there are empty rows.\n"
        )

    # CHECK: PATIENT_ID
    if not process_functions.checkColExist(clinicalDF, 'PATIENT_ID'):
        error_parts.append(
            "nonGENIE_data_clinical.txt: File must have PATIENT_ID column.\n"
        )
    elif clinicalDF['PATIENT_ID'].isnull().any():
        error_parts.append(
            "nonGENIE_data_clinical.txt: "
            "There can't be any blank values for PATIENT_ID\n"
        )

    return ("".join(error_parts), "".join(warning_parts))
def _process(self, df):
    """
    Process assay_information.yaml.

    Upper-cases SEQ_ASSAY_ID and SEQ_PIPELINE_ID and swaps "_" for "-",
    defaults gene_padding to 10, adds an all-NaN variant_classifications
    column when it is missing, and stamps CENTER with ``self.center``.

    Args:
        df: Assay information dataframe (modified in place)

    Returns:
        dataframe: Processed dataframe
    """
    # Same normalisation for both identifier columns, in this order.
    for id_col in ("SEQ_ASSAY_ID", "SEQ_PIPELINE_ID"):
        df[id_col] = [
            identifier.upper().replace("_", "-") for identifier in df[id_col]
        ]

    has_padding = process_functions.checkColExist(df, "gene_padding")
    if not has_padding:
        df["gene_padding"] = 10
    else:
        # Fill first, then cast: NaN cannot be converted to int.
        filled_padding = df["gene_padding"].fillna(10)
        df["gene_padding"] = filled_padding
        df["gene_padding"] = filled_padding.astype(int)

    has_classifications = process_functions.checkColExist(
        df, "variant_classifications"
    )
    if not has_classifications:
        df["variant_classifications"] = float("nan")

    df["CENTER"] = self.center
    return df
def _validate(self, patCountsDf, oncotreeLink):
    """
    Validate the patient counts dataframe.

    Checks that ONCOTREE_CODE exists, has no duplicates and only holds
    codes returned by ``process_functions.get_oncotree_code_mappings``,
    and that NUM_PATIENTS_PD1_PDL1 exists and holds only integers.

    Args:
        patCountsDf: Patient counts dataframe
        oncotreeLink: Link passed to the oncotree code mapping lookup

    Returns:
        tuple: (error message, warning message); the warning is always
        an empty string here.
    """
    total_error = ""
    warning = ""
    oncotree_mapping_dict = process_functions.get_oncotree_code_mappings(
        oncotreeLink
    )
    # Materialise the keys: a raw dict view is not a reliable list-like
    # for pandas.
    valid_codes = list(oncotree_mapping_dict.keys())

    if process_functions.checkColExist(patCountsDf, "ONCOTREE_CODE"):
        codes = patCountsDf['ONCOTREE_CODE']
        if codes.duplicated().any():
            total_error += (
                "Patient Counts: "
                "Must not have any duplicated ONCOTREE CODES.\n"
            )
        is_mapped = codes.isin(valid_codes)
        if not is_mapped.all():
            unmapped_oncotrees = codes[~is_mapped]
            # Stringify and sort so the message is deterministic and a
            # null / non-string code cannot break the join.
            unique_unmapped = sorted(
                {str(code) for code in unmapped_oncotrees}
            )
            total_error += (
                "Patient Counts: Please double check that all your "
                "ONCOTREE CODES exist in the mapping. You have {} codes "
                "that don't map. These are the codes that "
                "don't map: {}\n".format(
                    len(unmapped_oncotrees), ",".join(unique_unmapped)
                )
            )
    else:
        total_error += (
            "Patient Counts: File must have ONCOTREE_CODE column.\n"
        )

    if process_functions.checkColExist(patCountsDf, "NUM_PATIENTS_PD1_PDL1"):
        # Nulls make the column float, so they fail the int check too.
        if not all(
            isinstance(count, int)
            for count in patCountsDf['NUM_PATIENTS_PD1_PDL1']
        ):
            total_error += (
                "Patient Counts: Must not have any null values, "
                "and must be all integers.\n"
            )
    else:
        total_error += (
            "Patient Counts: File must have "
            "NUM_PATIENTS_PD1_PDL1 column.\n"
        )
    return (total_error, warning)
def _validate(self, vcf):
    """
    Validate the content of a vcf dataframe.

    Args:
        vcf: pandas dataframe containing the vcf content

    Returns:
        tuple: (error message, warning message)
    """
    errors = ""
    warnings_text = ""

    required = pd.Series(
        ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
    )
    if not required.isin(vcf.columns).all():
        errors += (
            "Your vcf file must have these headers: "
            "CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO.\n"
        )

    # More than the 8 fixed columns means genotype columns are present,
    # which require a FORMAT column.
    if len(vcf.columns) > 8 and "FORMAT" not in vcf.columns:
        errors += (
            "Your vcf file must have FORMAT header "
            "if genotype columns exist.\n"
        )

    # Require that they report variants mapped to either GRCh37 or hg19
    # without the chr-prefix. Variants on chrM are not supported.
    if process_functions.checkColExist(vcf, "#CHROM"):
        chromosomes = vcf['#CHROM']
        has_chr = any(
            "chr" in chrom for chrom in chromosomes if isinstance(chrom, str)
        )
        if has_chr:
            warnings_text += (
                "Your vcf file should not have the chr prefix "
                "in front of chromosomes.\n"
            )
        if chromosomes.isin(["chrM"]).any():
            errors += "Your vcf file must not have variants on chrM.\n"

    # No white spaces
    rows_with_whitespace = vcf.apply(
        lambda row: contains_whitespace(row), axis=1
    )
    if sum(rows_with_whitespace) > 0:
        warnings_text += (
            "Your vcf file should not have any white spaces "
            "in any of the columns.\n"
        )
    # I can also recommend a `bcftools query` command that will parse a
    # VCF in a detailed way, and output with warnings or errors if the
    # format is not adhered to.
    return (errors, warnings_text)
def _check_allele_col(df, col):
    """
    Check that an allele column is correctly formatted.

    A missing column yields no messages. The literal string "NA"
    produces a warning and null values produce an error.

    Args:
        df: mutation dataframe
        col: Column header name

    Returns:
        tuple: (error, warning)
    """
    error = ""
    warning = ""
    if not process_functions.checkColExist(df, col):
        return error, warning

    alleles = df[col]
    # CHECK: The value "NA" can't be used as a placeholder
    if (alleles.fillna("") == "NA").any():
        warning = (
            "maf: "
            f"{col} column contains 'NA' values, "
            "which cannot be placeholders for blank values. "
            "Please put in empty strings for blank values.\n"
        )
    # CHECK: There can't be any null values
    if alleles.isnull().any():
        error = f"maf: {col} can't have any blank or null values.\n"
    return error, warning
def _check_int_year_consistency(
    clinicaldf: DataFrame, cols: list, string_vals: list
) -> str:
    """
    Check that vital status interval and year columns agree with each other.

    Args:
        clinicaldf: Clinical Data Frame
        cols: Columns in the clinical data frame
        string_vals: String values that aren't integers

    Returns:
        Error message if the values are inconsistent, otherwise an
        empty string
    """
    interval_col = ""
    year_col = ""
    for col in cols:
        # This assumes that interval and year columns start with INT/YEAR.
        if col.startswith("INT"):
            interval_col = col
        if col.startswith("YEAR"):
            year_col = col
        # A missing column is reported elsewhere, so stay silent here.
        if not process_functions.checkColExist(clinicaldf, col):
            return ""

    # A string value is inconsistent when it fills some, but not all, of
    # the columns in a row (strictly between 0 and len(cols) matches).
    n_cols = len(cols)
    is_text_inconsistent = any(
        [
            (clinicaldf[cols] == str_val)
            .sum(axis=1)
            .between(0, n_cols, inclusive="neither")
            .any()
            for str_val in string_vals
        ]
    )

    # Redacted markers must appear in both columns of a row or in neither.
    over_89_mismatch = (clinicaldf[interval_col] == ">32485") != (
        clinicaldf[year_col] == ">89"
    )
    under_18_mismatch = (clinicaldf[interval_col] == "<6570") != (
        clinicaldf[year_col] == "<18"
    )
    is_redaction_inconsistent = bool(
        under_18_mismatch.any() or over_89_mismatch.any()
    )

    col_strs = ", ".join(cols)
    if is_text_inconsistent and is_redaction_inconsistent:
        return (
            "Patient: you have inconsistent redaction and text "
            f"values in {col_strs}.\n"
        )
    if is_redaction_inconsistent:
        return f"Patient: you have inconsistent redaction values in {col_strs}.\n"
    if is_text_inconsistent:
        return f"Patient: you have inconsistent text values in {col_strs}.\n"
    return ""
def _validate(self, mutationDF):
    """
    Validate the mutation file.

    Column names are upper-cased in place before any check runs.

    Args:
        mutationDF: mutation dataframe

    Returns:
        tuple: (error message, warning message)
    """
    allowed_first_headers = ['CHROMOSOME', 'HUGO_SYMBOL', 'TUMOR_SAMPLE_BARCODE']
    is_sp = self._fileType == "mafSP"
    # T_REF_COUNT + T_ALT_COUNT = T_DEPTH
    required_headers = [
        'CHROMOSOME', 'START_POSITION', 'REFERENCE_ALLELE',
        'TUMOR_SAMPLE_BARCODE',
    ]
    if not is_sp:
        # T_ALT_COUNT is only mandatory outside of the "mafSP" file type.
        required_headers.append('T_ALT_COUNT')
    required_headers.append('TUMOR_SEQ_ALLELE2')
    extra_headers = ['T_REF_COUNT', 'N_DEPTH', 'N_REF_COUNT', 'N_ALT_COUNT']

    mutationDF.columns = [col.upper() for col in mutationDF.columns]

    def has_col(name):
        return process_functions.checkColExist(mutationDF, name)

    total_error = ""
    warning = ""

    # CHECK: First column must be in the allowed first-header list
    if mutationDF.columns[0] not in allowed_first_headers:
        total_error += (
            "Mutation File: First column header must be "
            "one of these: %s.\n" % ", ".join(allowed_first_headers)
        )

    if not has_col("T_DEPTH") and not is_sp and not has_col("T_REF_COUNT"):
        total_error += (
            "Mutation File: If you are missing T_DEPTH, "
            "you must have T_REF_COUNT!\n"
        )

    # CHECK: Every required header must be in the mutation file
    if not all([has_col(header) for header in required_headers]):
        missing_required = [
            header
            for header in required_headers
            if header not in mutationDF.columns.values
        ]
        total_error += (
            "Mutation File: Must at least have these headers: %s.\n"
            % ",".join(missing_required)
        )

    if has_col("TUMOR_SEQ_ALLELE2"):
        tsa2 = mutationDF["TUMOR_SEQ_ALLELE2"]
        # CHECK: The value "NA" can't be used as a placeholder
        if (tsa2.fillna('') == "NA").any():
            warning += (
                "Mutation File: TUMOR_SEQ_ALLELE2 column contains 'NA' "
                "values, which cannot be placeholders for blank values. "
                "Please put in empty strings for blank values.\n"
            )
        # CHECK: There can't be any null values
        if tsa2.isnull().any():
            total_error += (
                "Mutation File: TUMOR_SEQ_ALLELE2 "
                "can't have any null values.\n"
            )

    # CHECK: Mutation file would benefit from the optional columns
    if not all([has_col(header) for header in extra_headers]) and not is_sp:
        missing_extra = [
            header
            for header in extra_headers
            if header not in mutationDF.columns.values
        ]
        warning += (
            "Mutation File: Does not have the column headers that can "
            "give extra information to the processed mutation file: "
            "%s.\n" % ", ".join(missing_extra)
        )

    if has_col("REFERENCE_ALLELE"):
        reference = mutationDF['REFERENCE_ALLELE']
        if (reference == "NA").any():
            warning += (
                "Mutation File: Your REFERENCE_ALLELE column contains NA "
                "values, which cannot be placeholders for blank values. "
                "Please put in empty strings for blank values.\n"
            )
        # CHECK: mutation file must not have empty reference alleles
        if reference.isnull().any():
            total_error += (
                "Mutation File: Cannot have any empty "
                "REFERENCE_ALLELE values.\n"
            )

    if has_col("CHROMOSOME"):
        # CHECK: Chromosome column can't have any values that start with
        # chr or have any WT values
        has_invalid_chrom = any(
            str(value).startswith("chr") or str(value) == "WT"
            for value in mutationDF['CHROMOSOME']
        )
        if has_invalid_chrom:
            total_error += (
                "Mutation File: CHROMOSOME column cannot have any values "
                "that start with 'chr' or any 'WT' values.\n"
            )

    return (total_error, warning)
def _check_tsa1_tsa2(df):
    """
    Check TUMOR_SEQ_ALLELE1 against the reference and TUMOR_SEQ_ALLELE2.

    The check only runs when TUMOR_SEQ_ALLELE1, TUMOR_SEQ_ALLELE2 and
    REFERENCE_ALLELE are all present. In that case every
    TUMOR_SEQ_ALLELE1 value must equal REFERENCE_ALLELE, or every value
    must equal TUMOR_SEQ_ALLELE2.

    Args:
        df: mutation dataframe

    Returns:
        Error message, or an empty string when the check passes or is
        not applicable
    """
    needed_cols = ("TUMOR_SEQ_ALLELE2", "TUMOR_SEQ_ALLELE1", "REFERENCE_ALLELE")
    cols_present = [
        process_functions.checkColExist(df, col) for col in needed_cols
    ]
    if not all(cols_present):
        return ""

    tsa1 = df["TUMOR_SEQ_ALLELE1"]
    matches_ref = (tsa1 == df["REFERENCE_ALLELE"]).all()
    matches_tsa2 = (tsa1 == df["TUMOR_SEQ_ALLELE2"]).all()
    if matches_ref or matches_tsa2:
        return ""

    return (
        "maf: Contains both "
        "TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. "
        "All values in TUMOR_SEQ_ALLELE1 must match all values in "
        "REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n"
    )
def _validate(self, vcfdf):
    """
    Validate the content of a vcf file.

    Args:
        vcfdf: pandas dataframe containing vcf content

    Returns:
        total_error - error messages
        warning - warning messages
    """
    total_error = ""
    warning = ""

    header_names = pd.Series(
        ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
    )
    has_all_headers = header_names.isin(vcfdf.columns).all()
    if has_all_headers:
        # The row-level checks below need the required columns to exist.
        # No duplicated variants
        variant_key = ["#CHROM", "POS", "REF", "ALT"]
        if vcfdf.duplicated(variant_key).any():
            total_error += "vcf: Must not have duplicate variants.\n"
        if vcfdf[["#CHROM", "POS"]].isnull().values.any():
            total_error += (
                "vcf: May contain rows that are "
                "space delimited instead of tab delimited.\n"
            )
    else:
        total_error += (
            "vcf: Must have these headers: "
            "CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO.\n"
        )

    # More than 8 columns means genotype columns are present.
    if len(vcfdf.columns) > 8 and "FORMAT" not in vcfdf.columns:
        total_error += (
            "vcf: Must have FORMAT header "
            "if genotype columns exist.\n"
        )

    # Require that they report variants mapped to either GRCh37 or hg19
    # without the chr-prefix. Variants on chrM are not supported.
    if process_functions.checkColExist(vcfdf, "#CHROM"):
        chrom_values = vcfdf["#CHROM"]
        found_chr = any(
            "chr" in chrom for chrom in chrom_values if isinstance(chrom, str)
        )
        if found_chr:
            warning += (
                "vcf: Should not have the chr prefix "
                "in front of chromosomes.\n"
            )
        if chrom_values.isin(["chrM"]).any():
            total_error += "vcf: Must not have variants on chrM.\n"

    # No white spaces
    rows_with_space = vcfdf.apply(
        lambda row: contains_whitespace(row), axis=1
    )
    if sum(rows_with_space) > 0:
        warning += (
            "vcf: Should not have any "
            "white spaces in any of the columns.\n"
        )
    # I can also recommend a `bcftools query` command that will parse a
    # VCF in a detailed way, and output with warnings or errors if the
    # format is not adhered to.
    return total_error, warning
def _validate(self, vitalStatusDf):
    """
    Validate the vital status dataframe.

    Required columns and their rules:
      - PATIENT_ID: no null values
      - YEAR_DEATH, YEAR_CONTACT: null, or an integer in YYYY format
      - INT_CONTACT, INT_DOD: null, an integer, '>32485' or '<6570'
      - DEAD: null or a boolean

    Args:
        vitalStatusDf: Vital status dataframe

    Returns:
        tuple: (error message, warning message); the warning is always
        an empty string here.
    """
    total_error = ""
    warning = ""

    def _years_are_valid(years):
        """Return True when every value parses as a YYYY year."""
        try:
            years.apply(
                lambda x: datetime.datetime.strptime(str(int(x)), '%Y')
            )
        # Any conversion failure means the column is invalid. Unlike a
        # bare `except:`, this lets KeyboardInterrupt/SystemExit through.
        except Exception:
            return False
        return True

    # PATIENT ID
    if process_functions.checkColExist(vitalStatusDf, "PATIENT_ID"):
        if vitalStatusDf["PATIENT_ID"].isnull().any():
            total_error += (
                "Vital status file: Please double check your "
                "PATIENT_ID column. No null values allowed.\n"
            )
    else:
        total_error += "Vital status file: Must have PATIENT_ID column.\n"

    # YEAR DEATH / YEAR CONTACT
    for year_col in ("YEAR_DEATH", "YEAR_CONTACT"):
        if process_functions.checkColExist(vitalStatusDf, year_col):
            not_null_years = vitalStatusDf[year_col].dropna()
            if not _years_are_valid(not_null_years):
                total_error += (
                    "Vital status file: Please double check your "
                    f"{year_col} column, it must be an integer in "
                    "YYYY format or an empty string.\n"
                )
        else:
            total_error += f"Vital status file: Must have {year_col} column.\n"

    # INT CONTACT / INT DOD
    redacted_values = ['>32485', '<6570']
    for int_col in ("INT_CONTACT", "INT_DOD"):
        if process_functions.checkColExist(vitalStatusDf, int_col):
            # Nulls and the redacted markers are allowed as-is.
            all_ints = all(
                process_functions.checkInt(value)
                for value in vitalStatusDf[int_col]
                if not pd.isnull(value) and value not in redacted_values
            )
            if not all_ints:
                total_error += (
                    "Vital status file: Please double check your "
                    f"{int_col} column, it must be an integer, "
                    "an empty string, >32485, or <6570.\n"
                )
        else:
            total_error += f"Vital status file: Must have {int_col} column.\n"

    # DEAD
    if process_functions.checkColExist(vitalStatusDf, "DEAD"):
        all_bool = all(
            isinstance(value, bool)
            for value in vitalStatusDf["DEAD"]
            if not pd.isnull(value)
        )
        if not all_bool:
            total_error += (
                "Vital status file: Please double check your DEAD column, "
                "it must be a boolean value or an empty string.\n"
            )
    else:
        total_error += "Vital status file: Must have DEAD column.\n"

    return (total_error, warning)
def _check_year(
    clinicaldf: DataFrame,
    year_col: str,
    filename: str,
    allowed_string_values: list = None,
) -> str:
    """Check a year column.

    Every value that is not one of ``allowed_string_values`` must be an
    integer in YYYY format that is not later than the current UTC year.

    Args:
        clinicaldf: Clinical dataframe
        year_col: Name of the YEAR column
        filename: Name of file, used as the prefix of the error message
        allowed_string_values: list of other allowed string values

    Returns:
        Error message, or an empty string when the column is valid
    """
    if allowed_string_values is None:
        allowed_string_values = []

    if not process_functions.checkColExist(clinicaldf, year_col):
        return f"{filename}: Must have {year_col} column.\n"

    # Deal with pre-redacted values and other allowed strings first,
    # because int(text) fails on values such as "<YYYY".
    year_series = clinicaldf[year_col][
        ~clinicaldf[year_col].isin(allowed_string_values)
    ]
    year_now = datetime.datetime.now(datetime.timezone.utc).year

    def _is_future_year(value):
        return datetime.datetime.strptime(str(int(value)), "%Y").year > year_now

    # A year may equal the current year but must not be in the future.
    # This is an explicit check, not an `assert`, so it still runs
    # under `python -O`.
    try:
        is_invalid = bool(year_series.apply(_is_future_year).any())
    except Exception:
        # Any value that cannot be parsed as a YYYY integer is invalid.
        is_invalid = True

    if not is_invalid:
        return ""

    error = (
        f"{filename}: Please double check your {year_col} "
        "column, it must be an integer in YYYY format "
        f"<= {year_now}"
    )
    # Tack on allowed string values
    if allowed_string_values:
        error += " or '{}'.\n".format("', '".join(allowed_string_values))
    else:
        error += ".\n"
    return error
def _validate(self, cnvDF, noSymbolCheck, testing=False):
    """
    Validate a CNA dataframe.

    Column names are upper-cased in place. HUGO_SYMBOL must be the first
    column, and every remaining value (ENTREZ_GENE_ID is ignored) must
    be one of the allowed copy number values. When the values are valid
    and ``noSymbolCheck`` is false, symbols are remapped against the
    center's bed table and duplicates after remapping are reported.

    Side effects on ``cnvDF``: HUGO_SYMBOL and ENTREZ_GENE_ID are
    removed; HUGO_SYMBOL is added back only when all values are valid;
    a ``remapped`` column is added when the symbol check runs.

    Args:
        cnvDF: CNA dataframe
        noSymbolCheck: Skip the gene symbol check when true
        testing: Passed as ``test`` to ``getDatabaseSynId``

    Returns:
        tuple: (error message, warning message)
    """
    total_error = ""
    warning = ""
    cnvDF.columns = [col.upper() for col in cnvDF.columns]

    if cnvDF.columns[0] != "HUGO_SYMBOL":
        total_error += "Your cnv file's first column must be Hugo_Symbol\n"

    haveColumn = process_functions.checkColExist(cnvDF, "HUGO_SYMBOL")
    keepSymbols = None
    if haveColumn:
        # Set the symbols aside so only numeric columns get checked.
        keepSymbols = cnvDF["HUGO_SYMBOL"]
        cnvDF.drop("HUGO_SYMBOL", axis=1, inplace=True)

    if process_functions.checkColExist(cnvDF, "ENTREZ_GENE_ID"):
        del cnvDF['ENTREZ_GENE_ID']

    # "-0.5" is included so the check accepts every value that the error
    # message below advertises as valid.
    allowed_values = {
        '-2.0', '-2', '-1.5', '-1.0', '-1', '-0.5', '0.0', '0', '0.5',
        '1.0', '1', '1.5', '2', '2.0', 'nan',
    }
    values_valid = all(
        cnvDF.applymap(lambda x: str(x) in allowed_values).all()
    )
    if not values_valid:
        total_error += (
            "All values must be NA/blank, -2, -1.5, -1, -0.5, "
            "0, 0.5, 1, 1.5, or 2.\n"
        )
    elif haveColumn:
        # Restore the symbols only when they were actually set aside;
        # the symbol check needs this column to be present.
        cnvDF['HUGO_SYMBOL'] = keepSymbols
        if not noSymbolCheck:
            bedSynId = process_functions.getDatabaseSynId(
                self.syn, "bed", test=testing
            )
            bed = self.syn.tableQuery(
                "select Hugo_Symbol, ID from %s where CENTER = '%s'"
                % (bedSynId, self.center)
            )
            bedDf = bed.asDataFrame()
            cnvDF['remapped'] = cnvDF['HUGO_SYMBOL'].apply(
                lambda x: validateSymbol(x, bedDf)
            )
            cnvDF = cnvDF[~cnvDF['remapped'].isnull()]
            # Do not allow any duplicated genes after symbols have been
            # remapped
            if cnvDF['remapped'].duplicated().any():
                duplicated = cnvDF['remapped'].duplicated(keep=False)
                total_error += (
                    "Your CNA file has duplicated Hugo_Symbols "
                    "(After remapping of genes): %s -> %s.\n"
                    % (
                        ",".join(cnvDF['HUGO_SYMBOL'][duplicated]),
                        ",".join(cnvDF['remapped'][duplicated]),
                    )
                )
    return (total_error, warning)
def _validate(self, fusionDF, noSymbolCheck, testing=False):
    """
    Validate a fusion dataframe.

    Column names are upper-cased in place and a COMMENTS column filled
    with NaN is added when it is missing. The required headers are then
    checked and, unless ``noSymbolCheck`` is set, the gene symbols are
    validated against the center's bed table.

    Args:
        fusionDF: Fusion dataframe
        noSymbolCheck: Skip the gene symbol check when true
        testing: Passed as ``test`` to ``getDatabaseSynId``

    Returns:
        tuple: (error message, warning message)
    """
    total_error = ""
    warning = ""

    # Frame: "in-frame" or "frameshift".
    # Fusion_Status (OPTIONAL): An assessment of the mutation type
    # (i.e., "SOMATIC", "GERMLINE", "UNKNOWN", or empty)
    fusionDF.columns = [col.upper() for col in fusionDF.columns]

    required_headers = pd.Series([
        'HUGO_SYMBOL', 'ENTREZ_GENE_ID', 'CENTER', 'TUMOR_SAMPLE_BARCODE',
        'FUSION', 'DNA_SUPPORT', 'RNA_SUPPORT', 'METHOD', 'FRAME'
    ])
    if fusionDF.get("COMMENTS") is None:
        fusionDF['COMMENTS'] = float('nan')

    header_present = required_headers.isin(fusionDF.columns)
    if not header_present.all():
        missing_headers = required_headers[~header_present]
        total_error += (
            "Your fusion file must at least have these headers: %s.\n"
            % ",".join(missing_headers)
        )

    run_symbol_check = (
        process_functions.checkColExist(fusionDF, "HUGO_SYMBOL")
        and not noSymbolCheck
    )
    if run_symbol_check:
        bed_synid = process_functions.getDatabaseSynId(
            self.syn, "bed", test=testing
        )
        bed_query = "select Hugo_Symbol, ID from %s where CENTER = '%s'" % (
            bed_synid, self.center
        )
        bed_df = self.syn.tableQuery(bed_query).asDataFrame()
        # NOTE(review): fusionDF is rebound to the result of apply(); the
        # null check below presumably relies on validateSymbol returning
        # the (possibly remapped) row -- confirm against its definition.
        fusionDF = fusionDF.drop_duplicates("HUGO_SYMBOL").apply(
            lambda row: validateSymbol(row, bed_df), axis=1
        )
        if fusionDF["HUGO_SYMBOL"].isnull().any():
            total_error += (
                "Your fusion file should not have any "
                "NA/blank Hugo Symbols.\n"
            )

    # Value checks on DNA_SUPPORT / RNA_SUPPORT ('yes', 'no', 'unknown')
    # and FRAME ('in-frame', 'frameshift') are currently not enforced.
    return (total_error, warning)
def _check_int_dead_consistency(clinicaldf: DataFrame) -> str:
    """Check that the INT_DOD and DEAD columns are consistent.

    Args:
        clinicaldf: Clinical Data Frame

    Returns:
        Error message if the values are inconsistent, otherwise an
        empty string
    """
    # Missing columns are reported elsewhere, so stay silent here.
    for required_col in ("INT_DOD", "DEAD"):
        if not process_functions.checkColExist(clinicaldf, required_col):
            return ""

    dead_text = clinicaldf["DEAD"].astype(str)
    is_dead = dead_text == "True"
    is_alive = dead_text == "False"

    placeholder_values = [
        "Unknown",
        "Not Collected",
        "Not Applicable",
        "Not Released",
    ]
    is_str = clinicaldf["DEAD"].isin(placeholder_values)

    # Placeholder strings must be identical in both columns.
    placeholders_match = (
        clinicaldf.loc[is_str, "DEAD"] == clinicaldf.loc[is_str, "INT_DOD"]
    ).all()
    # If dead, the interval can't be Not Applicable.
    dead_without_interval = (
        clinicaldf.loc[is_dead, "INT_DOD"] == "Not Applicable"
    ).any()
    # If alive, the interval must be Not Applicable.
    alive_all_not_applicable = (
        clinicaldf.loc[is_alive, "INT_DOD"] == "Not Applicable"
    ).all()

    is_consistent = (
        not dead_without_interval
        and alive_all_not_applicable
        and placeholders_match
    )
    if is_consistent:
        return ""
    return (
        "Patient Clinical File: DEAD value is inconsistent with INT_DOD "
        "for at least one patient.\n"
    )
def _validate(self, mutationDF):
    """
    Validate the mutation file against the mutation SOP.

    Column names are upper-cased in place. String values in the primary
    key columns are stripped, and 'NA' strings in numerical columns are
    converted to NaN, both in place.

    Args:
        mutationDF: mutation dataframe

    Returns:
        tuple: (error message, warning message)
    """
    first_header = ["CHROMOSOME", "HUGO_SYMBOL", "TUMOR_SAMPLE_BARCODE"]
    SP = self._fileType == "mafSP"
    # T_REF_COUNT + T_ALT_COUNT = T_DEPTH
    if SP:
        correct_column_headers = [
            "CHROMOSOME",
            "START_POSITION",
            "REFERENCE_ALLELE",
            "TUMOR_SAMPLE_BARCODE",
            "TUMOR_SEQ_ALLELE2",
        ]
    else:
        correct_column_headers = [
            "CHROMOSOME",
            "START_POSITION",
            "REFERENCE_ALLELE",
            "TUMOR_SAMPLE_BARCODE",
            "T_ALT_COUNT",
            "TUMOR_SEQ_ALLELE2",
        ]
    optional_headers = ["T_REF_COUNT", "N_DEPTH", "N_REF_COUNT", "N_ALT_COUNT"]

    mutationDF.columns = [col.upper() for col in mutationDF.columns]

    total_error = StringIO()
    warning = StringIO()

    # CHECK: Everything in correct_column_headers must be in mutation file
    has_required_headers = all(
        process_functions.checkColExist(mutationDF, header)
        for header in correct_column_headers
    )
    if not has_required_headers:
        missing_headers = [
            header
            for header in correct_column_headers
            if header not in mutationDF.columns.values
        ]
        # The trailing space after "make" keeps the adjacent literals
        # from running together as "makesure".
        total_error.write(
            "maf: Must at least have these headers: {}. "
            "If you are writing your maf file with R, please make "
            "sure to specify the 'quote=FALSE' parameter.\n".format(
                ",".join(missing_headers)
            )
        )
    else:
        # CHECK: First column must be in the first_header list
        if mutationDF.columns[0] not in first_header:
            total_error.write(
                "maf: First column header must be "
                "one of these: {}.\n".format(", ".join(first_header))
            )
        # No duplicated values
        primary_cols = [
            "CHROMOSOME",
            "START_POSITION",
            "REFERENCE_ALLELE",
            "TUMOR_SAMPLE_BARCODE",
            "TUMOR_SEQ_ALLELE2",
        ]
        # Strip white space if string column, so values that differ only
        # by padding are treated as duplicates.
        for col in primary_cols:
            if mutationDF[col].dtype == object:
                mutationDF[col] = mutationDF[col].str.strip()
        duplicated_idx = mutationDF.duplicated(primary_cols)
        if duplicated_idx.any():
            # Find samples with duplicated variants
            duplicated_variants = (
                mutationDF["TUMOR_SAMPLE_BARCODE"][duplicated_idx]
                .unique()
                .astype(str)
                .tolist()
            )
            total_error.write(
                "maf: Must not have duplicated variants. "
                "Samples with duplicated variants: "
                f"{', '.join(duplicated_variants)}\n"
            )

    t_depth_exists = process_functions.checkColExist(mutationDF, "T_DEPTH")
    t_ref_exists = process_functions.checkColExist(mutationDF, "T_REF_COUNT")
    if not t_depth_exists and not t_ref_exists and not SP:
        total_error.write("maf: If missing T_DEPTH, must have T_REF_COUNT!\n")

    numerical_cols = [
        "T_DEPTH",
        "T_ALT_COUNT",
        "T_REF_COUNT",
        "N_DEPTH",
        "N_REF_COUNT",
        "N_ALT_COUNT",
    ]
    for col in numerical_cols:
        if not process_functions.checkColExist(mutationDF, col):
            continue
        # Since NA is an allowed value, when reading in the dataframe
        # the 'NA' string is not converted. This will convert all
        # 'NA' values in the numerical columns into actual float('nan')
        mutationDF.loc[mutationDF[col] == "NA", col] = float("nan")
        # Attempt to convert column to float; on failure the column keeps
        # its non-numeric dtype and is reported just below.
        try:
            mutationDF[col] = mutationDF[col].astype(float)
        except ValueError:
            pass
        if mutationDF[col].dtype not in [int, float]:
            total_error.write(f"maf: {col} must be a numerical column.\n")

    # CHECK: Must have TUMOR_SEQ_ALLELE2
    error, warn = _check_allele_col(mutationDF, "TUMOR_SEQ_ALLELE2")
    total_error.write(error)
    warning.write(warn)

    # CHECK: Mutation file would benefit from columns in optional_headers
    has_optional_headers = all(
        process_functions.checkColExist(mutationDF, header)
        for header in optional_headers
    )
    if not has_optional_headers and not SP:
        missing_optional = [
            header
            for header in optional_headers
            if header not in mutationDF.columns.values
        ]
        warning.write(
            "maf: Does not have the column headers that can give extra "
            "information to the processed maf: {}.\n".format(
                ", ".join(missing_optional)
            )
        )

    # CHECK: Must have REFERENCE_ALLELE
    error, warn = _check_allele_col(mutationDF, "REFERENCE_ALLELE")
    total_error.write(error)
    warning.write(warn)

    if process_functions.checkColExist(mutationDF, "CHROMOSOME"):
        # CHECK: Chromosome column can't have any values that start
        # with chr or have any WT values
        has_invalid_chromosome = any(
            str(value).startswith("chr") or str(value) == "WT"
            for value in mutationDF["CHROMOSOME"]
        )
        if has_invalid_chromosome:
            total_error.write(
                "maf: CHROMOSOME column cannot have any values that "
                "start with 'chr' or any 'WT' values.\n"
            )

    error = _check_tsa1_tsa2(mutationDF)
    total_error.write(error)

    return total_error.getvalue(), warning.getvalue()
def _validate(self, cnvDF, nosymbol_check, project_id):
    """
    Validate a CNA dataframe.

    Column names are upper-cased in place. HUGO_SYMBOL must be the first
    column, and every remaining value (ENTREZ_GENE_ID is ignored) must
    be one of the allowed copy number values. When the values are valid
    and ``nosymbol_check`` is false, symbols are remapped against the
    center's bed table and duplicates after remapping are reported.

    Side effects on ``cnvDF``: HUGO_SYMBOL and ENTREZ_GENE_ID are
    removed; HUGO_SYMBOL is added back only when all values are valid;
    a ``remapped`` column is added when the symbol check runs.

    Args:
        cnvDF: CNA dataframe
        nosymbol_check: Skip the gene symbol check when true
        project_id: Passed to ``get_synid_database_mappingdf`` to look
            up the bed database table

    Returns:
        tuple: (error message, warning message)
    """
    total_error = ""
    warning = ""
    cnvDF.columns = [col.upper() for col in cnvDF.columns]

    if cnvDF.columns[0] != "HUGO_SYMBOL":
        total_error += "Your cnv file's first column must be Hugo_Symbol\n"

    haveColumn = process_functions.checkColExist(cnvDF, "HUGO_SYMBOL")
    keepSymbols = None
    if haveColumn:
        # Set the symbols aside so only numeric columns get checked.
        keepSymbols = cnvDF["HUGO_SYMBOL"]
        cnvDF.drop("HUGO_SYMBOL", axis=1, inplace=True)

    if process_functions.checkColExist(cnvDF, "ENTREZ_GENE_ID"):
        del cnvDF["ENTREZ_GENE_ID"]

    # "-0.5" is included so the check accepts every value that the error
    # message below advertises as valid.
    allowed_values = {
        "-2.0",
        "-2",
        "-1.5",
        "-1.0",
        "-1",
        "-0.5",
        "0.0",
        "0",
        "0.5",
        "1.0",
        "1",
        "1.5",
        "2",
        "2.0",
        "nan",
    }
    values_valid = all(
        cnvDF.applymap(lambda x: str(x) in allowed_values).all()
    )
    if not values_valid:
        total_error += (
            "All values must be NA/blank, -2, -1.5, -1, -0.5, "
            "0, 0.5, 1, 1.5, or 2.\n"
        )
    elif haveColumn:
        # Restore the symbols only when they were actually set aside;
        # the symbol check needs this column to be present.
        cnvDF["HUGO_SYMBOL"] = keepSymbols
        if not nosymbol_check:
            databaseToSynIdMappingDf = (
                process_functions.get_synid_database_mappingdf(
                    self.syn, project_id
                )
            )
            bedSynId = process_functions.getDatabaseSynId(
                self.syn,
                "bed",
                databaseToSynIdMappingDf=databaseToSynIdMappingDf,
            )
            bed = self.syn.tableQuery(
                "select Hugo_Symbol, ID from {} where "
                "CENTER = '{}'".format(bedSynId, self.center)
            )
            bedDf = bed.asDataFrame()
            cnvDF["remapped"] = cnvDF["HUGO_SYMBOL"].apply(
                lambda x: validateSymbol(x, bedDf)
            )
            cnvDF = cnvDF[~cnvDF["remapped"].isnull()]
            # Do not allow any duplicated genes after symbols
            # have been remapped
            if cnvDF["remapped"].duplicated().any():
                duplicated = cnvDF["remapped"].duplicated(keep=False)
                total_error += (
                    "Your CNA file has duplicated Hugo_Symbols "
                    "(After remapping of genes): {} -> {}.\n".format(
                        ",".join(cnvDF["HUGO_SYMBOL"][duplicated]),
                        ",".join(cnvDF["remapped"][duplicated]),
                    )
                )
    return (total_error, warning)
def _validate(self, clinicaldf, oncotree_link):
    """
    Validate the clinical file against the clinical SOP.

    Column names of the passed dataframe are upper-cased in place; all
    other normalisation happens on an internal copy.

    Args:
        clinicaldf: Merged clinical file with patient and sample
            information
        oncotree_link: Link to oncotree

    Returns:
        tuple: (error message, warning message)
    """
    total_error = StringIO()
    warning = StringIO()

    clinicaldf.columns = [col.upper() for col in clinicaldf.columns]
    # CHECK: for empty rows
    empty_rows = clinicaldf.isnull().values.all(axis=1)
    if empty_rows.any():
        total_error.write("Clinical file(s): No empty rows allowed.\n")
        # Remove completely empty rows to speed up processing
        clinicaldf = clinicaldf[~empty_rows]

    # Blank cells are represented as "" for every check below
    clinicaldf = clinicaldf.fillna("")

    oncotree_mapping_dict = process_functions.get_oncotree_code_mappings(
        oncotree_link
    )
    oncotree_mapping = pd.DataFrame(
        {"ONCOTREE_CODE": list(oncotree_mapping_dict.keys())}
    )

    sampletype_mapping = process_functions.getGenieMapping(self.syn, "syn7434273")
    ethnicity_mapping = process_functions.getGenieMapping(self.syn, "syn7434242")
    race_mapping = process_functions.getGenieMapping(self.syn, "syn7434236")
    sex_mapping = process_functions.getGenieMapping(self.syn, "syn7434222")

    # CHECK: SAMPLE_ID
    sample_id = "SAMPLE_ID"
    haveSampleColumn = process_functions.checkColExist(clinicaldf, sample_id)
    if not haveSampleColumn:
        total_error.write("Sample Clinical File: Must have SAMPLE_ID column.\n")
    elif clinicaldf[sample_id].duplicated().any():
        total_error.write(
            "Sample Clinical File: No duplicated SAMPLE_ID "
            "allowed.\nIf there are no duplicated "
            "SAMPLE_IDs, and both sample and patient files are "
            "uploaded, then please check to make sure no duplicated "
            "PATIENT_IDs exist in the patient clinical file.\n"
        )

    # CHECK: PATIENT_ID
    patientId = "PATIENT_ID"
    havePatientColumn = process_functions.checkColExist(clinicaldf, patientId)
    if not havePatientColumn:
        total_error.write("Patient Clinical File: Must have PATIENT_ID column.\n")

    # CHECK: within the sample file that the sample ids match
    # the patient ids
    if haveSampleColumn and havePatientColumn:
        # Make sure sample and patient ids are string cols
        clinicaldf[sample_id] = clinicaldf[sample_id].astype(str)
        clinicaldf[patientId] = clinicaldf[patientId].astype(str)
        if not all(
            patient in sample
            for sample, patient in zip(clinicaldf[sample_id], clinicaldf[patientId])
        ):
            total_error.write(
                "Sample Clinical File: PATIENT_ID's much be contained in "
                "the SAMPLE_ID's (ex. SAGE-1 <-> SAGE-1-2)\n"
            )
        # CHECK: All samples must have associated patient data
        # (GENIE requires patient data)
        missing_patient = clinicaldf[patientId] == ""
        if missing_patient.any():
            total_error.write(
                "Patient Clinical File: All samples must have associated "
                "patient information and no null patient ids allowed. "
                "These samples are missing patient data: {}\n".format(
                    ", ".join(clinicaldf[sample_id][missing_patient].unique())
                )
            )
        # CHECK: All patients should have associated sample data
        missing_sample = clinicaldf[sample_id] == ""
        if missing_sample.any():
            # Reported as a warning only, for now
            warning.write(
                "Sample Clinical File: All patients must have associated "
                "sample information. These patients are missing sample "
                "data: {}\n".format(
                    ", ".join(clinicaldf[patientId][missing_sample].unique())
                )
            )

    # CHECK: AGE_AT_SEQ_REPORT
    age = "AGE_AT_SEQ_REPORT"
    if process_functions.checkColExist(clinicaldf, age):
        # The redacted / unknown placeholders are allowed as-is and are
        # excluded from the integer and median checks.  Selecting the
        # column as its own Series avoids assigning into a filtered frame.
        ages = clinicaldf.loc[
            ~clinicaldf[age].isin(["Unknown", ">32485", "<6570"]), age
        ]
        if not all(process_functions.checkInt(i) for i in ages):
            total_error.write(
                "Sample Clinical File: Please double check your "
                "AGE_AT_SEQ_REPORT. It must be an integer, 'Unknown', "
                "'>32485', '<6570'.\n"
            )
        else:
            median_age = ages.astype(int).median()
            # A median below 100 suggests the values are years, not days
            if median_age < 100:
                total_error.write(
                    "Sample Clinical File: Please double check your "
                    "AGE_AT_SEQ_REPORT. You may be reporting this value "
                    "in YEARS, please report in DAYS.\n"
                )
    else:
        total_error.write(
            "Sample Clinical File: Must have AGE_AT_SEQ_REPORT column.\n"
        )

    # CHECK: ONCOTREE_CODE
    maleOncoCodes = ["TESTIS", "PROSTATE", "PENIS"]
    womenOncoCodes = ["CERVIX", "VULVA", "UTERUS", "OVARY"]
    if process_functions.checkColExist(clinicaldf, "ONCOTREE_CODE"):
        # Make oncotree codes uppercase (SpCC/SPCC)
        clinicaldf["ONCOTREE_CODE"] = (
            clinicaldf["ONCOTREE_CODE"].astype(str).str.upper()
        )

        oncotree_codes = clinicaldf["ONCOTREE_CODE"][
            clinicaldf["ONCOTREE_CODE"] != "UNKNOWN"
        ]
        is_mapped = oncotree_codes.isin(oncotree_mapping["ONCOTREE_CODE"])
        if not is_mapped.all():
            unmapped_oncotrees = oncotree_codes[~is_mapped]
            total_error.write(
                "Sample Clinical File: Please double check that all your "
                "ONCOTREE CODES exist in the mapping. You have {} samples "
                "that don't map. These are the codes that "
                "don't map: {}\n".format(
                    len(unmapped_oncotrees),
                    ",".join(set(unmapped_oncotrees)),
                )
            )
        # Should add the SEX mismatch into the dashboard file
        if (
            process_functions.checkColExist(clinicaldf, "SEX")
            and havePatientColumn
            and haveSampleColumn
        ):
            # First SEX value listed for each patient, built in one pass
            # instead of re-scanning the whole column for every sample.
            sex_by_patient = {}
            for patient, raw_sex in zip(
                clinicaldf["PATIENT_ID"], clinicaldf["SEX"]
            ):
                sex_by_patient.setdefault(patient, raw_sex)

            # Samples whose oncotree primary node conflicts with the sex
            wrongCodeSamples = []
            for code, patient, sample in zip(
                clinicaldf["ONCOTREE_CODE"],
                clinicaldf["PATIENT_ID"],
                clinicaldf["SAMPLE_ID"],
            ):
                code_info = oncotree_mapping_dict.get(code)
                if code_info is None:
                    continue
                primary_node = code_info["ONCOTREE_PRIMARY_NODE"]
                raw_sex = sex_by_patient[patient]
                try:
                    sex = float("nan") if raw_sex == "" else float(raw_sex)
                except (TypeError, ValueError):
                    # Non-numeric SEX values are reported by the SEX check
                    # further down; here they simply never match 1.0/2.0.
                    sex = float("nan")
                if primary_node in maleOncoCodes and sex != 1.0:
                    wrongCodeSamples.append(sample)
                if primary_node in womenOncoCodes and sex != 2.0:
                    wrongCodeSamples.append(sample)
            if wrongCodeSamples:
                warning.write(
                    "Sample Clinical File: Some SAMPLE_IDs have "
                    "conflicting SEX and ONCOTREE_CODES: {}\n".format(
                        ",".join(wrongCodeSamples)
                    )
                )
    else:
        total_error.write("Sample Clinical File: Must have ONCOTREE_CODE column.\n")

    # NOTE(review): unlike the other check_col_and_values calls, the
    # returned warning is not recorded here - confirm this is intended.
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "SAMPLE_TYPE",
        sampletype_mapping["CODE"].tolist(),
        "Sample Clinical File",
        required=True,
    )
    total_error.write(error)

    # CHECK: SEQ_ASSAY_ID
    if process_functions.checkColExist(clinicaldf, "SEQ_ASSAY_ID"):
        has_seq_assay = clinicaldf.SEQ_ASSAY_ID != ""
        if not has_seq_assay.all():
            total_error.write(
                "Sample Clinical File: Please double check your "
                "SEQ_ASSAY_ID columns, there are empty rows.\n"
            )
        # Empty ids are left out before checking that every id starts with
        # the center name; the comparison is done on the upper-cased id.
        invalid_seqassay = [
            seqassay
            for seqassay in clinicaldf.SEQ_ASSAY_ID[has_seq_assay].unique()
            if not seqassay.upper().startswith(self.center)
        ]
        if invalid_seqassay:
            total_error.write(
                "Sample Clinical File: Please make sure your "
                "SEQ_ASSAY_IDs start with your center "
                "abbreviation: {}.\n".format(", ".join(invalid_seqassay))
            )
    else:
        total_error.write("Sample Clinical File: Must have SEQ_ASSAY_ID column.\n")

    # CHECK: SEQ_DATE
    seq_date_error = (
        "Sample Clinical File: SEQ_DATE must be one of five values- "
        "For Jan-March: use Jan-YEAR. "
        "For Apr-June: use Apr-YEAR. "
        "For July-Sep: use Jul-YEAR. "
        "For Oct-Dec: use Oct-YEAR. (ie. Apr-2017) "
        "For values that don't have SEQ_DATES that "
        "you want released use 'release'.\n"
    )
    if process_functions.checkColExist(clinicaldf, "SEQ_DATE"):
        clinicaldf["SEQ_DATE"] = [
            i.title() for i in clinicaldf["SEQ_DATE"].astype(str)
        ]

        seqdate = clinicaldf["SEQ_DATE"][clinicaldf["SEQ_DATE"] != "Release"]
        if (clinicaldf["SEQ_DATE"] == "").any():
            total_error.write(
                "Sample Clinical File: Samples without SEQ_DATEs will "
                "NOT be released.\n"
            )
        try:
            if not seqdate.empty:
                # strptime raises ValueError for anything not Mon-YYYY
                seqdate.apply(
                    lambda date: datetime.datetime.strptime(date, "%b-%Y")
                )
                if not seqdate.str.startswith(("Jan", "Apr", "Jul", "Oct")).all():
                    total_error.write(seq_date_error)
        except ValueError:
            total_error.write(seq_date_error)
    else:
        total_error.write("Sample Clinical File: Must have SEQ_DATE column.\n")

    # CHECK: BIRTH_YEAR
    error = _check_year(
        clinicaldf=clinicaldf,
        year_col="BIRTH_YEAR",
        filename="Patient Clinical File",
        allowed_string_values=["Unknown", ">89", "<18"],
    )
    total_error.write(error)

    # CHECK: YEAR DEATH
    error = _check_year(
        clinicaldf=clinicaldf,
        year_col="YEAR_DEATH",
        filename="Patient Clinical File",
        allowed_string_values=[
            "Unknown",
            "Not Collected",
            "Not Applicable",
            "Not Released",
            ">89",
            "<18",
        ],
    )
    total_error.write(error)

    # CHECK: YEAR CONTACT
    error = _check_year(
        clinicaldf=clinicaldf,
        year_col="YEAR_CONTACT",
        filename="Patient Clinical File",
        allowed_string_values=[
            "Unknown",
            "Not Collected",
            "Not Released",
            ">89",
            "<18",
        ],
    )
    total_error.write(error)

    # CHECK: INT CONTACT
    if process_functions.checkColExist(clinicaldf, "INT_CONTACT"):
        int_contact_strings = [
            ">32485",
            "<6570",
            "Unknown",
            "Not Collected",
            "Not Released",
        ]
        if not all(
            process_functions.checkInt(i)
            for i in clinicaldf.INT_CONTACT
            if i not in int_contact_strings
        ):
            total_error.write(
                "Patient Clinical File: Please double check your "
                "INT_CONTACT column, it must be an integer, '>32485', "
                "'<6570', 'Unknown', 'Not Released' or 'Not Collected'.\n"
            )
    else:
        total_error.write("Patient Clinical File: Must have INT_CONTACT column.\n")

    # CHECK: INT DOD
    if process_functions.checkColExist(clinicaldf, "INT_DOD"):
        int_dod_strings = [
            ">32485",
            "<6570",
            "Unknown",
            "Not Collected",
            "Not Applicable",
            "Not Released",
        ]
        if not all(
            process_functions.checkInt(i)
            for i in clinicaldf.INT_DOD
            if i not in int_dod_strings
        ):
            total_error.write(
                "Patient Clinical File: Please double check your INT_DOD "
                "column, it must be an integer, '>32485', '<6570', "
                "'Unknown', 'Not Collected', 'Not Released' or "
                "'Not Applicable'.\n"
            )
    else:
        total_error.write("Patient Clinical File: Must have INT_DOD column.\n")

    # CHECK: DEAD
    if process_functions.checkColExist(clinicaldf, "DEAD"):
        # Need to have check_bool function
        if not all(
            str(i).upper() in ["TRUE", "FALSE"]
            for i in clinicaldf.DEAD
            if i not in ["Unknown", "Not Collected", "Not Released"]
        ):
            total_error.write(
                "Patient Clinical File: Please double check your "
                "DEAD column, it must be True, False, 'Unknown', "
                "'Not Released' or 'Not Collected'.\n"
            )
    else:
        total_error.write("Patient Clinical File: Must have DEAD column.\n")

    # CHECK: contact vital status value consistency
    contact_error = _check_int_year_consistency(
        clinicaldf=clinicaldf,
        cols=["YEAR_CONTACT", "INT_CONTACT"],
        string_vals=["Not Collected", "Unknown", "Not Released"],
    )
    total_error.write(contact_error)

    # CHECK: death vital status value consistency
    death_error = _check_int_year_consistency(
        clinicaldf=clinicaldf,
        cols=["YEAR_DEATH", "INT_DOD"],
        string_vals=[
            "Not Collected",
            "Unknown",
            "Not Applicable",
            "Not Released",
        ],
    )
    total_error.write(death_error)
    death_error = _check_int_dead_consistency(clinicaldf=clinicaldf)
    total_error.write(death_error)

    # CHECK: SAMPLE_CLASS is optional attribute
    if process_functions.checkColExist(clinicaldf, "SAMPLE_CLASS"):
        sample_class_vals = pd.Series(clinicaldf["SAMPLE_CLASS"].unique().tolist())
        if not sample_class_vals.isin(["Tumor", "cfDNA"]).all():
            total_error.write(
                "Sample Clinical File: SAMPLE_CLASS column must "
                "be 'Tumor', or 'cfDNA'\n"
            )

    # CHECK: PRIMARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "PRIMARY_RACE",
        race_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning.write(warn)
    total_error.write(error)

    # CHECK: SECONDARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "SECONDARY_RACE",
        race_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning.write(warn)
    total_error.write(error)

    # CHECK: TERTIARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "TERTIARY_RACE",
        race_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning.write(warn)
    total_error.write(error)

    # CHECK: SEX
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "SEX",
        sex_mapping["CODE"].tolist(),
        "Patient Clinical File",
        required=True,
    )
    warning.write(warn)
    total_error.write(error)

    # CHECK: ETHNICITY
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "ETHNICITY",
        ethnicity_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning.write(warn)
    total_error.write(error)

    return total_error.getvalue(), warning.getvalue()
def _validate(self, assay_info_df, project_id):
    """
    Validate the values of the assay information file.

    Args:
        assay_info_df: assay information dataframe
        project_id: Project identifier forwarded to
            process_functions.get_synid_database_mappingdf

    Returns:
        tuple: error and warning
    """
    filename = "Assay_information.yaml"
    total_error = ""
    warning = ""

    if process_functions.checkColExist(assay_info_df, "SEQ_ASSAY_ID"):
        # Compare on the normalised form: "_" -> "-" and upper case
        all_seq_assays = (
            assay_info_df.SEQ_ASSAY_ID.replace({"_": "-"}, regex=True)
            .str.upper()
            .unique()
        )
        if not all(assay.startswith(self.center) for assay in all_seq_assays):
            total_error += (
                "Assay_information.yaml: Please make sure all your "
                "SEQ_ASSAY_IDs start with your center abbreviation.\n"
            )
        db_to_syn_map_df = process_functions.get_synid_database_mappingdf(
            self.syn, project_id
        )
        sample_synid = process_functions.getDatabaseSynId(
            self.syn, "sample", databaseToSynIdMappingDf=db_to_syn_map_df
        )
        uniq_seq_df = process_functions.get_syntabledf(
            self.syn,
            f"select distinct(SEQ_ASSAY_ID) as seq from {sample_synid} "
            f"where CENTER = '{self.center}'",
        )
        # These are all the SEQ_ASSAY_IDs that are in the clinical database
        # but not in the assay_information file
        in_assay_file = (
            uniq_seq_df["seq"]
            .replace({"_": "-"}, regex=True)
            .str.upper()
            .isin(all_seq_assays)
        )
        missing_seqs = uniq_seq_df["seq"][~in_assay_file]
        missing_seqs_str = ", ".join(missing_seqs)
        if missing_seqs.to_list():
            total_error += (
                "Assay_information.yaml: You are missing SEQ_ASSAY_IDs: "
                f"{missing_seqs_str}\n"
            )
    else:
        total_error += "Assay_information.yaml: Must have SEQ_ASSAY_ID column.\n"

    read_group_dict = process_functions.get_gdc_data_dictionary("read_group")
    read_group_headers = read_group_dict["properties"]

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "is_paired_end",
        [True, False],
        filename=filename,
        required=True,
    )
    warning += warn
    total_error += error

    # Required columns whose allowed values come straight from the
    # read_group data dictionary
    for column in ("library_selection", "library_strategy", "platform"):
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            column,
            read_group_headers[column]["enum"],
            filename=filename,
            required=True,
        )
        warning += warn
        total_error += error

    # Build a new list instead of extending the enum in place, so the
    # dictionary returned by get_gdc_data_dictionary is left untouched.
    instrument_model = [
        *read_group_headers["instrument_model"]["enum"],
        "Illumina NovaSeq 6000",
        None,
    ]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "instrument_model",
        instrument_model,
        filename=filename,
        required=True,
    )
    warning += warn
    total_error += error

    # target_capture_kit values are not validated against the data
    # dictionary enum; only the presence of the column is required.
    if not process_functions.checkColExist(assay_info_df, "target_capture_kit"):
        total_error += (
            "Assay_information.yaml: " "Must have target_capture_kit column.\n"
        )

    variant_classes = [
        "Splice_Site",
        "Nonsense_Mutation",
        "Frame_Shift_Del",
        "Frame_Shift_Ins",
        "Nonstop_Mutation",
        "Translation_Start_Site",
        "In_Frame_Ins",
        "In_Frame_Del",
        "Missense_Mutation",
        "Intron",
        "Splice_Region",
        "Silent",
        "RNA",
        "5'UTR",
        "3'UTR",
        "IGR",
        "5'Flank",
        "3'Flank",
        None,
    ]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "variant_classifications",
        variant_classes,
        filename=filename,
        na_allowed=True,
        sep=";",
    )
    warning += warn
    total_error += error

    if process_functions.checkColExist(assay_info_df, "read_length"):
        # Null values are allowed and skipped
        if not all(
            process_functions.checkInt(i)
            for i in assay_info_df["read_length"]
            if i is not None and not pd.isnull(i)
        ):
            total_error += (
                "Assay_information.yaml: "
                "Please double check your read_length. "
                "It must be an integer or null.\n"
            )
    else:
        total_error += "Assay_information.yaml: " "Must have read_length column.\n"

    if process_functions.checkColExist(assay_info_df, "number_of_genes"):
        if not all(
            process_functions.checkInt(i) for i in assay_info_df["number_of_genes"]
        ):
            total_error += (
                "Assay_information.yaml: "
                "Please double check your number_of_genes. "
                "It must be an integer.\n"
            )
    else:
        total_error += (
            "Assay_information.yaml: " "Must have number_of_genes column.\n"
        )

    if process_functions.checkColExist(assay_info_df, "gene_padding"):
        # Blank values are allowed and skipped
        if not all(
            process_functions.checkInt(i)
            for i in assay_info_df["gene_padding"]
            if i is not None and not pd.isnull(i)
        ):
            total_error += (
                "Assay_information.yaml: "
                "Please double check your gene_padding. "
                "It must be an integer or blank.\n"
            )
    else:
        warning += (
            "Assay_information.yaml: "
            "gene_padding is by default 10 if not specified.\n"
        )

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "calling_strategy",
        ["tumor_only", "tumor_normal", "plasma_normal"],
        filename=filename,
        required=True,
    )
    warning += warn
    total_error += error

    if process_functions.checkColExist(assay_info_df, "specimen_tumor_cellularity"):
        # str() keeps non-string cells (numbers, NaN) from raising
        # AttributeError; they fail the format check and are reported.
        if not all(
            str(i).startswith(">") and str(i).endswith("%")
            for i in assay_info_df["specimen_tumor_cellularity"]
        ):
            total_error += (
                "Assay_information.yaml: "
                "Please double check your specimen_tumor_cellularity. "
                "It must in this format >(num)%. ie. >10%\n"
            )
    else:
        total_error += (
            "Assay_information.yaml: "
            "Must have specimen_tumor_cellularity column.\n"
        )

    alteration_types = [
        "snv",
        "small_indels",
        "gene_level_cna",
        "intragenic_cna",
        "structural_variants",
    ]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "alteration_types",
        alteration_types,
        filename=filename,
        required=True,
        sep=";",
    )
    warning += warn
    total_error += error

    preservation_technique = ["FFPE", "fresh_frozen", "NA"]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "preservation_technique",
        preservation_technique,
        filename=filename,
        required=True,
        sep=";",
    )
    warning += warn
    total_error += error

    coverage = ["hotspot_regions", "coding_exons", "introns", "promoters"]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "coverage",
        coverage,
        filename=filename,
        required=True,
        sep=";",
    )
    warning += warn
    total_error += error

    return total_error, warning
def _validate(self, assay_info_df):
    """
    Validate the values of the assay information file.

    Args:
        assay_info_df: assay information dataframe

    Returns:
        tuple: error and warning
    """
    filename = "Assay_information.yaml"
    total_error = ""
    warning = ""

    if process_functions.checkColExist(assay_info_df, "SEQ_ASSAY_ID"):
        all_seq_assays = assay_info_df.SEQ_ASSAY_ID.unique()
        if not all(assay.startswith(self.center) for assay in all_seq_assays):
            total_error += (
                "Assay_information.yaml: Please make sure your all your"
                " SEQ_ASSAY_IDs start with your center abbreviation.\n"
            )
    else:
        total_error += "Assay_information.yaml: Must have SEQ_ASSAY_ID column.\n"

    read_group_dict = process_functions.get_gdc_data_dictionary("read_group")
    read_group_headers = read_group_dict["properties"]

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "is_paired_end",
        [True, False],
        filename=filename,
        required=True,
    )
    warning += warn
    total_error += error

    # Required columns whose allowed values come straight from the
    # read_group data dictionary
    for column in ("library_selection", "library_strategy", "platform"):
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            column,
            read_group_headers[column]["enum"],
            filename=filename,
            required=True,
        )
        warning += warn
        total_error += error

    # None is allowed in addition to the enum.  A new list is built so the
    # enum inside the dictionary returned by get_gdc_data_dictionary is
    # not modified by this call.
    instrument_model = [*read_group_headers["instrument_model"]["enum"], None]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "instrument_model",
        instrument_model,
        filename=filename,
        required=True,
    )
    warning += warn
    total_error += error

    variant_classes = [
        "Splice_Site",
        "Nonsense_Mutation",
        "Frame_Shift_Del",
        "Frame_Shift_Ins",
        "Nonstop_Mutation",
        "Translation_Start_Site",
        "In_Frame_Ins",
        "In_Frame_Del",
        "Missense_Mutation",
        "Intron",
        "Splice_Region",
        "Silent",
        "RNA",
        "5'UTR",
        "3'UTR",
        "IGR",
        "5'Flank",
        "3'Flank",
        None,
    ]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "variant_classifications",
        variant_classes,
        filename=filename,
        na_allowed=True,
    )
    warning += warn
    total_error += error

    # The target_capture_kit column is not required by this version.

    if process_functions.checkColExist(assay_info_df, "read_length"):
        # Null values are allowed and skipped
        if not all(
            process_functions.checkInt(i)
            for i in assay_info_df["read_length"]
            if i is not None and not pd.isnull(i)
        ):
            total_error += (
                "Assay_information.yaml: "
                "Please double check your read_length. "
                "It must be an integer or null.\n"
            )
    else:
        total_error += "Assay_information.yaml: " "Must have read_length column.\n"

    if process_functions.checkColExist(assay_info_df, "number_of_genes"):
        if not all(
            process_functions.checkInt(i) for i in assay_info_df["number_of_genes"]
        ):
            total_error += (
                "Assay_information.yaml: "
                "Please double check your number_of_genes. "
                "It must be an integer.\n"
            )
    else:
        total_error += (
            "Assay_information.yaml: " "Must have number_of_genes column.\n"
        )

    if process_functions.checkColExist(assay_info_df, "gene_padding"):
        # Blank values are allowed and skipped
        if not all(
            process_functions.checkInt(i)
            for i in assay_info_df["gene_padding"]
            if i is not None and not pd.isnull(i)
        ):
            total_error += (
                "Assay_information.yaml: "
                "Please double check your gene_padding. "
                "It must be an integer or blank.\n"
            )
    else:
        warning += (
            "Assay_information.yaml: "
            "gene_padding is by default 10 if not specified.\n"
        )
    return (total_error, warning)
def _validate(self, clinicalDF, oncotreeLink):
    """
    Validate the clinical file against the clinical SOP.

    Column names of the passed dataframe are upper-cased in place; all
    other normalisation happens on an internal copy.

    Args:
        clinicalDF: Merged clinical file with patient and sample
            information
        oncotreeLink: Link to oncotree

    Returns:
        tuple: (error message, warning message)
    """
    total_error = ""
    warning = ""

    clinicalDF.columns = [col.upper() for col in clinicalDF.columns]
    # Blank cells are represented as "" for every check below
    clinicalDF = clinicalDF.fillna("")

    oncotree_mapping_dict = process_functions.get_oncotree_code_mappings(
        oncotreeLink
    )
    # Materialise the keys: a dict view is not a valid column value
    oncotree_mapping = pd.DataFrame(
        {"ONCOTREE_CODE": list(oncotree_mapping_dict.keys())}
    )

    sampleType_mapping = process_functions.getGenieMapping(self.syn, "syn7434273")
    ethnicity_mapping = process_functions.getGenieMapping(self.syn, "syn7434242")
    race_mapping = process_functions.getGenieMapping(self.syn, "syn7434236")
    sex_mapping = process_functions.getGenieMapping(self.syn, "syn7434222")

    # CHECK: SAMPLE_ID
    sampleId = "SAMPLE_ID"
    haveSampleColumn = process_functions.checkColExist(clinicalDF, sampleId)
    if not haveSampleColumn:
        total_error += "Sample Clinical File: Must have SAMPLE_ID column.\n"
    elif clinicalDF[sampleId].duplicated().any():
        total_error += (
            "Sample Clinical File: No duplicated SAMPLE_ID "
            "allowed.\nIf there are no duplicated "
            "SAMPLE_IDs, and both sample and patient files are "
            "uploaded, then please check to make sure no duplicated "
            "PATIENT_IDs exist in the patient clinical file.\n"
        )

    # CHECK: PATIENT_ID
    patientId = "PATIENT_ID"
    havePatientColumn = process_functions.checkColExist(clinicalDF, patientId)
    if not havePatientColumn:
        total_error += "Patient Clinical File: Must have PATIENT_ID column.\n"

    # CHECK: within the sample file that the sample ids match
    # the patient ids
    if haveSampleColumn and havePatientColumn:
        # Make sure sample and patient ids are string cols
        clinicalDF[sampleId] = clinicalDF[sampleId].astype(str)
        clinicalDF[patientId] = clinicalDF[patientId].astype(str)
        if not all(
            patient in sample
            for sample, patient in zip(clinicalDF[sampleId], clinicalDF[patientId])
        ):
            total_error += (
                "Sample Clinical File: PATIENT_ID's much be contained in "
                "the SAMPLE_ID's (ex. SAGE-1 <-> SAGE-1-2)\n"
            )
        # CHECK: All samples must have associated patient data
        # (GENIE requires patient data)
        missing_patient = clinicalDF[patientId] == ""
        if missing_patient.any():
            total_error += (
                "Patient Clinical File: All samples must have associated "
                "patient information and no null patient ids allowed. "
                "These samples are missing patient data: {}\n".format(
                    ", ".join(clinicalDF[sampleId][missing_patient])
                )
            )
        # CHECK: All patients should have associated sample data
        missing_sample = clinicalDF[sampleId] == ""
        if missing_sample.any():
            # Reported as a warning only, for now
            warning += (
                "Sample Clinical File: All patients must have associated "
                "sample information. These patients are missing sample "
                "data: {}\n".format(
                    ", ".join(clinicalDF[patientId][missing_sample])
                )
            )

    # CHECK: AGE_AT_SEQ_REPORT
    age = "AGE_AT_SEQ_REPORT"
    if process_functions.checkColExist(clinicalDF, age):
        # Work on an explicit copy so the column can be reassigned without
        # writing into a filtered view of clinicalDF.
        age_seq_report_df = clinicalDF[~clinicalDF[age].isin(["Unknown"])].copy()
        # Deal with HIPAA converted rows from DFCI: values such as <3435
        # cannot be passed to int() directly.
        age_seq_report_df[age] = remove_greaterthan_lessthan_str(
            age_seq_report_df[age]
        )
        if not all(process_functions.checkInt(i) for i in age_seq_report_df[age]):
            total_error += (
                "Sample Clinical File: Please double check your "
                "AGE_AT_SEQ_REPORT. It must be an integer or 'Unknown'.\n"
            )
        else:
            median_age = age_seq_report_df[age].astype(int).median()
            # A median below 100 suggests the values are years, not days
            if median_age < 100:
                total_error += (
                    "Sample Clinical File: Please double check your "
                    "AGE_AT_SEQ_REPORT. You may be reporting this value "
                    "in YEARS, please report in DAYS.\n"
                )
    else:
        total_error += "Sample Clinical File: Must have AGE_AT_SEQ_REPORT column.\n"

    # CHECK: ONCOTREE_CODE
    maleOncoCodes = ["TESTIS", "PROSTATE", "PENIS"]
    womenOncoCodes = ["CERVIX", "VULVA", "UTERUS", "OVARY"]
    if process_functions.checkColExist(clinicalDF, "ONCOTREE_CODE"):
        # Make oncotree codes uppercase (SpCC/SPCC)
        clinicalDF["ONCOTREE_CODE"] = (
            clinicalDF["ONCOTREE_CODE"].astype(str).str.upper()
        )

        oncotree_codes = clinicalDF["ONCOTREE_CODE"][
            clinicalDF["ONCOTREE_CODE"] != "UNKNOWN"
        ]
        is_mapped = oncotree_codes.isin(oncotree_mapping["ONCOTREE_CODE"])
        if not is_mapped.all():
            unmapped_oncotrees = oncotree_codes[~is_mapped]
            total_error += (
                "Sample Clinical File: Please double check that all your "
                "ONCOTREE CODES exist in the mapping. You have {} samples "
                "that don't map. These are the codes that "
                "don't map: {}\n".format(
                    len(unmapped_oncotrees), ",".join(set(unmapped_oncotrees))
                )
            )

        if (
            process_functions.checkColExist(clinicalDF, "SEX")
            and havePatientColumn
            and haveSampleColumn
        ):
            # First SEX value listed for each patient, built in one pass
            # instead of re-scanning the whole column for every sample.
            sex_by_patient = {}
            for patient, raw_sex in zip(
                clinicalDF["PATIENT_ID"], clinicalDF["SEX"]
            ):
                sex_by_patient.setdefault(patient, raw_sex)

            # Samples whose oncotree primary node conflicts with the sex
            wrongCodeSamples = []
            for code, patient, sample in zip(
                clinicalDF["ONCOTREE_CODE"],
                clinicalDF["PATIENT_ID"],
                clinicalDF["SAMPLE_ID"],
            ):
                code_info = oncotree_mapping_dict.get(code)
                if code_info is None:
                    continue
                primary_node = code_info["ONCOTREE_PRIMARY_NODE"]
                raw_sex = sex_by_patient[patient]
                try:
                    sex = float("nan") if raw_sex == "" else float(raw_sex)
                except (TypeError, ValueError):
                    # Non-numeric SEX values are reported by the SEX check
                    # further down; here they simply never match 1.0/2.0.
                    sex = float("nan")
                if primary_node in maleOncoCodes and sex != 1.0:
                    wrongCodeSamples.append(sample)
                if primary_node in womenOncoCodes and sex != 2.0:
                    wrongCodeSamples.append(sample)
            if wrongCodeSamples:
                warning += (
                    "Sample Clinical File: Some SAMPLE_IDs have "
                    "conflicting SEX and ONCOTREE_CODES: {}\n".format(
                        ",".join(wrongCodeSamples)
                    )
                )
    else:
        total_error += "Sample Clinical File: Must have ONCOTREE_CODE column.\n"

    # NOTE(review): unlike the other check_col_and_values calls, the
    # returned warning is not recorded here - confirm this is intended.
    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "SAMPLE_TYPE",
        sampleType_mapping["CODE"].tolist(),
        "Sample Clinical File",
        required=True,
    )
    total_error += error

    # CHECK: SEQ_ASSAY_ID
    if process_functions.checkColExist(clinicalDF, "SEQ_ASSAY_ID"):
        has_seq_assay = clinicalDF.SEQ_ASSAY_ID != ""
        if not has_seq_assay.all():
            total_error += (
                "Sample Clinical File: Please double check your "
                "SEQ_ASSAY_ID columns, there are empty rows.\n"
            )
        # Empty ids are left out before checking that every id starts with
        # the center name; the comparison is done on the upper-cased id.
        invalid_seqassay = [
            seqassay
            for seqassay in clinicalDF.SEQ_ASSAY_ID[has_seq_assay].unique()
            if not seqassay.upper().startswith(self.center)
        ]
        if invalid_seqassay:
            total_error += (
                "Sample Clinical File: Please make sure your "
                "SEQ_ASSAY_IDs start with your center "
                "abbreviation: {}.\n".format(", ".join(invalid_seqassay))
            )
    else:
        total_error += "Sample Clinical File: Must have SEQ_ASSAY_ID column.\n"

    # CHECK: SEQ_DATE
    seq_date_error = (
        "Sample Clinical File: SEQ_DATE must be one of five values- "
        "For Jan-March: use Jan-YEAR. "
        "For Apr-June: use Apr-YEAR. "
        "For July-Sep: use Jul-YEAR. "
        "For Oct-Dec: use Oct-YEAR. (ie. Apr-2017) "
        "For values that don't have SEQ_DATES that "
        "you want released use 'release'.\n"
    )
    if process_functions.checkColExist(clinicalDF, "SEQ_DATE"):
        clinicalDF["SEQ_DATE"] = [
            i.title() for i in clinicalDF["SEQ_DATE"].astype(str)
        ]

        seqDate = clinicalDF["SEQ_DATE"][clinicalDF["SEQ_DATE"] != "Release"]
        if (clinicalDF["SEQ_DATE"] == "").any():
            total_error += (
                "Sample Clinical File: Samples without SEQ_DATEs will "
                "NOT be released.\n"
            )
        try:
            if not seqDate.empty:
                # strptime raises ValueError for anything not Mon-YYYY
                seqDate.apply(
                    lambda date: datetime.datetime.strptime(date, "%b-%Y")
                )
                if not all(
                    i.startswith(("Jul", "Jan", "Oct", "Apr")) for i in seqDate
                ):
                    total_error += seq_date_error
        except ValueError:
            total_error += seq_date_error
    else:
        total_error += "Sample Clinical File: Must have SEQ_DATE column.\n"

    # CHECK: BIRTH_YEAR
    birth_year = "BIRTH_YEAR"
    if process_functions.checkColExist(clinicalDF, birth_year):
        current_year = datetime.datetime.utcnow().year
        # Explicit copy so the column can be reassigned safely
        birth_year_df = clinicalDF[~clinicalDF[birth_year].isin(["Unknown"])].copy()
        # Deal with HIPAA converted rows from DFCI: values such as <YYYY
        # cannot be passed to int() directly.
        birth_year_df[birth_year] = remove_greaterthan_lessthan_str(
            birth_year_df[birth_year]
        )
        # Explicit flag instead of `assert`, which is stripped when Python
        # runs with -O and would silently disable the future-year check.
        try:
            in_future = birth_year_df[birth_year].apply(
                lambda x: datetime.datetime.strptime(str(int(x)), "%Y").year
                > current_year
            )
            invalid_birth_year = bool(in_future.any())
        except Exception:
            # Any value that cannot be read as a YYYY year is invalid
            invalid_birth_year = True
        if invalid_birth_year:
            total_error += (
                "Patient Clinical File: Please double check your "
                "BIRTH_YEAR column, it must be an integer in YYYY format "
                "> {year} or 'Unknown'.\n".format(year=current_year)
            )
    else:
        total_error += "Patient Clinical File: Must have BIRTH_YEAR column.\n"

    # CHECK: VITAL_STATUS
    # YEAR DEATH
    if process_functions.checkColExist(clinicalDF, "YEAR_DEATH"):
        notNullYears = clinicalDF.YEAR_DEATH[
            ~clinicalDF.YEAR_DEATH.isin(["Unknown", "Not Collected", "Not Applicable"])
        ]
        try:
            notNullYears.apply(
                lambda x: datetime.datetime.strptime(str(int(x)), "%Y")
            )
        except Exception:
            total_error += (
                "Patient Clinical File: Please double check your "
                "YEAR_DEATH column, it must be an integer in YYYY format, "
                "'Unknown', 'Not Applicable' or 'Not Collected'.\n"
            )
    else:
        total_error += "Patient Clinical File: Must have YEAR_DEATH column.\n"

    # YEAR CONTACT
    if process_functions.checkColExist(clinicalDF, "YEAR_CONTACT"):
        notNullYears = clinicalDF.YEAR_CONTACT[
            ~clinicalDF.YEAR_CONTACT.isin(["Unknown", "Not Collected"])
        ]
        try:
            notNullYears.apply(
                lambda x: datetime.datetime.strptime(str(int(x)), "%Y")
            )
        except Exception:
            total_error += (
                "Patient Clinical File: Please double check your "
                "YEAR_CONTACT column, it must be an integer in YYYY "
                "format, 'Unknown' or 'Not Collected'.\n"
            )
    else:
        total_error += "Patient Clinical File: Must have YEAR_CONTACT column.\n"

    # INT CONTACT
    if process_functions.checkColExist(clinicalDF, "INT_CONTACT"):
        if not all(
            process_functions.checkInt(i)
            for i in clinicalDF.INT_CONTACT
            if i not in [">32485", "<6570", "Unknown", "Not Collected"]
        ):
            total_error += (
                "Patient Clinical File: Please double check your "
                "INT_CONTACT column, it must be an integer, '>32485', "
                "'<6570', 'Unknown' or 'Not Collected'.\n"
            )
    else:
        total_error += "Patient Clinical File: Must have INT_CONTACT column.\n"

    # INT DOD
    if process_functions.checkColExist(clinicalDF, "INT_DOD"):
        if not all(
            process_functions.checkInt(i)
            for i in clinicalDF.INT_DOD
            if i
            not in [
                ">32485",
                "<6570",
                "Unknown",
                "Not Collected",
                "Not Applicable",
            ]
        ):
            total_error += (
                "Patient Clinical File: Please double check your INT_DOD "
                "column, it must be an integer, '>32485', '<6570', "
                "'Unknown', 'Not Collected' or 'Not Applicable'.\n"
            )
    else:
        total_error += "Patient Clinical File: Must have INT_DOD column.\n"

    # DEAD
    if process_functions.checkColExist(clinicalDF, "DEAD"):
        # Need to have check_bool function
        if not all(
            str(i).upper() in ["TRUE", "FALSE"]
            for i in clinicalDF.DEAD
            if i not in ["Unknown", "Not Collected"]
        ):
            total_error += (
                "Patient Clinical File: Please double check your "
                "DEAD column, it must be True, False, 'Unknown' or "
                "'Not Collected'.\n"
            )
    else:
        total_error += "Patient Clinical File: Must have DEAD column.\n"

    # CHECK: PRIMARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "PRIMARY_RACE",
        race_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning += warn
    total_error += error

    # CHECK: SECONDARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "SECONDARY_RACE",
        race_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning += warn
    total_error += error

    # CHECK: TERTIARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "TERTIARY_RACE",
        race_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning += warn
    total_error += error

    # CHECK: SEX
    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "SEX",
        sex_mapping["CODE"].tolist(),
        "Patient Clinical File",
        required=True,
    )
    warning += warn
    total_error += error

    # CHECK: ETHNICITY
    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "ETHNICITY",
        ethnicity_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning += warn
    total_error += error

    return (total_error, warning)