Python checkColExist Examples

Programming Language: Python

Namespace/Package Name: genie.process_functions

Method/Function: checkColExist

Examples at hotexamples.com: 21

Python checkColExist - 21 examples found. These are the top rated real world Python examples of genie.process_functions.checkColExist extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

    def _process(self, df):
        """
        Process the assay information dataframe.

        - Standardizes SEQ_ASSAY_ID (upper-cased, '_' replaced by '-')
        - Fills missing gene_padding values with 10 and casts to int
        - Adds an empty variant_classifications column when it is absent
        - Sets CENTER to this object's center on every row

        Args:
            df: Assay information dataframe (modified in place)

        Returns:
            dataframe: Processed dataframe
        """
        df['SEQ_ASSAY_ID'] = [
            assay.upper().replace('_', '-') for assay in df['SEQ_ASSAY_ID']
        ]
        if process_functions.checkColExist(df, "gene_padding"):
            df['gene_padding'] = df['gene_padding'].fillna(10).astype(int)
        else:
            df['gene_padding'] = 10

        if not process_functions.checkColExist(df, "variant_classifications"):
            # float("nan") rather than pd.np.nan: the pandas.np alias was
            # deprecated in pandas 1.0 and is gone from current releases.
            df['variant_classifications'] = float("nan")

        df['CENTER'] = self.center
        return df

Example #2

Show file

    def _validate(self, clinicalDF):
        """
        Validate a non-GENIE clinical dataframe.

        Column names are upper-cased in place, then SAMPLE_ID,
        SEQ_ASSAY_ID and PATIENT_ID are checked for presence and for
        blank values.

        Args:
            clinicalDF: Clinical dataframe

        Returns:
            tuple: (error text, warning text)
        """
        clinicalDF.columns = [name.upper() for name in clinicalDF.columns]
        errors = ""
        warnings_text = ""

        # SAMPLE_ID: required, blank values are errors
        if process_functions.checkColExist(clinicalDF, 'SAMPLE_ID'):
            if clinicalDF['SAMPLE_ID'].isnull().any():
                errors += (
                    "nonGENIE_data_clinical.txt: "
                    "There can't be any blank values for SAMPLE_ID\n")
        else:
            errors += (
                "nonGENIE_data_clinical.txt: "
                "File must have SAMPLE_ID column.\n")

        # SEQ_ASSAY_ID: required, blank values only produce a warning
        if not process_functions.checkColExist(clinicalDF, "SEQ_ASSAY_ID"):
            errors += (
                "nonGENIE_data_clinical.txt: "
                "File must have SEQ_ASSAY_ID column.\n")
        elif clinicalDF['SEQ_ASSAY_ID'].isnull().any():
            warnings_text += (
                "nonGENIE_data_clinical.txt: "
                "Please double check your SEQ_ASSAY_ID columns, "
                "there are empty rows.\n")

        # PATIENT_ID: required, blank values are errors
        if process_functions.checkColExist(clinicalDF, 'PATIENT_ID'):
            if clinicalDF['PATIENT_ID'].isnull().any():
                errors += (
                    "nonGENIE_data_clinical.txt: "
                    "There can't be any blank values for PATIENT_ID\n")
        else:
            errors += (
                "nonGENIE_data_clinical.txt: "
                "File must have PATIENT_ID column.\n")

        return (errors, warnings_text)

Example #3

Show file

    def _process(self, df):
        """
        Process the assay information dataframe.

        SEQ_ASSAY_ID and SEQ_PIPELINE_ID are upper-cased with '_'
        replaced by '-', gene_padding defaults to 10 (and is cast to
        int), variant_classifications is created empty when missing,
        and CENTER is set to this object's center.

        Args:
            df: Assay information dataframe (modified in place)

        Returns:
            dataframe: Processed dataframe
        """
        def _standardize(values):
            # Upper-case each identifier and use '-' as the separator
            return [value.upper().replace("_", "-") for value in values]

        df["SEQ_ASSAY_ID"] = _standardize(df["SEQ_ASSAY_ID"])
        df["SEQ_PIPELINE_ID"] = _standardize(df["SEQ_PIPELINE_ID"])

        has_padding = process_functions.checkColExist(df, "gene_padding")
        if not has_padding:
            df["gene_padding"] = 10
        else:
            df["gene_padding"] = df["gene_padding"].fillna(10)
            df["gene_padding"] = df["gene_padding"].astype(int)

        has_classifications = process_functions.checkColExist(
            df, "variant_classifications"
        )
        if not has_classifications:
            df["variant_classifications"] = float("nan")

        df["CENTER"] = self.center
        return df

Example #4

Show file

    def _validate(self, patCountsDf, oncotreeLink):
        """
        Validate the patient counts dataframe.

        Checks that ONCOTREE_CODE exists, has no duplicates and only
        contains codes returned by
        process_functions.get_oncotree_code_mappings, and that
        NUM_PATIENTS_PD1_PDL1 exists and holds only integers.

        Args:
            patCountsDf: Patient counts dataframe
            oncotreeLink: Link passed to get_oncotree_code_mappings

        Returns:
            tuple: (error text, warning text); no warnings are produced
        """
        total_error = ""
        warning = ""
        oncotree_mapping_dict = \
            process_functions.get_oncotree_code_mappings(oncotreeLink)
        # A plain list of the valid codes is all isin() needs; no need to
        # build a DataFrame from a dict keys view.
        valid_codes = list(oncotree_mapping_dict.keys())

        if process_functions.checkColExist(patCountsDf, "ONCOTREE_CODE"):
            codes = patCountsDf['ONCOTREE_CODE']
            if codes.duplicated().any():
                total_error += (
                    "Patient Counts: "
                    "Must not have any duplicated ONCOTREE CODES.\n")
            is_mapped = codes.isin(valid_codes)
            if not is_mapped.all():
                unmapped_oncotrees = codes[~is_mapped]
                # str() keeps null / non-string codes from crashing the
                # join, and sorting makes the message deterministic.
                unique_unmapped = sorted(set(map(str, unmapped_oncotrees)))
                total_error += (
                    "Patient Counts: Please double check that all your "
                    "ONCOTREE CODES exist in the mapping. You have {} codes "
                    "that don't map. These are the codes that "
                    "don't map: {}\n".format(len(unmapped_oncotrees),
                                             ",".join(unique_unmapped)))
        else:
            total_error += (
                "Patient Counts: File must have ONCOTREE_CODE column.\n")

        if process_functions.checkColExist(patCountsDf,
                                           "NUM_PATIENTS_PD1_PDL1"):
            # Null values are not ints either, so they fail this check too
            if not all(isinstance(count, int)
                       for count in patCountsDf['NUM_PATIENTS_PD1_PDL1']):
                total_error += (
                    "Patient Counts: Must not have any null values, "
                    "and must be all integers.\n")
        else:
            total_error += ("Patient Counts: File must have "
                            "NUM_PATIENTS_PD1_PDL1 column.\n")
        return (total_error, warning)

Example #5

Show file

File: vcf.py Project: jaybee84/Genie

    def _validate(self, vcf):
        """
        Validate the content of a vcf dataframe.

        Args:
            vcf: Dataframe holding the vcf content

        Returns:
            tuple: (error text, warning text)
        """
        required = pd.Series(
            ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"])
        errors = ""
        warnings_text = ""

        if not required.isin(vcf.columns).all():
            errors += (
                "Your vcf file must have these headers: "
                "CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO.\n")

        # More than the 8 fixed columns means FORMAT must be present
        if len(vcf.columns) > 8 and "FORMAT" not in vcf.columns:
            errors += (
                "Your vcf file must have FORMAT header "
                "if genotype columns exist.\n")

        # Variants must be reported against GRCh37 / hg19 without the
        # chr-prefix; variants on chrM are not supported.
        if process_functions.checkColExist(vcf, "#CHROM"):
            chromosomes = vcf['#CHROM']
            if any("chr" in value for value in chromosomes
                   if isinstance(value, str)):
                warnings_text += (
                    "Your vcf file should not have the chr prefix "
                    "in front of chromosomes.\n")
            if chromosomes.isin(["chrM"]).any():
                errors += "Your vcf file must not have variants on chrM.\n"

        # No white spaces
        has_whitespace = vcf.apply(lambda row: contains_whitespace(row),
                                   axis=1)
        if sum(has_whitespace) > 0:
            warnings_text += (
                "Your vcf file should not have any white spaces "
                "in any of the columns.\n")
        # A `bcftools query` command could also parse the VCF in detail
        # and report warnings or errors when the format is not adhered to.
        return (errors, warnings_text)

Example #6

Show file

def _check_allele_col(df, col):
    """
    Check that an allele column is correctly formatted.

    Nothing is reported when the column is absent.

    Args:
        df: mutation dataframe
        col: Column header name

    Returns:
        error, warning

    """
    error = ""
    warning = ""
    if not process_functions.checkColExist(df, col):
        return error, warning

    alleles = df[col]
    # The literal string "NA" can't be used as a placeholder
    if (alleles.fillna("") == "NA").any():
        warning = (
            f"maf: {col} column contains 'NA' values, "
            "which cannot be placeholders for blank values.  "
            "Please put in empty strings for blank values.\n"
        )
    # Null values are not allowed at all
    if alleles.isnull().any():
        error = f"maf: {col} can't have any blank or null values.\n"

    return error, warning

Example #7

Show file

File: clinical.py Project: Sage-Bionetworks/Genie

def _check_int_year_consistency(
    clinicaldf: DataFrame, cols: list, string_vals: list
) -> str:
    """
    Check that vital status interval and year columns hold consistent
    values.

    Args:
        clinicaldf: Clinical Data Frame
        cols: Columns in the clinical data frame
        string_vals: String values that aren't integers

    Returns:
        Error message if the values are inconsistent, otherwise an
        empty string
    """
    interval_col = ""
    year_col = ""
    for name in cols:
        # Interval and year columns are recognised by their INT / YEAR
        # prefix.
        if name.startswith("INT"):
            interval_col = name
        if name.startswith("YEAR"):
            year_col = name
        # A missing column is reported elsewhere, so stay silent here.
        if not process_functions.checkColExist(clinicaldf, name):
            return ""

    n_cols = len(cols)
    is_text_inconsistent = False
    for str_val in string_vals:
        # Number of columns in each row that hold this string value; a
        # count strictly between 0 and n_cols means only some match.
        matches_per_row = (clinicaldf[cols] == str_val).sum(axis=1)
        if matches_per_row.between(0, n_cols, inclusive="neither").any():
            is_text_inconsistent = True

    # Redacted markers must appear in both columns or in neither
    intervals = clinicaldf[interval_col]
    years = clinicaldf[year_col]
    over_89_mismatch = (intervals == ">32485") != (years == ">89")
    under_18_mismatch = (intervals == "<6570") != (years == "<18")
    is_redaction_inconsistent = bool(
        any(under_18_mismatch) or any(over_89_mismatch)
    )

    col_strs = ", ".join(cols)
    if is_redaction_inconsistent:
        if is_text_inconsistent:
            return (
                "Patient: you have inconsistent redaction and text "
                f"values in {col_strs}.\n"
            )
        return f"Patient: you have inconsistent redaction values in {col_strs}.\n"
    if is_text_inconsistent:
        return f"Patient: you have inconsistent text values in {col_strs}.\n"

    return ""

Example #8

Show file

File: maf.py Project: sgosline/Genie

    def _validate(self, mutationDF):
        """
        Validate a mutation dataframe.

        Column names are upper-cased in place before the header and
        value checks run.

        Args:
            mutationDF: Mutation dataframe

        Returns:
            tuple: (error text, warning text)
        """
        is_sp = self._fileType == "mafSP"
        allowed_first = ['CHROMOSOME', 'HUGO_SYMBOL', 'TUMOR_SAMPLE_BARCODE']
        # T_REF_COUNT + T_ALT_COUNT = T_DEPTH
        required = ['CHROMOSOME', 'START_POSITION', 'REFERENCE_ALLELE',
                    'TUMOR_SAMPLE_BARCODE']
        if not is_sp:
            required.append('T_ALT_COUNT')
        required.append('TUMOR_SEQ_ALLELE2')
        optional = ['T_REF_COUNT', 'N_DEPTH', 'N_REF_COUNT', 'N_ALT_COUNT']

        mutationDF.columns = [name.upper() for name in mutationDF.columns]

        def has_column(name):
            # Shorthand for the shared column-existence helper
            return process_functions.checkColExist(mutationDF, name)

        errors = ""
        warnings_text = ""

        # First column must be one of the allowed leading headers
        if mutationDF.columns[0] not in allowed_first:
            errors += (
                "Mutation File: First column header must be "
                "one of these: %s.\n" % ", ".join(allowed_first))

        # Without T_DEPTH, T_REF_COUNT is needed (not for mafSP files)
        if (not has_column("T_DEPTH") and not is_sp
                and not has_column("T_REF_COUNT")):
            errors += (
                "Mutation File: If you are missing T_DEPTH, "
                "you must have T_REF_COUNT!\n")

        # Every required header must be present
        if not all([has_column(header) for header in required]):
            missing_required = [header for header in required
                                if header not in mutationDF.columns.values]
            errors += (
                "Mutation File: Must at least have these headers: %s.\n"
                % ",".join(missing_required))

        if has_column("TUMOR_SEQ_ALLELE2"):
            tsa2 = mutationDF["TUMOR_SEQ_ALLELE2"]
            # "NA" can't be used as a placeholder
            if (tsa2.fillna('') == "NA").any():
                warnings_text += (
                    "Mutation File: TUMOR_SEQ_ALLELE2 column contains 'NA' "
                    "values, which cannot be placeholders for blank values.  "
                    "Please put in empty strings for blank values.\n")
            # Null values are not allowed
            if tsa2.isnull().any():
                errors += (
                    "Mutation File: "
                    "TUMOR_SEQ_ALLELE2 can't have any null values.\n")

        # Optional headers add information to the processed mutation file
        if not all([has_column(header) for header in optional]) and not is_sp:
            missing_optional = [header for header in optional
                                if header not in mutationDF.columns.values]
            warnings_text += (
                "Mutation File: Does not have the column headers that can "
                "give extra information to the processed mutation file: "
                "%s.\n" % ", ".join(missing_optional))

        if has_column("REFERENCE_ALLELE"):
            ref_allele = mutationDF['REFERENCE_ALLELE']
            if (ref_allele == "NA").any():
                warnings_text += (
                    "Mutation File: Your REFERENCE_ALLELE column contains NA "
                    "values, which cannot be placeholders for blank values.  "
                    "Please put in empty strings for blank values.\n")
            # Empty reference alleles are not allowed
            if ref_allele.isnull().any():
                errors += (
                    "Mutation File: "
                    "Cannot have any empty REFERENCE_ALLELE values.\n")

        if has_column("CHROMOSOME"):
            # No values starting with chr, and no WT values
            if any(str(value).startswith("chr") or str(value) == "WT"
                   for value in mutationDF['CHROMOSOME']):
                errors += (
                    "Mutation File: CHROMOSOME column cannot have any values "
                    "that start with 'chr' or any 'WT' values.\n")

        return (errors, warnings_text)

Example #9

Show file

def _check_tsa1_tsa2(df):
    """Check TUMOR_SEQ_ALLELE1 against the other allele columns.

    When TUMOR_SEQ_ALLELE1, TUMOR_SEQ_ALLELE2 and REFERENCE_ALLELE are
    all present, every TUMOR_SEQ_ALLELE1 value must equal
    REFERENCE_ALLELE, or every one must equal TUMOR_SEQ_ALLELE2.
    Returns the error text, or an empty string.
    """
    needed = ("TUMOR_SEQ_ALLELE2", "TUMOR_SEQ_ALLELE1", "REFERENCE_ALLELE")
    present = [process_functions.checkColExist(df, name) for name in needed]
    if not all(present):
        return ""

    tsa1 = df["TUMOR_SEQ_ALLELE1"]
    matches_ref = all(tsa1 == df["REFERENCE_ALLELE"])
    matches_tsa2 = all(tsa1 == df["TUMOR_SEQ_ALLELE2"])
    if matches_ref or matches_tsa2:
        return ""

    return (
        "maf: Contains both "
        "TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. "
        "All values in TUMOR_SEQ_ALLELE1 must match all values in "
        "REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n"
    )

Example #10

Show file

File: vcf.py Project: Sage-Bionetworks/Genie

    def _validate(self, vcfdf):
        """
        Validate the content of a vcf file.

        Args:
            vcfdf: pandas dataframe containing vcf content

        Returns:
            total_error - error messages
            warning - warning messages
        """
        errors = ""
        warnings_text = ""
        required = pd.Series(
            ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"])

        if required.isin(vcfdf.columns).all():
            # Variants are identified by chromosome, position and alleles
            if vcfdf.duplicated(["#CHROM", "POS", "REF", "ALT"]).any():
                errors += "vcf: Must not have duplicate variants.\n"

            if vcfdf[["#CHROM", "POS"]].isnull().values.any():
                errors += ("vcf: May contain rows that are "
                           "space delimited instead of tab delimited.\n")
        else:
            errors += ("vcf: Must have these headers: "
                       "CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO.\n")

        # More than the 8 fixed columns means FORMAT must be present
        if len(vcfdf.columns) > 8 and "FORMAT" not in vcfdf.columns:
            errors += ("vcf: Must have FORMAT header "
                       "if genotype columns exist.\n")

        # Variants must be reported against GRCh37 / hg19 without the
        # chr-prefix; variants on chrM are not supported.
        if process_functions.checkColExist(vcfdf, "#CHROM"):
            chromosomes = vcfdf["#CHROM"]
            if any("chr" in value for value in chromosomes
                   if isinstance(value, str)):
                warnings_text += ("vcf: Should not have the chr prefix "
                                  "in front of chromosomes.\n")
            if chromosomes.isin(["chrM"]).any():
                errors += "vcf: Must not have variants on chrM.\n"

        # No white spaces
        has_whitespace = vcfdf.apply(lambda row: contains_whitespace(row),
                                     axis=1)
        if sum(has_whitespace) > 0:
            warnings_text += ("vcf: Should not have any "
                              "white spaces in any of the columns.\n")

        # A `bcftools query` command could also parse the VCF in detail
        # and report warnings or errors when the format is not adhered to.
        return errors, warnings_text

Example #11

Show file

    def _validate(self, vitalStatusDf):
        """
        Validate the vital status dataframe.

        Checks, in order: PATIENT_ID (no nulls), YEAR_DEATH and
        YEAR_CONTACT (YYYY integers or null), INT_CONTACT and INT_DOD
        (integers, null, '>32485' or '<6570') and DEAD (booleans or
        null). Every column is required.

        Args:
            vitalStatusDf: Vital status dataframe

        Returns:
            tuple: (error text, warning text); no warnings are produced
        """
        total_error = ""
        warning = ""
        redacted_intervals = ['>32485', '<6570']

        # PATIENT ID
        if process_functions.checkColExist(vitalStatusDf, "PATIENT_ID"):
            if vitalStatusDf["PATIENT_ID"].isnull().any():
                total_error += (
                    "Vital status file: Please double check your "
                    "PATIENT_ID column. No null values allowed.\n")
        else:
            total_error += "Vital status file: Must have PATIENT_ID column.\n"

        # YEAR DEATH / YEAR CONTACT
        for year_col in ("YEAR_DEATH", "YEAR_CONTACT"):
            if not process_functions.checkColExist(vitalStatusDf, year_col):
                total_error += (
                    "Vital status file: Must have %s column.\n" % year_col)
                continue
            years = vitalStatusDf[year_col]
            not_null_years = years[~years.isnull()]
            try:
                not_null_years.apply(
                    lambda x: datetime.datetime.strptime(str(int(x)), '%Y'))
            except Exception:
                # Was a bare `except:`, which also swallowed
                # KeyboardInterrupt and SystemExit.
                total_error += (
                    "Vital status file: Please double check your %s "
                    "column, it must be an integer in YYYY format or an "
                    "empty string.\n" % year_col)

        # INT CONTACT / INT DOD
        for int_col in ("INT_CONTACT", "INT_DOD"):
            if not process_functions.checkColExist(vitalStatusDf, int_col):
                total_error += (
                    "Vital status file: Must have %s column.\n" % int_col)
                continue
            # Nulls and the redacted markers are allowed as-is
            to_check = [
                value for value in vitalStatusDf[int_col]
                if not pd.isnull(value) and value not in redacted_intervals
            ]
            if not all([process_functions.checkInt(value)
                        for value in to_check]):
                total_error += (
                    "Vital status file: Please double check your %s "
                    "column, it must be an integer, an empty string, "
                    ">32485, or <6570.\n" % int_col)

        # DEAD
        if process_functions.checkColExist(vitalStatusDf, "DEAD"):
            if not all(isinstance(value, bool)
                       for value in vitalStatusDf["DEAD"]
                       if not pd.isnull(value)):
                total_error += (
                    "Vital status file: Please double check your DEAD "
                    "column, it must be a boolean value or an empty "
                    "string.\n")
        else:
            total_error += "Vital status file: Must have DEAD column.\n"

        return (total_error, warning)

Example #12

Show file

File: clinical.py Project: Sage-Bionetworks/Genie

def _check_year(
    clinicaldf: DataFrame,
    year_col: str,
    filename: str,
    allowed_string_values: list = None,
) -> str:
    """Check a year column.

    Values listed in allowed_string_values are skipped; every other
    value must convert to an integer in YYYY format that is not later
    than the current UTC year.

    Args:
        clinicaldf: Clinical dataframe
        year_col: Name of the YEAR column
        filename: Name of file, used as the message prefix
        allowed_string_values: list of other allowed string values

    Returns:
        Error message, or an empty string when the column is valid
    """
    if allowed_string_values is None:
        allowed_string_values = []
    if not process_functions.checkColExist(clinicaldf, year_col):
        return f"{filename}: Must have {year_col} column.\n"

    # Drop pre-redacted values and other allowed strings first, because
    # values such as <YYYY can't go through int().
    year_series = clinicaldf[year_col][
        ~clinicaldf[year_col].isin(allowed_string_values)
    ]
    # Timezone-aware replacement for the deprecated datetime.utcnow()
    year_now = datetime.datetime.now(datetime.timezone.utc).year
    try:
        in_future = year_series.apply(
            lambda x: datetime.datetime.strptime(str(int(x)), "%Y").year > year_now
        )
        # The current year is fine, future years are not. This is an
        # explicit check rather than `assert`, which is stripped under
        # `python -O` and would let future years through.
        is_valid = not in_future.any()
    except Exception:
        # Any value that can't be parsed as a YYYY integer is invalid
        is_valid = False

    if is_valid:
        return ""

    error = (
        f"{filename}: Please double check your {year_col} "
        "column, it must be an integer in YYYY format "
        f"<= {year_now}"
    )
    # Tack on allowed string values
    if allowed_string_values:
        error += " or '{}'.\n".format("', '".join(allowed_string_values))
    else:
        error += ".\n"
    return error

Example #13

Show file

File: cna.py Project: sgosline/Genie

    def _validate(self, cnvDF, noSymbolCheck, testing=False):
        """
        Validate a CNA dataframe.

        Column names are upper-cased in place. HUGO_SYMBOL and
        ENTREZ_GENE_ID are removed from cnvDF so that only the value
        columns are checked; HUGO_SYMBOL is put back only when all
        values are valid.

        Args:
            cnvDF: CNA dataframe (modified in place)
            noSymbolCheck: When true, skip the gene symbol remapping check
            testing: Passed as `test` to process_functions.getDatabaseSynId

        Returns:
            tuple: (error text, warning text); no warnings are produced
        """
        total_error = ""
        warning = ""
        cnvDF.columns = [col.upper() for col in cnvDF.columns]

        if cnvDF.columns[0] != "HUGO_SYMBOL":
            total_error += "Your cnv file's first column must be Hugo_Symbol\n"
        haveColumn = process_functions.checkColExist(cnvDF, "HUGO_SYMBOL")
        if haveColumn:
            keepSymbols = cnvDF["HUGO_SYMBOL"]
            cnvDF.drop("HUGO_SYMBOL", axis=1, inplace=True)

        if process_functions.checkColExist(cnvDF, "ENTREZ_GENE_ID"):
            del cnvDF['ENTREZ_GENE_ID']

        # NOTE(review): the error message below lists -0.5 as allowed but
        # this set does not contain it -- confirm which one is intended.
        allowed_values = {'-2.0', '-2', '-1.5', '-1.0', '-1', '0.0', '0',
                          '0.5', '1.0', '1', '1.5', '2', '2.0', 'nan'}
        if not all(cnvDF.applymap(lambda x: str(x) in allowed_values).all()):
            total_error += "All values must be NA/blank, -2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, or 2.\n"
        elif haveColumn:
            # Only restore the symbols when they were actually removed;
            # previously a file without HUGO_SYMBOL raised NameError here.
            cnvDF['HUGO_SYMBOL'] = keepSymbols
            if not noSymbolCheck:
                bedSynId = process_functions.getDatabaseSynId(self.syn, "bed", test=testing)
                bed = self.syn.tableQuery("select Hugo_Symbol, ID from %s where CENTER = '%s'" % (bedSynId, self.center))
                bedDf = bed.asDataFrame()
                cnvDF['remapped'] = cnvDF['HUGO_SYMBOL'].apply(lambda x: validateSymbol(x, bedDf))
                cnvDF = cnvDF[~cnvDF['remapped'].isnull()]

                # Do not allow any duplicated genes after symbols have
                # been remapped
                is_duplicated = cnvDF['remapped'].duplicated(keep=False)
                if cnvDF['remapped'].duplicated().any():
                    total_error += "Your CNA file has duplicated Hugo_Symbols (After remapping of genes): %s -> %s.\n" % (",".join(cnvDF['HUGO_SYMBOL'][is_duplicated]), ",".join(cnvDF['remapped'][is_duplicated]))
        return (total_error, warning)

Example #14

Show file

    def _validate(self, fusionDF, noSymbolCheck, testing=False):
        """
        Validate a fusion dataframe.

        Column names are upper-cased in place and an empty COMMENTS
        column is added when missing.

        Args:
            fusionDF: Fusion dataframe
            noSymbolCheck: When true, skip the gene symbol check
            testing: Passed as `test` to process_functions.getDatabaseSynId

        Returns:
            tuple: (error text, warning text); no warnings are produced
        """
        errors = ""
        warning = ""

        # Frame: "in-frame" or "frameshift".
        # Fusion_Status (OPTIONAL): An assessment of the mutation type
        # (i.e., "SOMATIC", "GERMLINE", "UNKNOWN", or empty)

        fusionDF.columns = [name.upper() for name in fusionDF.columns]

        required = pd.Series([
            'HUGO_SYMBOL', 'ENTREZ_GENE_ID', 'CENTER', 'TUMOR_SAMPLE_BARCODE',
            'FUSION', 'DNA_SUPPORT', 'RNA_SUPPORT', 'METHOD', 'FRAME'
        ])
        if fusionDF.get("COMMENTS") is None:
            fusionDF['COMMENTS'] = float('nan')

        is_present = required.isin(fusionDF.columns)
        if not is_present.all():
            missing = ",".join(required[~is_present])
            errors += (
                "Your fusion file must at least have these headers: %s.\n"
                % missing)

        check_symbols = process_functions.checkColExist(
            fusionDF, "HUGO_SYMBOL") and not noSymbolCheck
        if check_symbols:
            bedSynId = process_functions.getDatabaseSynId(
                self.syn, "bed", test=testing)
            query = ("select Hugo_Symbol, ID from %s where CENTER = '%s'"
                     % (bedSynId, self.center))
            bedDf = self.syn.tableQuery(query).asDataFrame()
            # One row per distinct symbol is passed through validateSymbol
            deduplicated = fusionDF.drop_duplicates("HUGO_SYMBOL")
            fusionDF = deduplicated.apply(
                lambda row: validateSymbol(row, bedDf), axis=1)
            if fusionDF["HUGO_SYMBOL"].isnull().any():
                errors += (
                    "Your fusion file should not have any "
                    "NA/blank Hugo Symbols.\n")

        # Disabled value checks, kept for reference:
        # if process_functions.checkColExist(fusionDF, "DNA_SUPPORT"):
        #     if not fusionDF.DNA_SUPPORT.isin(["yes","no","unknown"]).all():
        #         total_error += "Your fusion file's DNA_SUPPORT column must be 'yes', 'no', or 'unknown'"

        # if process_functions.checkColExist(fusionDF, "RNA_SUPPORT"):
        #     if not fusionDF.RNA_SUPPORT.isin(["yes","no","unknown"]).all():
        #         total_error += "Your fusion file's RNA_SUPPORT column must be 'yes', 'no', or 'unknown'"

        # if process_functions.checkColExist(fusionDF, "FRAME"):
        #     if not fusionDF.FRAME.isin(["in-frame","frameshift"]).all():
        #         total_error += "Your fusion file's FRAME column must be 'in-frame', or 'frameshift'"

        return (errors, warning)

Example #15

Show file

File: clinical.py Project: Sage-Bionetworks/Genie

def _check_int_dead_consistency(clinicaldf: DataFrame) -> str:
    """Verify that the DEAD and INT_DOD columns do not contradict each other.

    Args:
        clinicaldf: Clinical Data Frame

    Returns:
        An error message when the two columns are inconsistent for at
        least one row, otherwise an empty string.
    """
    # A missing column is already reported elsewhere, so stay silent here.
    if not all(
        process_functions.checkColExist(clinicaldf, name)
        for name in ("INT_DOD", "DEAD")
    ):
        return ""

    dead_rows = clinicaldf["DEAD"].astype(str) == "True"
    alive_rows = clinicaldf["DEAD"].astype(str) == "False"
    placeholder_rows = clinicaldf["DEAD"].isin(
        ["Unknown", "Not Collected", "Not Applicable", "Not Released"]
    )

    # Rows whose DEAD value is a placeholder string must carry the very same
    # string in INT_DOD
    placeholders_agree = all(
        clinicaldf.loc[placeholder_rows, "DEAD"]
        == clinicaldf.loc[placeholder_rows, "INT_DOD"]
    )
    # A dead patient can't have 'Not Applicable' as the interval ...
    dead_marked_na = any(clinicaldf.loc[dead_rows, "INT_DOD"] == "Not Applicable")
    # ... while a living patient must have it
    alive_all_na = all(clinicaldf.loc[alive_rows, "INT_DOD"] == "Not Applicable")

    if placeholders_agree and alive_all_na and not dead_marked_na:
        return ""
    return (
        "Patient Clinical File: DEAD value is inconsistent with INT_DOD "
        "for at least one patient.\n"
    )

Example #16

Show file

    def _validate(self, mutationDF):
        """
        Validate a mutation (maf) dataframe against the mutation SOP.

        The dataframe passed in is modified: column names are upper-cased,
        string values of the primary key columns are stripped and the
        numerical columns are converted to float where possible.

        Args:
            mutationDF: mutation dataframe

        Returns:
            tuple: (errors, warnings) found in the mutation file, each as a
            single string that is empty when there is nothing to report
        """

        first_header = ["CHROMOSOME", "HUGO_SYMBOL", "TUMOR_SAMPLE_BARCODE"]
        SP = self._fileType == "mafSP"
        # T_REF_COUNT + T_ALT_COUNT = T_DEPTH
        if SP:
            correct_column_headers = [
                "CHROMOSOME",
                "START_POSITION",
                "REFERENCE_ALLELE",
                "TUMOR_SAMPLE_BARCODE",
                "TUMOR_SEQ_ALLELE2",
            ]
        else:
            correct_column_headers = [
                "CHROMOSOME",
                "START_POSITION",
                "REFERENCE_ALLELE",
                "TUMOR_SAMPLE_BARCODE",
                "T_ALT_COUNT",
                "TUMOR_SEQ_ALLELE2",
            ]
        optional_headers = ["T_REF_COUNT", "N_DEPTH", "N_REF_COUNT", "N_ALT_COUNT"]

        mutationDF.columns = [col.upper() for col in mutationDF.columns]

        total_error = StringIO()
        warning = StringIO()

        # CHECK: Everything in correct_column_headers must be in mutation file
        has_required_headers = all(
            process_functions.checkColExist(mutationDF, header)
            for header in correct_column_headers
        )
        if not has_required_headers:
            missing_required = [
                header
                for header in correct_column_headers
                if header not in mutationDF.columns.values
            ]
            # Adjacent string literals are concatenated verbatim, so each
            # fragment has to carry its own separating space.
            total_error.write(
                "maf: Must at least have these headers: {}. "
                "If you are writing your maf file with R, please make "
                "sure to specify the 'quote=FALSE' parameter.\n".format(
                    ",".join(missing_required)
                )
            )
        else:
            # CHECK: First column must be in the first_header list
            if mutationDF.columns[0] not in first_header:
                total_error.write(
                    "maf: First column header must be "
                    "one of these: {}.\n".format(", ".join(first_header))
                )
            # No duplicated values
            primary_cols = [
                "CHROMOSOME",
                "START_POSITION",
                "REFERENCE_ALLELE",
                "TUMOR_SAMPLE_BARCODE",
                "TUMOR_SEQ_ALLELE2",
            ]
            # Strip white space if string column
            for col in primary_cols:
                if mutationDF[col].dtype == object:
                    mutationDF[col] = mutationDF[col].str.strip()
            duplicated_idx = mutationDF.duplicated(primary_cols)

            if duplicated_idx.any():
                # Find samples with duplicated variants
                duplicated_variants = (
                    mutationDF["TUMOR_SAMPLE_BARCODE"][duplicated_idx]
                    .unique()
                    .astype(str)
                    .tolist()
                )
                total_error.write(
                    "maf: Must not have duplicated variants. "
                    "Samples with duplicated variants: "
                    f"{', '.join(duplicated_variants)}\n"
                )

        t_depth_exists = process_functions.checkColExist(mutationDF, "T_DEPTH")
        t_ref_exists = process_functions.checkColExist(mutationDF, "T_REF_COUNT")
        if not t_depth_exists and not t_ref_exists and not SP:
            total_error.write("maf: If missing T_DEPTH, must have T_REF_COUNT!\n")

        numerical_cols = [
            "T_DEPTH",
            "T_ALT_COUNT",
            "T_REF_COUNT",
            "N_DEPTH",
            "N_REF_COUNT",
            "N_ALT_COUNT",
        ]
        for col in numerical_cols:
            if not process_functions.checkColExist(mutationDF, col):
                continue
            # Since NA is an allowed value, when reading in the dataframe
            # the 'NA' string is not converted.  This will convert all
            # 'NA' values in the numerical columns into actual float('nan')
            mutationDF.loc[mutationDF[col] == "NA", col] = float("nan")
            # Attempt to convert column to float
            try:
                mutationDF[col] = mutationDF[col].astype(float)
            except ValueError:
                # Deliberately ignored: a failed conversion leaves the
                # column non-numeric, which the dtype check below reports.
                pass
            if mutationDF[col].dtype not in [int, float]:
                total_error.write(f"maf: {col} must be a numerical column.\n")

        # CHECK: Must have TUMOR_SEQ_ALLELE2
        error, warn = _check_allele_col(mutationDF, "TUMOR_SEQ_ALLELE2")
        total_error.write(error)
        warning.write(warn)

        # CHECK: Mutation file would benefit from columns in optional_headers
        has_optional_headers = all(
            process_functions.checkColExist(mutationDF, header)
            for header in optional_headers
        )
        if not has_optional_headers and not SP:
            missing_optional = [
                header
                for header in optional_headers
                if header not in mutationDF.columns.values
            ]
            warning.write(
                "maf: Does not have the column headers that can give extra "
                "information to the processed maf: {}.\n".format(
                    ", ".join(missing_optional)
                )
            )

        # CHECK: Must have REFERENCE_ALLELE
        error, warn = _check_allele_col(mutationDF, "REFERENCE_ALLELE")
        total_error.write(error)
        warning.write(warn)

        if process_functions.checkColExist(mutationDF, "CHROMOSOME"):
            # CHECK: Chromosome column can't have any values that start
            # with chr or have any WT values
            has_invalid_chromosome = any(
                str(value).startswith("chr") or str(value) == "WT"
                for value in mutationDF["CHROMOSOME"]
            )
            if has_invalid_chromosome:
                total_error.write(
                    "maf: CHROMOSOME column cannot have any values that "
                    "start with 'chr' or any 'WT' values.\n"
                )

        error = _check_tsa1_tsa2(mutationDF)
        total_error.write(error)

        return total_error.getvalue(), warning.getvalue()

Example #17

Show file

    def _validate(self, cnvDF, nosymbol_check, project_id):
        """
        Validate a CNV dataframe.

        The dataframe passed in is modified: column names are upper-cased
        and the ENTREZ_GENE_ID column, when present, is removed.

        Args:
            cnvDF: cnv dataframe
            nosymbol_check: When true, skip remapping the gene symbols
                            against the center's bed table
            project_id: Passed to get_synid_database_mappingdf to look up
                        the bed database

        Returns:
            tuple: (error message, warning message)
        """
        total_error = ""
        warning = ""
        cnvDF.columns = [col.upper() for col in cnvDF.columns]

        if cnvDF.columns[0] != "HUGO_SYMBOL":
            total_error += "Your cnv file's first column must be Hugo_Symbol\n"
        haveColumn = process_functions.checkColExist(cnvDF, "HUGO_SYMBOL")
        # Symbols are set aside so that only value columns are checked below
        keepSymbols = None
        if haveColumn:
            keepSymbols = cnvDF["HUGO_SYMBOL"]
            cnvDF.drop("HUGO_SYMBOL", axis=1, inplace=True)

        if process_functions.checkColExist(cnvDF, "ENTREZ_GENE_ID"):
            del cnvDF["ENTREZ_GENE_ID"]

        # NOTE(review): the error message below lists -0.5 as valid but it
        # is not part of allowed_values - confirm which one is intended.
        allowed_values = [
            "-2.0",
            "-2",
            "-1.5",
            "-1.0",
            "-1",
            "0.0",
            "0",
            "0.5",
            "1.0",
            "1",
            "1.5",
            "2",
            "2.0",
            "nan",
        ]
        if not all(cnvDF.applymap(lambda x: str(x) in allowed_values).all()):
            total_error += ("All values must be NA/blank, -2, -1.5, -1, -0.5, "
                            "0, 0.5, 1, 1.5, or 2.\n")
        elif haveColumn:
            # Only restore (and remap) the symbols when the column existed;
            # without this guard a file lacking HUGO_SYMBOL raised an
            # UnboundLocalError instead of returning the error collected
            # above.
            cnvDF["HUGO_SYMBOL"] = keepSymbols
            if not nosymbol_check:
                databaseToSynIdMappingDf = (
                    process_functions.get_synid_database_mappingdf(
                        self.syn, project_id))
                bedSynId = process_functions.getDatabaseSynId(
                    self.syn,
                    "bed",
                    databaseToSynIdMappingDf=databaseToSynIdMappingDf)
                bed = self.syn.tableQuery(
                    "select Hugo_Symbol, ID from {} where "
                    "CENTER = '{}'".format(bedSynId, self.center))
                bedDf = bed.asDataFrame()
                cnvDF["remapped"] = cnvDF["HUGO_SYMBOL"].apply(
                    lambda x: validateSymbol(x, bedDf))
                cnvDF = cnvDF[~cnvDF["remapped"].isnull()]

                # Do not allow any duplicated genes after symbols
                # have been remapped
                duplicated = cnvDF["remapped"].duplicated(keep=False)
                if duplicated.any():
                    total_error += (
                        "Your CNA file has duplicated Hugo_Symbols "
                        "(After remapping of genes): {} -> {}.\n".format(
                            ",".join(cnvDF["HUGO_SYMBOL"][duplicated]),
                            ",".join(cnvDF["remapped"][duplicated]),
                        ))
        return (total_error, warning)

Example #18

Show file

File: clinical.py Project: Sage-Bionetworks/Genie

    def _validate(self, clinicaldf, oncotree_link):
        """
        Validate the merged clinical file against the clinical SOP.

        Column names are upper-cased, completely empty rows are reported
        and dropped, and the remaining missing values are replaced with ""
        before the per-column checks run.

        Args:
            clinicaldf: Merged clinical file with patient and sample
                        information
            oncotree_link: Link to oncotree

        Returns:
            tuple: (error message, warning message), each a single string
            that is empty when there is nothing to report
        """
        total_error = StringIO()
        warning = StringIO()

        clinicaldf.columns = [col.upper() for col in clinicaldf.columns]
        # CHECK: for empty rows
        empty_rows = clinicaldf.isnull().values.all(axis=1)
        if empty_rows.any():
            total_error.write("Clinical file(s): No empty rows allowed.\n")
            # Remove completely empty rows to speed up processing
            clinicaldf = clinicaldf[~empty_rows]

        clinicaldf = clinicaldf.fillna("")

        oncotree_mapping_dict = process_functions.get_oncotree_code_mappings(
            oncotree_link
        )
        oncotree_mapping = pd.DataFrame(
            {"ONCOTREE_CODE": list(oncotree_mapping_dict.keys())}
        )

        sampletype_mapping = process_functions.getGenieMapping(self.syn, "syn7434273")

        ethnicity_mapping = process_functions.getGenieMapping(self.syn, "syn7434242")

        race_mapping = process_functions.getGenieMapping(self.syn, "syn7434236")

        sex_mapping = process_functions.getGenieMapping(self.syn, "syn7434222")

        # CHECK: SAMPLE_ID
        sample_id = "SAMPLE_ID"
        haveSampleColumn = process_functions.checkColExist(clinicaldf, sample_id)

        if not haveSampleColumn:
            total_error.write("Sample Clinical File: Must have SAMPLE_ID column.\n")
        else:
            if clinicaldf[sample_id].duplicated().any():
                total_error.write(
                    "Sample Clinical File: No duplicated SAMPLE_ID "
                    "allowed.\nIf there are no duplicated "
                    "SAMPLE_IDs, and both sample and patient files are "
                    "uploaded, then please check to make sure no duplicated "
                    "PATIENT_IDs exist in the patient clinical file.\n"
                )
        # CHECK: PATIENT_ID
        patientId = "PATIENT_ID"
        # #CHECK: PATIENT_ID IN SAMPLE FILE
        havePatientColumn = process_functions.checkColExist(clinicaldf, patientId)

        if not havePatientColumn:
            total_error.write("Patient Clinical File: Must have PATIENT_ID column.\n")

        # CHECK: within the sample file that the sample ids match
        # the patient ids
        if haveSampleColumn and havePatientColumn:
            # Make sure sample and patient ids are string cols
            clinicaldf[sample_id] = clinicaldf[sample_id].astype(str)
            clinicaldf[patientId] = clinicaldf[patientId].astype(str)
            if not all(
                patient in sample
                for sample, patient in zip(
                    clinicaldf[sample_id], clinicaldf[patientId]
                )
            ):
                total_error.write(
                    "Sample Clinical File: PATIENT_ID's must be contained in "
                    "the SAMPLE_ID's (ex. SAGE-1 <-> SAGE-1-2)\n"
                )
            # #CHECK: All samples must have associated patient data
            # (GENIE requires patient data)
            if not all(clinicaldf[patientId] != ""):
                total_error.write(
                    "Patient Clinical File: All samples must have associated "
                    "patient information and no null patient ids allowed. "
                    "These samples are missing patient data: {}\n".format(
                        ", ".join(
                            clinicaldf[sample_id][clinicaldf[patientId] == ""].unique()
                        )
                    )
                )

            # CHECK: All patients should have associated sample data
            if not all(clinicaldf[sample_id] != ""):
                # ## MAKE WARNING FOR NOW###
                warning.write(
                    "Sample Clinical File: All patients must have associated "
                    "sample information. These patients are missing sample "
                    "data: {}\n".format(
                        ", ".join(
                            clinicaldf[patientId][clinicaldf[sample_id] == ""].unique()
                        )
                    )
                )

        # CHECK: AGE_AT_SEQ_REPORT
        age = "AGE_AT_SEQ_REPORT"
        haveColumn = process_functions.checkColExist(clinicaldf, age)
        if haveColumn:
            # Deal with HIPAA converted rows from DFCI
            # First for loop can't int(text) because there
            # are instances that have <3435
            age_seq_report_df = clinicaldf[
                ~clinicaldf[age].isin(["Unknown", ">32485", "<6570"])
            ]

            if not all(process_functions.checkInt(i) for i in age_seq_report_df[age]):
                total_error.write(
                    "Sample Clinical File: Please double check your "
                    "AGE_AT_SEQ_REPORT. It must be an integer, 'Unknown', "
                    "'>32485', '<6570'.\n"
                )
            else:
                # Convert on the fly rather than assigning back into the
                # filtered slice, which triggers a SettingWithCopyWarning.
                median_age = age_seq_report_df[age].astype(int).median()
                if median_age < 100:
                    total_error.write(
                        "Sample Clinical File: Please double check your "
                        "AGE_AT_SEQ_REPORT. You may be reporting this value "
                        "in YEARS, please report in DAYS.\n"
                    )
        else:
            total_error.write(
                "Sample Clinical File: Must have AGE_AT_SEQ_REPORT column.\n"
            )

        # CHECK: ONCOTREE_CODE
        haveColumn = process_functions.checkColExist(clinicaldf, "ONCOTREE_CODE")
        maleOncoCodes = ["TESTIS", "PROSTATE", "PENIS"]
        womenOncoCodes = ["CERVIX", "VULVA", "UTERUS", "OVARY"]
        if haveColumn:
            # Make oncotree codes uppercase (SpCC/SPCC)
            clinicaldf["ONCOTREE_CODE"] = (
                clinicaldf["ONCOTREE_CODE"].astype(str).str.upper()
            )

            oncotree_codes = clinicaldf["ONCOTREE_CODE"][
                clinicaldf["ONCOTREE_CODE"] != "UNKNOWN"
            ]

            if not all(oncotree_codes.isin(oncotree_mapping["ONCOTREE_CODE"])):
                unmapped_oncotrees = oncotree_codes[
                    ~oncotree_codes.isin(oncotree_mapping["ONCOTREE_CODE"])
                ]
                total_error.write(
                    "Sample Clinical File: Please double check that all your "
                    "ONCOTREE CODES exist in the mapping. You have {} samples "
                    "that don't map. These are the codes that "
                    "don't map: {}\n".format(
                        len(unmapped_oncotrees),
                        ",".join(set(unmapped_oncotrees)),
                    )
                )
            # Should add the SEX mismatch into the dashboard file
            if (
                process_functions.checkColExist(clinicaldf, "SEX")
                and havePatientColumn
                and haveSampleColumn
            ):
                # SEX of the first row seen for every patient, built once so
                # the loop below does not rescan the dataframe for each row
                sex_by_patient = {}
                for patient, patient_sex in zip(
                    clinicaldf["PATIENT_ID"], clinicaldf["SEX"]
                ):
                    sex_by_patient.setdefault(patient, patient_sex)

                wrongCodeSamples = []
                # This is to check if oncotree codes match the sex,
                # returns list of samples that have conflicting codes and sex
                for code, patient, sample in zip(
                    clinicaldf["ONCOTREE_CODE"],
                    clinicaldf["PATIENT_ID"],
                    clinicaldf["SAMPLE_ID"],
                ):
                    # Codes missing from the mapping were reported above
                    if oncotree_mapping_dict.get(code) is None:
                        continue

                    primaryCode = oncotree_mapping_dict[code]["ONCOTREE_PRIMARY_NODE"]

                    sex = sex_by_patient[patient]
                    # Blank SEX becomes NaN, which matches neither 1.0 nor 2.0
                    sex = float("nan") if sex == "" else float(sex)
                    if primaryCode in maleOncoCodes and sex != 1.0:
                        wrongCodeSamples.append(sample)
                    if primaryCode in womenOncoCodes and sex != 2.0:
                        wrongCodeSamples.append(sample)
                if len(wrongCodeSamples) > 0:
                    warning.write(
                        "Sample Clinical File: Some SAMPLE_IDs have "
                        "conflicting SEX and ONCOTREE_CODES: {}\n".format(
                            ",".join(wrongCodeSamples)
                        )
                    )
        else:
            total_error.write("Sample Clinical File: Must have ONCOTREE_CODE column.\n")

        # CHECK: SAMPLE_TYPE
        # NOTE(review): unlike the other check_col_and_values calls, the
        # warning returned here is not written out - confirm it is always
        # empty when required=True.
        warn, error = process_functions.check_col_and_values(
            clinicaldf,
            "SAMPLE_TYPE",
            sampletype_mapping["CODE"].tolist(),
            "Sample Clinical File",
            required=True,
        )
        total_error.write(error)

        # CHECK: SEQ_ASSAY_ID
        haveColumn = process_functions.checkColExist(clinicaldf, "SEQ_ASSAY_ID")
        if haveColumn:
            if not all(i != "" for i in clinicaldf["SEQ_ASSAY_ID"]):
                total_error.write(
                    "Sample Clinical File: Please double check your "
                    "SEQ_ASSAY_ID columns, there are empty rows.\n"
                )
            # must remove empty seq assay ids first
            # Checking if seq assay ids start with the center name
            nonempty_seq_idx = clinicaldf.SEQ_ASSAY_ID != ""
            seqassay_ids = clinicaldf.SEQ_ASSAY_ID[nonempty_seq_idx]
            uniq_seqassay_ids = seqassay_ids.unique()
            # SEQ Ids are all capitalized now, so no need to check
            # for differences in case
            invalid_seqassay = [
                seqassay
                for seqassay in uniq_seqassay_ids
                if not seqassay.upper().startswith(self.center)
            ]
            if invalid_seqassay:
                total_error.write(
                    "Sample Clinical File: Please make sure your "
                    "SEQ_ASSAY_IDs start with your center "
                    "abbreviation: {}.\n".format(", ".join(invalid_seqassay))
                )
        else:
            total_error.write("Sample Clinical File: Must have SEQ_ASSAY_ID column.\n")

        # CHECK: SEQ_DATE
        haveColumn = process_functions.checkColExist(clinicaldf, "SEQ_DATE")
        seq_date_error = (
            "Sample Clinical File: SEQ_DATE must be one of five values- "
            "For Jan-March: use Jan-YEAR. "
            "For Apr-June: use Apr-YEAR. "
            "For July-Sep: use Jul-YEAR. "
            "For Oct-Dec: use Oct-YEAR. (ie. Apr-2017) "
            "For values that don't have SEQ_DATES that "
            "you want released use 'release'.\n"
        )

        if haveColumn:
            clinicaldf["SEQ_DATE"] = [
                i.title() for i in clinicaldf["SEQ_DATE"].astype(str)
            ]

            seqdate = clinicaldf["SEQ_DATE"][clinicaldf["SEQ_DATE"] != "Release"]
            if sum(clinicaldf["SEQ_DATE"] == "") > 0:
                total_error.write(
                    "Sample Clinical File: Samples without SEQ_DATEs will "
                    "NOT be released.\n"
                )
            try:
                if not seqdate.empty:
                    # strptime raises ValueError for anything that is not
                    # formatted like 'Apr-2017'
                    seqdate.apply(
                        lambda date: datetime.datetime.strptime(date, "%b-%Y")
                    )
                    if not seqdate.str.startswith(("Jan", "Apr", "Jul", "Oct")).all():
                        total_error.write(seq_date_error)
            except ValueError:
                total_error.write(seq_date_error)
        else:
            total_error.write("Sample Clinical File: Must have SEQ_DATE column.\n")

        # CHECK: BIRTH_YEAR
        error = _check_year(
            clinicaldf=clinicaldf,
            year_col="BIRTH_YEAR",
            filename="Patient Clinical File",
            allowed_string_values=["Unknown", ">89", "<18"],
        )
        total_error.write(error)

        # CHECK: YEAR DEATH
        error = _check_year(
            clinicaldf=clinicaldf,
            year_col="YEAR_DEATH",
            filename="Patient Clinical File",
            allowed_string_values=[
                "Unknown",
                "Not Collected",
                "Not Applicable",
                "Not Released",
                ">89",
                "<18",
            ],
        )
        total_error.write(error)

        # CHECK: YEAR CONTACT
        error = _check_year(
            clinicaldf=clinicaldf,
            year_col="YEAR_CONTACT",
            filename="Patient Clinical File",
            allowed_string_values=[
                "Unknown",
                "Not Collected",
                "Not Released",
                ">89",
                "<18",
            ],
        )
        total_error.write(error)

        # CHECK: INT CONTACT
        haveColumn = process_functions.checkColExist(clinicaldf, "INT_CONTACT")
        if haveColumn:
            # Every value outside of the allowed strings must be an integer
            if not all(
                process_functions.checkInt(i)
                for i in clinicaldf.INT_CONTACT
                if i
                not in [
                    ">32485",
                    "<6570",
                    "Unknown",
                    "Not Collected",
                    "Not Released",
                ]
            ):
                total_error.write(
                    "Patient Clinical File: Please double check your "
                    "INT_CONTACT column, it must be an integer, '>32485', "
                    "'<6570', 'Unknown', 'Not Released' or 'Not Collected'.\n"
                )
        else:
            total_error.write("Patient Clinical File: Must have INT_CONTACT column.\n")

        # INT DOD
        haveColumn = process_functions.checkColExist(clinicaldf, "INT_DOD")
        if haveColumn:
            # Every value outside of the allowed strings must be an integer
            if not all(
                process_functions.checkInt(i)
                for i in clinicaldf.INT_DOD
                if i
                not in [
                    ">32485",
                    "<6570",
                    "Unknown",
                    "Not Collected",
                    "Not Applicable",
                    "Not Released",
                ]
            ):
                total_error.write(
                    "Patient Clinical File: Please double check your INT_DOD "
                    "column, it must be an integer, '>32485', '<6570', "
                    "'Unknown', 'Not Collected', 'Not Released' or "
                    "'Not Applicable'.\n"
                )
        else:
            total_error.write("Patient Clinical File: Must have INT_DOD column.\n")

        # CHECK: DEAD
        haveColumn = process_functions.checkColExist(clinicaldf, "DEAD")
        if haveColumn:
            # Need to have check_bool function
            if not all(
                str(i).upper() in ["TRUE", "FALSE"]
                for i in clinicaldf.DEAD
                if i not in ["Unknown", "Not Collected", "Not Released"]
            ):
                total_error.write(
                    "Patient Clinical File: Please double check your "
                    "DEAD column, it must be True, False, 'Unknown', "
                    "'Not Released' or 'Not Collected'.\n"
                )
        else:
            total_error.write("Patient Clinical File: Must have DEAD column.\n")
        # CHECK: contact vital status value consistency
        contact_error = _check_int_year_consistency(
            clinicaldf=clinicaldf,
            cols=["YEAR_CONTACT", "INT_CONTACT"],
            string_vals=["Not Collected", "Unknown", "Not Released"],
        )
        total_error.write(contact_error)

        # CHECK: death vital status value consistency
        death_error = _check_int_year_consistency(
            clinicaldf=clinicaldf,
            cols=["YEAR_DEATH", "INT_DOD"],
            string_vals=[
                "Not Collected",
                "Unknown",
                "Not Applicable",
                "Not Released",
            ],
        )
        total_error.write(death_error)
        death_error = _check_int_dead_consistency(clinicaldf=clinicaldf)
        total_error.write(death_error)

        # CHECK: SAMPLE_CLASS is optional attribute
        have_column = process_functions.checkColExist(clinicaldf, "SAMPLE_CLASS")
        if have_column:
            sample_class_vals = pd.Series(clinicaldf["SAMPLE_CLASS"].unique().tolist())
            if not sample_class_vals.isin(["Tumor", "cfDNA"]).all():
                total_error.write(
                    "Sample Clinical File: SAMPLE_CLASS column must "
                    "be 'Tumor', or 'cfDNA'\n"
                )

        # CHECK: PRIMARY_RACE
        warn, error = process_functions.check_col_and_values(
            clinicaldf,
            "PRIMARY_RACE",
            race_mapping["CODE"].tolist(),
            "Patient Clinical File",
        )
        warning.write(warn)
        total_error.write(error)

        # CHECK: SECONDARY_RACE
        warn, error = process_functions.check_col_and_values(
            clinicaldf,
            "SECONDARY_RACE",
            race_mapping["CODE"].tolist(),
            "Patient Clinical File",
        )
        warning.write(warn)
        total_error.write(error)

        # CHECK: TERTIARY_RACE
        warn, error = process_functions.check_col_and_values(
            clinicaldf,
            "TERTIARY_RACE",
            race_mapping["CODE"].tolist(),
            "Patient Clinical File",
        )
        warning.write(warn)
        total_error.write(error)

        # CHECK: SEX
        warn, error = process_functions.check_col_and_values(
            clinicaldf,
            "SEX",
            sex_mapping["CODE"].tolist(),
            "Patient Clinical File",
            required=True,
        )
        warning.write(warn)
        total_error.write(error)

        # CHECK: ETHNICITY
        warn, error = process_functions.check_col_and_values(
            clinicaldf,
            "ETHNICITY",
            ethnicity_mapping["CODE"].tolist(),
            "Patient Clinical File",
        )
        warning.write(warn)
        total_error.write(error)

        return total_error.getvalue(), warning.getvalue()

Example #19

Show file

    def _validate(self, assay_info_df, project_id):
        """
        Validates the values of assay information file

        Every check appends to an error or warning string; nothing is
        raised for invalid content.

        Args:
            assay_info_df: assay information dataframe
            project_id: forwarded to
                process_functions.get_synid_database_mappingdf to locate
                the "sample" database table

        Returns:
            tuple: error and warning
        """

        total_error = ""
        warning = ""

        if process_functions.checkColExist(assay_info_df, "SEQ_ASSAY_ID"):
            # Compare on a normalized form: underscores become dashes and
            # everything is upper-cased.
            all_seq_assays = (assay_info_df.SEQ_ASSAY_ID.replace(
                {
                    "_": "-"
                }, regex=True).str.upper().unique())
            if not all(
                    assay.startswith(self.center)
                    for assay in all_seq_assays):
                total_error += (
                    "Assay_information.yaml: Please make sure all your "
                    "SEQ_ASSAY_IDs start with your center abbreviation.\n")
            db_to_syn_map_df = process_functions.get_synid_database_mappingdf(
                self.syn, project_id)
            sample_synid = process_functions.getDatabaseSynId(
                self.syn, "sample", databaseToSynIdMappingDf=db_to_syn_map_df)
            uniq_seq_df = process_functions.get_syntabledf(
                self.syn,
                f"select distinct(SEQ_ASSAY_ID) as seq from {sample_synid} "
                f"where CENTER = '{self.center}'",
            )
            # These are all the SEQ_ASSAY_IDs that are in the clinical database
            # but not in the assay_information file
            normalized_db_seqs = uniq_seq_df["seq"].replace({
                "_": "-"
            }, regex=True).str.upper()
            missing_seqs = uniq_seq_df["seq"][
                ~normalized_db_seqs.isin(all_seq_assays)]
            if not missing_seqs.empty:
                # str() keeps a non-string database value (e.g. NaN) from
                # breaking the join.
                missing_seqs_str = ", ".join(map(str, missing_seqs))
                total_error += (
                    "Assay_information.yaml: You are missing SEQ_ASSAY_IDs: "
                    f"{missing_seqs_str}\n")

        else:
            total_error += "Assay_information.yaml: Must have SEQ_ASSAY_ID column.\n"

        read_group_dict = process_functions.get_gdc_data_dictionary(
            "read_group")
        read_group_headers = read_group_dict["properties"]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "is_paired_end",
            [True, False],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "library_selection",
            read_group_headers["library_selection"]["enum"],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "library_strategy",
            read_group_headers["library_strategy"]["enum"],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "platform",
            read_group_headers["platform"]["enum"],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        # Build a new list instead of extending the enum in place, so the
        # structure returned by get_gdc_data_dictionary is left untouched.
        instrument_model = [
            *read_group_headers["instrument_model"]["enum"],
            "Illumina NovaSeq 6000",
            None,
        ]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "instrument_model",
            instrument_model,
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        # Only the presence of target_capture_kit is checked; its values
        # are not compared against the GDC enum.
        if not process_functions.checkColExist(assay_info_df,
                                               "target_capture_kit"):
            total_error += ("Assay_information.yaml: "
                            "Must have target_capture_kit column.\n")

        variant_classes = [
            "Splice_Site",
            "Nonsense_Mutation",
            "Frame_Shift_Del",
            "Frame_Shift_Ins",
            "Nonstop_Mutation",
            "Translation_Start_Site",
            "In_Frame_Ins",
            "In_Frame_Del",
            "Missense_Mutation",
            "Intron",
            "Splice_Region",
            "Silent",
            "RNA",
            "5'UTR",
            "3'UTR",
            "IGR",
            "5'Flank",
            "3'Flank",
            None,
        ]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "variant_classifications",
            variant_classes,
            filename="Assay_information.yaml",
            na_allowed=True,
            sep=";",
        )
        warning += warn
        total_error += error

        if process_functions.checkColExist(assay_info_df, "read_length"):
            # Null entries are allowed and skipped.
            if not all(
                    process_functions.checkInt(i)
                    for i in assay_info_df["read_length"]
                    if i is not None and not pd.isnull(i)):
                total_error += ("Assay_information.yaml: "
                                "Please double check your read_length.  "
                                "It must be an integer or null.\n")
        else:
            total_error += "Assay_information.yaml: " "Must have read_length column.\n"

        if process_functions.checkColExist(assay_info_df, "number_of_genes"):
            if not all(
                    process_functions.checkInt(i)
                    for i in assay_info_df["number_of_genes"]):
                total_error += ("Assay_information.yaml: "
                                "Please double check your number_of_genes. "
                                "It must be an integer.\n")
        else:
            total_error += ("Assay_information.yaml: "
                            "Must have number_of_genes column.\n")

        if process_functions.checkColExist(assay_info_df, "gene_padding"):
            # Null entries are allowed and skipped.
            if not all(
                    process_functions.checkInt(i)
                    for i in assay_info_df["gene_padding"]
                    if i is not None and not pd.isnull(i)):
                total_error += ("Assay_information.yaml: "
                                "Please double check your gene_padding. "
                                "It must be an integer or blank.\n")
        else:
            warning += ("Assay_information.yaml: "
                        "gene_padding is by default 10 if not specified.\n")

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "calling_strategy",
            ["tumor_only", "tumor_normal", "plasma_normal"],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        if process_functions.checkColExist(assay_info_df,
                                           "specimen_tumor_cellularity"):
            # The isinstance guard turns a blank or numeric cell into a
            # validation error rather than an AttributeError.
            if not all(
                    isinstance(i, str) and i.startswith(">")
                    and i.endswith("%")
                    for i in assay_info_df["specimen_tumor_cellularity"]):
                total_error += (
                    "Assay_information.yaml: "
                    "Please double check your specimen_tumor_cellularity. "
                    "It must in this format >(num)%. ie. >10%\n")
        else:
            total_error += ("Assay_information.yaml: "
                            "Must have specimen_tumor_cellularity column.\n")

        alteration_types = [
            "snv",
            "small_indels",
            "gene_level_cna",
            "intragenic_cna",
            "structural_variants",
        ]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "alteration_types",
            alteration_types,
            filename="Assay_information.yaml",
            required=True,
            sep=";",
        )
        warning += warn
        total_error += error

        preservation_technique = ["FFPE", "fresh_frozen", "NA"]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "preservation_technique",
            preservation_technique,
            filename="Assay_information.yaml",
            required=True,
            sep=";",
        )
        warning += warn
        total_error += error

        coverage = ["hotspot_regions", "coding_exons", "introns", "promoters"]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "coverage",
            coverage,
            filename="Assay_information.yaml",
            required=True,
            sep=";",
        )
        warning += warn
        total_error += error

        return total_error, warning

Example #20

Show file

    def _validate(self, assay_info_df):
        '''
        Validates the values of assay information file

        Every check appends to an error or warning string; nothing is
        raised for invalid content.

        Args:
            assay_info_df: assay information dataframe

        Returns:
            tuple: error and warning
        '''

        total_error = ""
        warning = ""

        if process_functions.checkColExist(assay_info_df, "SEQ_ASSAY_ID"):
            all_seq_assays = assay_info_df.SEQ_ASSAY_ID.unique()
            if not all(
                    assay.startswith(self.center)
                    for assay in all_seq_assays):
                total_error += \
                    "Assay_information.yaml: Please make sure your all your" +\
                    " SEQ_ASSAY_IDs start with your center abbreviation.\n"
        else:
            total_error += \
                "Assay_information.yaml: Must have SEQ_ASSAY_ID column.\n"

        read_group_dict = process_functions.get_gdc_data_dictionary(
            "read_group")
        read_group_headers = read_group_dict['properties']

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            'is_paired_end', [True, False],
            filename="Assay_information.yaml",
            required=True)
        warning += warn
        total_error += error

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            'library_selection',
            read_group_headers['library_selection']['enum'],
            filename="Assay_information.yaml",
            required=True)
        warning += warn
        total_error += error

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            'library_strategy',
            read_group_headers['library_strategy']['enum'],
            filename="Assay_information.yaml",
            required=True)
        warning += warn
        total_error += error

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            'platform',
            read_group_headers['platform']['enum'],
            filename="Assay_information.yaml",
            required=True)
        warning += warn
        total_error += error

        # Build a new list instead of appending to the enum in place, so the
        # structure returned by get_gdc_data_dictionary is left untouched.
        instrument_model = [
            *read_group_headers['instrument_model']['enum'], None
        ]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            'instrument_model',
            instrument_model,
            filename="Assay_information.yaml",
            required=True)
        warning += warn
        total_error += error

        variant_classes = \
            ['Splice_Site', 'Nonsense_Mutation', 'Frame_Shift_Del',
             'Frame_Shift_Ins', 'Nonstop_Mutation', 'Translation_Start_Site',
             'In_Frame_Ins', 'In_Frame_Del', 'Missense_Mutation',
             'Intron', 'Splice_Region', 'Silent', 'RNA', "5'UTR", "3'UTR",
             'IGR', "5'Flank", "3'Flank", None]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            'variant_classifications',
            variant_classes,
            filename="Assay_information.yaml",
            na_allowed=True)
        warning += warn
        total_error += error

        # NOTE(review): a required-column check for target_capture_kit was
        # disabled here; confirm whether it should be enforced.

        if process_functions.checkColExist(assay_info_df, "read_length"):
            # Null entries are allowed and skipped.
            if not all(
                    process_functions.checkInt(i)
                    for i in assay_info_df["read_length"]
                    if i is not None and not pd.isnull(i)):
                total_error += \
                    ("Assay_information.yaml: "
                     "Please double check your read_length.  "
                     "It must be an integer or null.\n")
        else:
            total_error += \
                ("Assay_information.yaml: "
                 "Must have read_length column.\n")

        if process_functions.checkColExist(assay_info_df, "number_of_genes"):
            if not all(
                    process_functions.checkInt(i)
                    for i in assay_info_df["number_of_genes"]):
                total_error += \
                    ("Assay_information.yaml: "
                     "Please double check your number_of_genes. "
                     "It must be an integer.\n")
        else:
            total_error += \
                ("Assay_information.yaml: "
                 "Must have number_of_genes column.\n")

        if process_functions.checkColExist(assay_info_df, "gene_padding"):
            # Null entries are allowed and skipped.
            if not all(
                    process_functions.checkInt(i)
                    for i in assay_info_df["gene_padding"]
                    if i is not None and not pd.isnull(i)):
                total_error += \
                    ("Assay_information.yaml: "
                     "Please double check your gene_padding. "
                     "It must be an integer or blank.\n")
        else:
            warning += \
                ("Assay_information.yaml: "
                 "gene_padding is by default 10 if not specified.\n")

        return (total_error, warning)

Example #21

Show file

    def _validate(self, clinicalDF, oncotreeLink):
        """
        This function validates the clinical file to make sure it adheres
        to the clinical SOP.

        Column names of the passed dataframe are upper-cased in place; all
        other work happens on a copy whose missing values are replaced
        with empty strings.

        Args:
            clinicalDF: Merged clinical file with patient and sample
                        information
            oncotreeLink: Link to oncotree

        Returns:
            tuple: error message and warning message
        """
        total_error = ""
        warning = ""

        clinicalDF.columns = [col.upper() for col in clinicalDF.columns]
        clinicalDF = clinicalDF.fillna("")

        oncotree_mapping_dict = \
            process_functions.get_oncotree_code_mappings(oncotreeLink)
        valid_oncotree_codes = list(oncotree_mapping_dict.keys())

        sampleType_mapping = \
            process_functions.getGenieMapping(self.syn, "syn7434273")

        ethnicity_mapping = \
            process_functions.getGenieMapping(self.syn, "syn7434242")

        race_mapping = \
            process_functions.getGenieMapping(self.syn, "syn7434236")

        sex_mapping = \
            process_functions.getGenieMapping(self.syn, "syn7434222")

        current_year = datetime.datetime.now(datetime.timezone.utc).year
        # Exceptions int() / strptime() raise for a value that is not a year
        bad_year_errors = (TypeError, ValueError, OverflowError)

        def _parse_year(value):
            # Raises when value is not an integer in YYYY format
            return datetime.datetime.strptime(str(int(value)), '%Y')

        # CHECK: SAMPLE_ID
        sampleId = 'SAMPLE_ID'
        haveSampleColumn = \
            process_functions.checkColExist(clinicalDF, sampleId)

        if not haveSampleColumn:
            total_error += \
                "Sample Clinical File: Must have SAMPLE_ID column.\n"
        elif clinicalDF[sampleId].duplicated().any():
            total_error += (
                "Sample Clinical File: No duplicated SAMPLE_ID "
                "allowed.\nIf there are no duplicated "
                "SAMPLE_IDs, and both sample and patient files are "
                "uploaded, then please check to make sure no duplicated "
                "PATIENT_IDs exist in the patient clinical file.\n")

        # CHECK: PATIENT_ID
        patientId = "PATIENT_ID"
        havePatientColumn = \
            process_functions.checkColExist(clinicalDF, patientId)

        if not havePatientColumn:
            total_error += \
                "Patient Clinical File: Must have PATIENT_ID column.\n"

        # CHECK: within the sample file that the sample ids match
        # the patient ids
        if haveSampleColumn and havePatientColumn:
            # Make sure sample and patient ids are string cols
            clinicalDF[sampleId] = clinicalDF[sampleId].astype(str)
            clinicalDF[patientId] = clinicalDF[patientId].astype(str)
            if not all(
                    patient in sample for sample, patient in zip(
                        clinicalDF[sampleId], clinicalDF[patientId])):
                total_error += (
                    "Sample Clinical File: PATIENT_ID's much be contained in "
                    "the SAMPLE_ID's (ex. SAGE-1 <-> SAGE-1-2)\n")
            # CHECK: All samples must have associated patient data
            # (GENIE requires patient data)
            missing_patient = clinicalDF[patientId] == ""
            if missing_patient.any():
                total_error += (
                    "Patient Clinical File: All samples must have associated "
                    "patient information and no null patient ids allowed. "
                    "These samples are missing patient data: {}\n".format(
                        ", ".join(clinicalDF[sampleId][missing_patient]))
                )
            # CHECK: All patients should have associated sample data
            missing_sample = clinicalDF[sampleId] == ""
            if missing_sample.any():
                # Reported as a warning for now rather than an error
                warning += (
                    "Sample Clinical File: All patients must have associated "
                    "sample information. These patients are missing sample "
                    "data: {}\n".format(", ".join(
                        clinicalDF[patientId][missing_sample])))

        # CHECK: AGE_AT_SEQ_REPORT
        age = "AGE_AT_SEQ_REPORT"
        haveColumn = process_functions.checkColExist(clinicalDF, age)
        if haveColumn:
            # Deal with HIPAA converted rows from DFCI: values such as
            # <3435 can't be passed to int() directly.
            # Work on the filtered Series rather than assigning into a
            # filtered frame (avoids SettingWithCopyWarning).
            age_values = remove_greaterthan_lessthan_str(
                clinicalDF.loc[~clinicalDF[age].isin(['Unknown']), age])

            if not all(process_functions.checkInt(i) for i in age_values):
                total_error += (
                    "Sample Clinical File: Please double check your "
                    "AGE_AT_SEQ_REPORT. It must be an integer or 'Unknown'.\n")
            else:
                # Series.median replaces pd.np.median; the pandas.np alias
                # is no longer available in current pandas.
                median_age = age_values.astype(int).median()
                if median_age < 100:
                    total_error += (
                        "Sample Clinical File: Please double check your "
                        "AGE_AT_SEQ_REPORT. You may be reporting this value "
                        "in YEARS, please report in DAYS.\n")
        else:
            total_error += \
                "Sample Clinical File: Must have AGE_AT_SEQ_REPORT column.\n"

        # CHECK: ONCOTREE_CODE
        haveColumn = \
            process_functions.checkColExist(clinicalDF, "ONCOTREE_CODE")
        maleOncoCodes = ["TESTIS", "PROSTATE", "PENIS"]
        womenOncoCodes = ["CERVIX", "VULVA", "UTERUS", "OVARY"]
        if haveColumn:
            # Make oncotree codes uppercase (SpCC/SPCC)
            clinicalDF['ONCOTREE_CODE'] = \
                clinicalDF['ONCOTREE_CODE'].astype(str).str.upper()

            oncotree_codes = clinicalDF['ONCOTREE_CODE'][
                clinicalDF['ONCOTREE_CODE'] != "UNKNOWN"]

            unmapped_oncotrees = oncotree_codes[
                ~oncotree_codes.isin(valid_oncotree_codes)]
            if not unmapped_oncotrees.empty:
                # sorted() keeps the message deterministic across runs
                total_error += (
                    "Sample Clinical File: Please double check that all your "
                    "ONCOTREE CODES exist in the mapping. You have {} samples "
                    "that don't map. These are the codes that "
                    "don't map: {}\n".format(len(unmapped_oncotrees), ",".join(
                        sorted(set(unmapped_oncotrees)))))

            if process_functions.checkColExist(clinicalDF, "SEX") and \
               havePatientColumn and \
               haveSampleColumn:

                # SEX of the first row seen for each patient, built once so
                # the loop below does not rescan the frame for every row.
                first_sex_by_patient = {}
                for patient, sex in zip(clinicalDF['PATIENT_ID'],
                                        clinicalDF['SEX']):
                    first_sex_by_patient.setdefault(patient, sex)

                wrongCodeSamples = []
                # This is to check if oncotree codes match the sex,
                # collects samples that have conflicting codes and sex
                for code, patient, sample in zip(clinicalDF['ONCOTREE_CODE'],
                                                 clinicalDF['PATIENT_ID'],
                                                 clinicalDF['SAMPLE_ID']):
                    code_info = oncotree_mapping_dict.get(code)
                    if code_info is None:
                        continue

                    primaryCode = code_info['ONCOTREE_PRIMARY_NODE']
                    sex = first_sex_by_patient[patient]
                    try:
                        sex = float('nan') if sex == '' else float(sex)
                    except (TypeError, ValueError):
                        # A non-numeric SEX value is treated as missing here
                        # instead of aborting the whole validation.
                        sex = float('nan')

                    if primaryCode in maleOncoCodes and sex != 1.0:
                        wrongCodeSamples.append(sample)
                    if primaryCode in womenOncoCodes and sex != 2.0:
                        wrongCodeSamples.append(sample)
                if wrongCodeSamples:
                    warning += (
                        "Sample Clinical File: Some SAMPLE_IDs have "
                        "conflicting SEX and ONCOTREE_CODES: {}\n".format(
                            ",".join(wrongCodeSamples)))
        else:
            total_error += \
                "Sample Clinical File: Must have ONCOTREE_CODE column.\n"

        # CHECK: SAMPLE_TYPE
        warn, error = process_functions.check_col_and_values(
            clinicalDF,
            "SAMPLE_TYPE",
            sampleType_mapping['CODE'].tolist(),
            "Sample Clinical File",
            required=True)
        # Keep the warning, as every other check_col_and_values call does
        warning += warn
        total_error += error

        # CHECK: SEQ_ASSAY_ID
        haveColumn = \
            process_functions.checkColExist(clinicalDF, "SEQ_ASSAY_ID")
        if haveColumn:
            if any(i == "" for i in clinicalDF['SEQ_ASSAY_ID']):
                total_error += (
                    "Sample Clinical File: Please double check your "
                    "SEQ_ASSAY_ID columns, there are empty rows.\n")
            # must remove empty seq assay ids first
            # Checking if seq assay ids start with the center name
            seqAssayIds = \
                clinicalDF.SEQ_ASSAY_ID[clinicalDF.SEQ_ASSAY_ID != ""]
            # The comparison is done on the upper-cased id
            not_caps = [
                seqassay for seqassay in seqAssayIds.unique()
                if not seqassay.upper().startswith(self.center)
            ]
            if not_caps:
                total_error += ("Sample Clinical File: Please make sure your "
                                "SEQ_ASSAY_IDs start with your center "
                                "abbreviation: {}.\n".format(
                                    ", ".join(not_caps)))
        else:
            total_error += \
                "Sample Clinical File: Must have SEQ_ASSAY_ID column.\n"

        # CHECK: SEQ_DATE
        haveColumn = process_functions.checkColExist(clinicalDF, "SEQ_DATE")
        seq_date_error = (
            "Sample Clinical File: SEQ_DATE must be one of five values- "
            "For Jan-March: use Jan-YEAR. "
            "For Apr-June: use Apr-YEAR. "
            "For July-Sep: use Jul-YEAR. "
            "For Oct-Dec: use Oct-YEAR. (ie. Apr-2017) "
            "For values that don't have SEQ_DATES that "
            "you want released use 'release'.\n")

        if haveColumn:
            clinicalDF['SEQ_DATE'] = [
                i.title() for i in clinicalDF['SEQ_DATE'].astype(str)
            ]

            seqDate = clinicalDF['SEQ_DATE'][
                clinicalDF['SEQ_DATE'] != 'Release']
            if (clinicalDF['SEQ_DATE'] == '').any():
                total_error += (
                    "Sample Clinical File: Samples without SEQ_DATEs will "
                    "NOT be released.\n")
            try:
                if not seqDate.empty:
                    # Called only for the ValueError it raises on a value
                    # that is not in Mon-YYYY format.
                    seqDate.apply(
                        lambda date: datetime.datetime.strptime(date, '%b-%Y'))
                    if not all(
                            i.startswith(("Jul", "Jan", "Oct", "Apr"))
                            for i in seqDate):
                        total_error += seq_date_error
            except ValueError:
                total_error += seq_date_error
        else:
            total_error += "Sample Clinical File: Must have SEQ_DATE column.\n"

        # CHECK: BIRTH_YEAR
        birth_year = "BIRTH_YEAR"
        haveColumn = process_functions.checkColExist(clinicalDF, birth_year)
        if haveColumn:
            # Deal with HIPAA converted rows from DFCI: values such as
            # <YYYY can't be passed to int() directly.
            birth_years = remove_greaterthan_lessthan_str(
                clinicalDF.loc[~clinicalDF[birth_year].isin(['Unknown']),
                               birth_year])

            # Explicit flag instead of assert, which is stripped under -O
            try:
                in_future = birth_years.apply(
                    lambda x: _parse_year(x).year > current_year)
                birth_year_invalid = bool(in_future.any())
            except bad_year_errors:
                birth_year_invalid = True

            if birth_year_invalid:
                total_error += (
                    "Patient Clinical File: Please double check your "
                    "BIRTH_YEAR column, it must be an integer in YYYY format "
                    "<= {year} or 'Unknown'.\n".format(year=current_year))
        else:
            total_error += \
                "Patient Clinical File: Must have BIRTH_YEAR column.\n"

        # CHECK: VITAL_STATUS
        # YEAR DEATH
        haveColumn = process_functions.checkColExist(clinicalDF, "YEAR_DEATH")
        if haveColumn:
            notNullYears = clinicalDF.YEAR_DEATH[~clinicalDF.YEAR_DEATH.isin(
                ['Unknown', 'Not Collected', 'Not Applicable'])]
            try:
                notNullYears.apply(_parse_year)
            except bad_year_errors:
                total_error += (
                    "Patient Clinical File: Please double check your "
                    "YEAR_DEATH column, it must be an integer in YYYY format, "
                    "'Unknown', 'Not Applicable' or 'Not Collected'.\n")
        else:
            total_error += \
                "Patient Clinical File: Must have YEAR_DEATH column.\n"

        # YEAR CONTACT
        haveColumn = process_functions.checkColExist(clinicalDF,
                                                     "YEAR_CONTACT")
        if haveColumn:
            notNullYears = clinicalDF.YEAR_CONTACT[
                ~clinicalDF.YEAR_CONTACT.isin(['Unknown', 'Not Collected'])]
            try:
                notNullYears.apply(_parse_year)
            except bad_year_errors:
                total_error += (
                    "Patient Clinical File: Please double check your "
                    "YEAR_CONTACT column, it must be an integer in YYYY "
                    "format, 'Unknown' or 'Not Collected'.\n")
        else:
            total_error += \
                "Patient Clinical File: Must have YEAR_CONTACT column.\n"

        # INT CONTACT
        haveColumn = process_functions.checkColExist(clinicalDF, "INT_CONTACT")
        if haveColumn:
            allowed_int_contact = [
                '>32485', '<6570', 'Unknown', 'Not Collected'
            ]
            if not all(
                    process_functions.checkInt(i)
                    for i in clinicalDF.INT_CONTACT
                    if i not in allowed_int_contact):
                total_error += (
                    "Patient Clinical File: Please double check your "
                    "INT_CONTACT column, it must be an integer, '>32485', "
                    "'<6570', 'Unknown' or 'Not Collected'.\n")
        else:
            total_error += \
                "Patient Clinical File: Must have INT_CONTACT column.\n"

        # INT DOD
        haveColumn = process_functions.checkColExist(clinicalDF, "INT_DOD")
        if haveColumn:
            allowed_int_dod = [
                '>32485', '<6570', 'Unknown', 'Not Collected',
                'Not Applicable'
            ]
            if not all(
                    process_functions.checkInt(i)
                    for i in clinicalDF.INT_DOD
                    if i not in allowed_int_dod):
                total_error += (
                    "Patient Clinical File: Please double check your INT_DOD "
                    "column, it must be an integer, '>32485', '<6570', "
                    "'Unknown', 'Not Collected' or 'Not Applicable'.\n")
        else:
            total_error += \
                "Patient Clinical File: Must have INT_DOD column.\n"

        # DEAD
        haveColumn = process_functions.checkColExist(clinicalDF, "DEAD")
        if haveColumn:
            # Need to have check_bool function
            if not all(
                    str(i).upper() in ['TRUE', 'FALSE']
                    for i in clinicalDF.DEAD
                    if i not in ['Unknown', 'Not Collected']):
                total_error += (
                    "Patient Clinical File: Please double check your "
                    "DEAD column, it must be True, False, 'Unknown' or "
                    "'Not Collected'.\n")
        else:
            total_error += \
                "Patient Clinical File: Must have DEAD column.\n"

        # CHECK: PRIMARY_RACE, SECONDARY_RACE, TERTIARY_RACE
        for race_col in ("PRIMARY_RACE", "SECONDARY_RACE", "TERTIARY_RACE"):
            warn, error = process_functions.check_col_and_values(
                clinicalDF, race_col, race_mapping['CODE'].tolist(),
                "Patient Clinical File")
            warning += warn
            total_error += error

        # CHECK: SEX
        warn, error = process_functions.check_col_and_values(
            clinicalDF,
            "SEX",
            sex_mapping['CODE'].tolist(),
            "Patient Clinical File",
            required=True)
        warning += warn
        total_error += error

        # CHECK: ETHNICITY
        warn, error = process_functions.check_col_and_values(
            clinicalDF, "ETHNICITY", ethnicity_mapping['CODE'].tolist(),
            "Patient Clinical File")
        warning += warn
        total_error += error

        return (total_error, warning)