def _validate(self, mutationInCisDf, project_id):
        """Validate a mutations-in-cis filter file for this center.

        Every variant in the submitted file must already exist in the
        center's rows of the mutationsInCis database table.

        Args:
            mutationInCisDf: mutations-in-cis filter dataframe
            project_id: Synapse project id used to resolve database mappings

        Returns:
            tuple: (error string, warning string)
        """
        db_mapping_df = process_functions.get_synid_database_mappingdf(
            self.syn, project_id)
        mutation_in_cis_synid = process_functions.getDatabaseSynId(
            self.syn,
            "mutationsInCis",
            databaseToSynIdMappingDf=db_mapping_df,
        )
        # Pull down the correct database
        existing_query = self.syn.tableQuery(
            "select * from {} where Center = '{}'".format(
                mutation_in_cis_synid, self.center))
        existing_df = existing_query.asDataFrame()

        total_error = ""
        warning = ""
        required_headers = pd.Series([
            "Flag",
            "Center",
            "Tumor_Sample_Barcode",
            "Hugo_Symbol",
            "HGVSp_Short",
            "Variant_Classification",
            "Chromosome",
            "Start_Position",
            "Reference_Allele",
            "Tumor_Seq_Allele2",
            "t_alt_count_num",
            "t_depth",
        ])
        primary_keys = [
            "Tumor_Sample_Barcode",
            "HGVSp_Short",
            "Start_Position",
            "Reference_Allele",
            "Tumor_Seq_Allele2",
        ]
        missing_mask = ~required_headers.isin(mutationInCisDf.columns)
        if missing_mask.any():
            total_error += ("Mutations In Cis Filter File: "
                            "Must at least have these headers: %s.\n" %
                            ",".join(required_headers[missing_mask]))
        else:
            submitted = mutationInCisDf[primary_keys].fillna("")
            existing = existing_df[primary_keys].fillna("")

            # Collapse the primary-key columns into one space-joined string
            # per row so rows can be compared as single values.
            existing["primaryAll"] = [
                " ".join(row.astype(str))
                for _, row in existing.iterrows()
            ]
            submitted["primaryAll"] = [
                " ".join(row.astype(str))
                for _, row in submitted.iterrows()
            ]
            if not submitted.primaryAll.isin(existing.primaryAll).all():
                total_error += ("Mutations In Cis Filter File: "
                                "All variants must come from the original "
                                "mutationInCis_filtered_samples.csv file in "
                                "each institution's staging folder.\n")
        return total_error, warning
def test_get_synid_database_mappingdf(test, staging, synid):
    """Test retrieval of the database mapping configuration.

    Covers: no flags, staging flag, test flag (parametrized fixtures).
    """
    arg = argparser()
    expected_df = arg.asDataFrame()
    with patch.object(syn, "get", return_value=ENTITY), \
         patch.object(process_functions, "get_syntabledf",
                      return_value=expected_df) as patch_gettabledf:
        df = process_functions.get_synid_database_mappingdf(
            syn, project_id=None)
        # The mapping table must be queried exactly once, by its synId.
        patch_gettabledf.assert_called_once_with(
            syn, "SELECT * FROM {}".format(ENTITY.dbMapping[0]))
        assert df.equals(expected_df)
# Example #3
    def _validate(self, assay_info_df, project_id):
        """
        Validates the values of assay information file

        Args:
            assay_info_df: assay information dataframe
            project_id: Synapse project id used to resolve database mappings

        Returns:
            tuple: error and warning
        """

        total_error = ""
        warning = ""

        if process_functions.checkColExist(assay_info_df, "SEQ_ASSAY_ID"):
            # Normalize SEQ_ASSAY_IDs: underscores -> dashes, upper case.
            all_seq_assays = (assay_info_df.SEQ_ASSAY_ID.replace(
                {
                    "_": "-"
                }, regex=True).str.upper().unique())
            if not all(
                [assay.startswith(self.center) for assay in all_seq_assays]):
                total_error += (
                    "Assay_information.yaml: Please make sure all your "
                    "SEQ_ASSAY_IDs start with your center abbreviation.\n")
            db_to_syn_map_df = process_functions.get_synid_database_mappingdf(
                self.syn, project_id)
            sample_synid = process_functions.getDatabaseSynId(
                self.syn, "sample", databaseToSynIdMappingDf=db_to_syn_map_df)
            uniq_seq_df = process_functions.get_syntabledf(
                self.syn,
                f"select distinct(SEQ_ASSAY_ID) as seq from {sample_synid} "
                f"where CENTER = '{self.center}'",
            )
            # These are all the SEQ_ASSAY_IDs that are in the clinical database
            # but not in the assay_information file
            missing_seqs = uniq_seq_df["seq"][
                ~uniq_seq_df["seq"].replace({
                    "_": "-"
                }, regex=True).str.upper().isin(all_seq_assays)]
            missing_seqs_str = ", ".join(missing_seqs)
            if missing_seqs.to_list():
                total_error += (
                    "Assay_information.yaml: You are missing SEQ_ASSAY_IDs: "
                    f"{missing_seqs_str}\n")

        else:
            total_error += "Assay_information.yaml: Must have SEQ_ASSAY_ID column.\n"

        # Enumerated column values are validated against the GDC read_group
        # data dictionary.
        read_group_dict = process_functions.get_gdc_data_dictionary(
            "read_group")
        read_group_headers = read_group_dict["properties"]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "is_paired_end",
            [True, False],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "library_selection",
            read_group_headers["library_selection"]["enum"],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "library_strategy",
            read_group_headers["library_strategy"]["enum"],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "platform",
            read_group_headers["platform"]["enum"],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        # BUG FIX: build a new list instead of calling .extend() on the enum
        # list inside read_group_headers.  Extending in place mutates the
        # mapping returned by get_gdc_data_dictionary; if that mapping is
        # cached/shared, each validation run would append duplicate entries.
        instrument_model = read_group_headers["instrument_model"]["enum"] + [
            "Illumina NovaSeq 6000",
            None,
        ]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "instrument_model",
            instrument_model,
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        # target_capture_kit is free text, so only the column's presence is
        # checked (not its values).
        if not process_functions.checkColExist(assay_info_df,
                                               "target_capture_kit"):
            total_error += ("Assay_information.yaml: "
                            "Must have target_capture_kit column.\n")

        # MAF-style variant classifications reported by the assay;
        # multiple values are separated by ";".
        variant_classes = [
            "Splice_Site",
            "Nonsense_Mutation",
            "Frame_Shift_Del",
            "Frame_Shift_Ins",
            "Nonstop_Mutation",
            "Translation_Start_Site",
            "In_Frame_Ins",
            "In_Frame_Del",
            "Missense_Mutation",
            "Intron",
            "Splice_Region",
            "Silent",
            "RNA",
            "5'UTR",
            "3'UTR",
            "IGR",
            "5'Flank",
            "3'Flank",
            None,
        ]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "variant_classifications",
            variant_classes,
            filename="Assay_information.yaml",
            na_allowed=True,
            sep=";",
        )
        warning += warn
        total_error += error

        # read_length must be an integer or null.
        if process_functions.checkColExist(assay_info_df, "read_length"):
            if not all([
                    process_functions.checkInt(i)
                    for i in assay_info_df["read_length"]
                    if i is not None and not pd.isnull(i)
            ]):
                total_error += ("Assay_information.yaml: "
                                "Please double check your read_length.  "
                                "It must be an integer or null.\n")
        else:
            total_error += "Assay_information.yaml: " "Must have read_length column.\n"

        # number_of_genes must be an integer (nulls not allowed).
        if process_functions.checkColExist(assay_info_df, "number_of_genes"):
            if not all([
                    process_functions.checkInt(i)
                    for i in assay_info_df["number_of_genes"]
            ]):
                total_error += ("Assay_information.yaml: "
                                "Please double check your number_of_genes. "
                                "It must be an integer.\n")
        else:
            total_error += ("Assay_information.yaml: "
                            "Must have number_of_genes column.\n")

        # gene_padding is optional; blanks default to 10 downstream.
        if process_functions.checkColExist(assay_info_df, "gene_padding"):
            if not all([
                    process_functions.checkInt(i)
                    for i in assay_info_df["gene_padding"]
                    if i is not None and not pd.isnull(i)
            ]):
                total_error += ("Assay_information.yaml: "
                                "Please double check your gene_padding. "
                                "It must be an integer or blank.\n")
        else:
            warning += ("Assay_information.yaml: "
                        "gene_padding is by default 10 if not specified.\n")

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "calling_strategy",
            ["tumor_only", "tumor_normal", "plasma_normal"],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        # NOTE(review): this assumes every value is a string of the form
        # ">(num)%"; a NaN/float here would raise AttributeError — confirm
        # values are coerced to str upstream.
        if process_functions.checkColExist(assay_info_df,
                                           "specimen_tumor_cellularity"):
            if not all([
                    i.startswith(">") and i.endswith("%")
                    for i in assay_info_df["specimen_tumor_cellularity"]
            ]):
                total_error += (
                    "Assay_information.yaml: "
                    "Please double check your specimen_tumor_cellularity. "
                    "It must in this format >(num)%. ie. >10%\n")
        else:
            total_error += ("Assay_information.yaml: "
                            "Must have specimen_tumor_cellularity column.\n")

        alteration_types = [
            "snv",
            "small_indels",
            "gene_level_cna",
            "intragenic_cna",
            "structural_variants",
        ]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "alteration_types",
            alteration_types,
            filename="Assay_information.yaml",
            required=True,
            sep=";",
        )
        warning += warn
        total_error += error

        preservation_technique = ["FFPE", "fresh_frozen", "NA"]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "preservation_technique",
            preservation_technique,
            filename="Assay_information.yaml",
            required=True,
            sep=";",
        )
        warning += warn
        total_error += error

        coverage = ["hotspot_regions", "coding_exons", "introns", "promoters"]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "coverage",
            coverage,
            filename="Assay_information.yaml",
            required=True,
            sep=";",
        )
        warning += warn
        total_error += error

        return total_error, warning
# Example #4
    def _validate(self, cnvDF, nosymbol_check, project_id):
        """Validate a CNA (copy number alteration) file.

        The first column must be Hugo_Symbol; all data cells must hold one
        of the allowed discrete CNA values.  Unless ``nosymbol_check`` is
        set, gene symbols are also remapped/validated against the center's
        BED database rows.

        Args:
            cnvDF: CNA dataframe
            nosymbol_check: skip Hugo symbol remapping checks when True
            project_id: Synapse project id used to resolve database mappings

        Returns:
            tuple: (error string, warning string)
        """
        total_error = ""
        warning = ""
        cnvDF.columns = [col.upper() for col in cnvDF.columns]

        if cnvDF.columns[0] != "HUGO_SYMBOL":
            total_error += "Your cnv file's first column must be Hugo_Symbol\n"
        haveColumn = process_functions.checkColExist(cnvDF, "HUGO_SYMBOL")
        # BUG FIX: initialize keepSymbols so the restore below can never hit
        # a NameError when the HUGO_SYMBOL column is absent.
        keepSymbols = None
        if haveColumn:
            # Set the symbol column aside so only data cells get value-checked.
            keepSymbols = cnvDF["HUGO_SYMBOL"]
            cnvDF.drop("HUGO_SYMBOL", axis=1, inplace=True)

        if process_functions.checkColExist(cnvDF, "ENTREZ_GENE_ID"):
            del cnvDF["ENTREZ_GENE_ID"]

        # Allowed discrete CNA values, compared as strings.
        # BUG FIX: "-0.5" added — the error message below lists -0.5 as
        # allowed, but it was missing from this list.
        allowed_values = [
            "-2.0",
            "-2",
            "-1.5",
            "-1.0",
            "-1",
            "-0.5",
            "0.0",
            "0",
            "0.5",
            "1.0",
            "1",
            "1.5",
            "2",
            "2.0",
            "nan",
        ]
        if not all(cnvDF.applymap(lambda x: str(x) in allowed_values).all()):
            total_error += ("All values must be NA/blank, -2, -1.5, -1, -0.5, "
                            "0, 0.5, 1, 1.5, or 2.\n")
        elif haveColumn:
            # BUG FIX: restore the symbol column only when it existed; the
            # original assigned keepSymbols unconditionally and crashed with
            # a NameError whenever HUGO_SYMBOL was missing.
            cnvDF["HUGO_SYMBOL"] = keepSymbols
            if not nosymbol_check:
                databaseToSynIdMappingDf = (
                    process_functions.get_synid_database_mappingdf(
                        self.syn, project_id))
                bedSynId = process_functions.getDatabaseSynId(
                    self.syn,
                    "bed",
                    databaseToSynIdMappingDf=databaseToSynIdMappingDf)
                bed = self.syn.tableQuery(
                    "select Hugo_Symbol, ID from {} where "
                    "CENTER = '{}'".format(bedSynId, self.center))
                bedDf = bed.asDataFrame()
                cnvDF["remapped"] = cnvDF["HUGO_SYMBOL"].apply(
                    lambda x: validateSymbol(x, bedDf))
                cnvDF = cnvDF[~cnvDF["remapped"].isnull()]

                # Do not allow any duplicated genes after symbols
                # have been remapped
                if sum(cnvDF["remapped"].duplicated()) > 0:
                    duplicated = cnvDF["remapped"].duplicated(keep=False)
                    total_error += (
                        "Your CNA file has duplicated Hugo_Symbols "
                        "(After remapping of genes): {} -> {}.\n".format(
                            ",".join(cnvDF["HUGO_SYMBOL"][duplicated]),
                            ",".join(cnvDF["remapped"][duplicated]),
                        ))
        return (total_error, warning)
# Example #5
    def _validate(self, fusionDF, nosymbol_check, project_id):
        """Validate a fusion file.

        Checks the required headers and, unless ``nosymbol_check`` is set,
        that no Hugo symbol is NA/blank.

        Args:
            fusionDF: fusion dataframe
            nosymbol_check: skip Hugo symbol checks when True
            project_id: Synapse project id used to resolve database mappings

        Returns:
            tuple: (error string, warning string)
        """
        total_error = ""
        warning = ""

        # Frame: "in-frame" or "frameshift".
        # Fusion_Status (OPTIONAL): an assessment of the mutation type
        # (i.e., "SOMATIC", "GERMLINE", "UNKNOWN", or empty).

        # Normalize headers to upper case before checking them.
        fusionDF.columns = [header.upper() for header in fusionDF.columns]

        required_headers = pd.Series([
            "HUGO_SYMBOL",
            "ENTREZ_GENE_ID",
            "CENTER",
            "TUMOR_SAMPLE_BARCODE",
            "FUSION",
            "DNA_SUPPORT",
            "RNA_SUPPORT",
            "METHOD",
            "FRAME",
        ])
        # An optional COMMENTS column is created (filled with NaN) when absent.
        if "COMMENTS" not in fusionDF.columns:
            fusionDF["COMMENTS"] = float("nan")
        missing_mask = ~required_headers.isin(fusionDF.columns)
        if missing_mask.any():
            total_error += (
                "Your fusion file must at least have these headers: %s.\n" %
                ",".join(required_headers[missing_mask])
            )

        has_symbol_col = process_functions.checkColExist(
            fusionDF, "HUGO_SYMBOL")
        if has_symbol_col and not nosymbol_check:
            db_mapping_df = process_functions.get_synid_database_mappingdf(
                self.syn, project_id)
            bed_synid = process_functions.getDatabaseSynId(
                self.syn,
                "bed",
                databaseToSynIdMappingDf=db_mapping_df)
            # Pull the center's BED rows; fetched for symbol remapping
            # (currently unused in this block).
            bed = self.syn.tableQuery(
                "select Hugo_Symbol, ID from %s where CENTER = '%s'" %
                (bed_synid, self.center))
            bedDf = bed.asDataFrame()
            if fusionDF["HUGO_SYMBOL"].isnull().any():
                total_error += (
                    "Your fusion file should not have any NA/blank Hugo Symbols.\n"
                )

        return (total_error, warning)