Example #1
    def __init__(self,
                 args: Union["Namespace", None] = None,
                 conf_file: str = "config.ini"):
        self.path = conf_file
        super().__init__()
        self.read(self.path)

        self.logfile_path = self["general"]["logfile"]
        self.dump_intermediate = self["general"].getboolean(
            "dump_intermediate")
        self.input = self.parse_input(args)

        self.output = self.parse_output(args)

        if args.single:
            self.dump_intermediate = False

        if args.lab:
            LAB_INST.configure(**self.lab_options(args.lab))
        elif not args.single:
            # no explicit lab given and not in single-case mode:
            # fall back to the default PEDIA lab options
            LAB_INST.configure(**self.pedia_lab_options)

        # configure api components
        ERRORFIXER_INST.configure(**self.errorfixer_options)
        JANNOVAR_INST.configure(**self.jannovar_options)
        OMIM_INST.configure(**self.omim_options)
        PHENOMIZER_INST.configure(**self.phenomizer_options)
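The constructor above reads config.ini into a configparser-style class, pulls the general settings, and then configures the shared API singletons. A minimal usage sketch, assuming the class is named Config and subclasses configparser.ConfigParser (both names are assumptions; the listing only shows the method body):

    import argparse

    # hypothetical CLI mirroring the attributes the constructor accesses
    parser = argparse.ArgumentParser()
    parser.add_argument("--single", default="")
    parser.add_argument("--lab", default="")
    args = parser.parse_args([])

    # note: despite the Optional annotation, args must not be None here,
    # since the constructor dereferences args.single and args.lab
    config = Config(args=args, conf_file="config.ini")
    print(config.logfile_path)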
Example #2

    def __init__(self,
                 args: Union["Namespace", None] = None,
                 conf_file: str = "config.ini"):
        self.path = args.config
        super().__init__()
        self.read(self.path)

        self.dump_intermediate = self["general"].getboolean(
            "dump_intermediate")
        self.data_path = self["general"]["data_path"] or "data"

        # non-empty values in the [classifier] section override the
        # corresponding command-line arguments
        self.train_pickle = args.train_pickle_path
        if self["classifier"]["train_pickle_path"]:
            self.train_pickle = self["classifier"]["train_pickle_path"]

        self.param_c = args.param_c
        if self["classifier"]["param_c"]:
            self.param_c = self["classifier"]["param_c"]

        self.input = self.parse_input(args)

        self.output = self.parse_output(args)
        if args.output and args.single:
            # single-case mode: log to <output>/logs/<case_id>/preprocess.log
            filename = os.path.basename(args.single)
            case_id = filename.split('.json')[0]
            log_dir = os.path.join(args.output, 'logs/{}/'.format(case_id))
            # exist_ok=True makes a separate existence check unnecessary
            os.makedirs(log_dir, exist_ok=True)

            self.logfile_path = os.path.join(log_dir, 'preprocess.log')
        else:
            self.logfile_path = self["general"]["logfile"]

        if args.single:
            self.dump_intermediate = False

        if args.lab:
            LAB_INST.configure(**self.lab_options(args.lab))
        elif not args.single:
            LAB_INST.configure(**self.pedia_lab_options)

        # enable phenomizer queries only if credentials are configured
        self.use_phenomizer = bool(self["phenomizer"]["user"]
                                   and self["phenomizer"]["password"])

        # configure api components
        ERRORFIXER_INST.configure(**self.errorfixer_options)
        JANNOVAR_INST.configure(**self.jannovar_options)
        OMIM_INST.configure(**self.omim_options)
        PHENOMIZER_INST.configure(**self.phenomizer_options)
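Example #2 applies the same precedence rule twice: a non-empty value in the [classifier] section of the INI file overrides the corresponding command-line argument. A minimal sketch of that rule in isolation (resolve is a hypothetical helper, not part of the listing):

    import configparser

    def resolve(cli_value, config: configparser.ConfigParser,
                section: str, option: str):
        '''Return the config value when set and non-empty, else the CLI value.'''
        return config.get(section, option, fallback="") or cli_value

    # e.g. train_pickle = resolve(args.train_pickle_path,
    #                             self, "classifier", "train_pickle_path")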
Example #3
    def check(self) -> Tuple[bool, list]:
        '''Check whether the Case fulfills all provided criteria.

        The criteria are:
            picture has been provided - gestalt_score in detected_syndromes
                should be greater than 0
            clinical diagnosis - selected_syndromes should not be empty
            single monogenetic disease - neither multiple syndromes selected
                nor multiple pathogenic mutations in different genes
            SNP mutations - no microdel/dup or other large-scale aberrations

        Note that this check is more definitive than the json-level check,
        as the validity of the hgvs parsing has already been established.

        Returns a (valid, issues) tuple rather than a plain bool.
        '''
        valid = True
        issues = []

        # check if there is at least one feature (HPO)
        features = self.features
        if len(features) < 1:
            issues.append({
                "type": "NO_FEATURES",
            })
            valid = False

        scores = ["gestalt_score"]
        max_scores = {s: max(self.phenomized[s]) for s in scores}
        zero_scores = [s for s, n in max_scores.items() if n <= 0]

        # check maximum gestalt score
        if zero_scores:
            issues.append({"type": "MISSING_SCORES", "data": max_scores})
            valid = False

        # check that only one syndrome has been selected
        diagnosis = self.phenomized.loc[self.phenomized['confirmed']]
        if len(diagnosis) < 1:
            issues.append({
                "type": "NO_DIAGNOSIS",
            })
            valid = False

        diagnosis = pandas.concat(
            [diagnosis, self.phenomized.loc[self.phenomized["differential"]]])

        # check whether multiple diagnoses are in same phenotypic series
        ps_dict = {}
        for _, diag in diagnosis.iterrows():
            ps_res = OMIM_INST.omim_id_to_phenotypic_series(
                str(diag["omim_id"])) or str(diag["omim_id"])
            # ignore entries without omim id
            if ps_res == '0':
                continue
            if diag["syndrome_name"] in ps_dict:
                ps_dict[diag["syndrome_name"]].add(ps_res)
            else:
                ps_dict[diag["syndrome_name"]] = set([ps_res])
        # compact ps_dict: drop entries whose series set is contained in
        # another entry's set, so aliases of one phenotypic series do not
        # count as separate diagnoses
        reduced_ps_dict = {}

        for key, series in ps_dict.items():
            contained = False
            for other_key, other_series in ps_dict.items():
                if other_key == key:
                    continue
                if series <= other_series:
                    contained = True
            if not contained:
                reduced_ps_dict[key] = series

        if len(reduced_ps_dict) > 1:
            issues.append({
                "type": "MULTI_DIAGNOSIS",
                "data": {
                    "orig": list(diagnosis["omim_id"]),
                    "names": list(reduced_ps_dict.keys()),
                    "converted_ids":
                        [e for v in reduced_ps_dict.values() for e in v]
                }
            })
            valid = False

        if not self.get_variants():
            raw_entries = self.data.get_genomic_entries()
            if not raw_entries:
                issues.append({
                    "type": "NO_GENOMIC",
                })
            else:
                issues.append({"type": "MALFORMED_HGVS", "data": raw_entries})
            valid = False
        else:
            # Check if there are multiple different disease-causing genes
            raw_entries = self.data.get_genomic_entries()
            entries = self.hgvs_models
            if len(entries) > 1:
                genes = [
                    entry.gene['gene_id'] for entry in entries
                    if entry.gene['gene_id']
                ]
                if len(set(genes)) > 1:
                    issues.append({
                        "type": "MULTI_DIFFERENT_DISEASE_CAUSING_GENE",
                        "data": raw_entries
                    })
                    valid = False

        return valid, issues
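The subset test in the compaction loop is what allows several selected syndromes to still count as a single diagnosis. A self-contained sketch of the same reduction with made-up ids (the data is illustrative only):

    ps_dict = {
        "Syndrome A": {"PS100000"},
        "Syndrome A, variant": {"PS100000"},   # same phenotypic series
        "Syndrome B": {"PS200000"},
    }

    reduced = {
        key: series for key, series in ps_dict.items()
        if not any(series <= other
                   for other_key, other in ps_dict.items()
                   if other_key != key)
    }
    # the two "Syndrome A" entries are subsets of each other, so both are
    # dropped; only "Syndrome B" remains and no MULTI_DIAGNOSIS is raised
    assert list(reduced) == ["Syndrome B"]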
Example #4
    def _create_gene_list(self) -> List[dict]:
        '''Get a list of genes for the detected syndromes by inferring
        gene-phenotype mappings from the phenomizer and OMIM.
        '''
        syndromes = self.phenomized.to_dict("records")

        phenotypic_series_mapping = {}
        for syndrome in syndromes:

            disease_id = syndrome["omim_id"]

            phenotypic_series = OMIM_INST.omim_id_to_phenotypic_series(
                str(disease_id)) or str(disease_id)

            syndrome_name = (syndrome["syndrome_name"]
                             or syndrome["disease-name_pheno"]
                             or syndrome["disease-name_boqa"])

            genes = list(OMIM_INST.mim_pheno_to_gene(disease_id).values())
            if "gene-id" in syndrome and syndrome["gene-id"]:
                genes += [{
                    "gene_id": eid,
                    "gene_symbol": OMIM_INST.entrez_id_to_symbol(eid),
                    "gene_omim_id": OMIM_INST.entrez_id_to_mim_gene(eid)
                } for eid in syndrome["gene-id"].split(", ")
                          if eid not in [g["gene_id"] for g in genes]]

            for gene in genes:
                if not gene["gene_id"]:
                    continue

                # uniqueness constraint on syndrome_name and gene_id
                key = "{}|{}".format(syndrome_name, gene["gene_id"])
                pheno_score = syndrome.get("pheno_score", 0.0)
                boqa_score = syndrome.get("boqa_score", 0.0)
                update_data = dict(
                    {
                        "disease_id": disease_id,
                        "phenotypic_series": phenotypic_series,
                        "syndrome_name": syndrome_name,
                        "gestalt_score": syndrome["gestalt_score"],
                        "feature_score": syndrome["feature_score"],
                        "combined_score": syndrome["combined_score"],
                        "pheno_score": pheno_score,
                        "boqa_score": boqa_score,
                        "has_mask": syndrome["has_mask"],
                    }, **gene)
                if key in phenotypic_series_mapping:
                    # merge duplicate mappings: keep the larger value for
                    # numeric fields and prefer non-empty new strings
                    phenotypic_series_mapping[key] = {
                        k: (max(v, update_data[k])
                            if not isinstance(v, str) else update_data[k] or v)
                        for k, v in phenotypic_series_mapping[key].items()
                    }
                else:
                    phenotypic_series_mapping[key] = update_data

        return list(phenotypic_series_mapping.values())
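The merge rule for duplicate syndrome/gene keys takes the element-wise maximum of numeric fields and prefers new non-empty strings. A small worked illustration with made-up values:

    old = {"syndrome_name": "X", "gene_symbol": "", "pheno_score": 0.4}
    new = {"syndrome_name": "X", "gene_symbol": "BRCA1", "pheno_score": 0.7}

    merged = {
        k: (max(v, new[k]) if not isinstance(v, str) else new[k] or v)
        for k, v in old.items()
    }
    # numeric fields keep the larger value, string fields take the newer
    # non-empty value
    assert merged == {"syndrome_name": "X", "gene_symbol": "BRCA1",
                      "pheno_score": 0.7}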
    def get_syndrome_suggestions_and_diagnosis(self) -> pandas.DataFrame:
        '''Return a pandas dataframe containing all suggested syndromes
        joined with the selected syndromes, with a confirmed column
        marking the confirmed diagnosis entries.
        '''
        # create a dataframe from the list of detected syndromes
        if self._js["case_data"]["suggested_syndromes"]:
            syndromes_df = pandas.DataFrame.from_dict(
                self.convert_lab_suggested_syndrome(
                    self._js["case_data"]['suggested_syndromes']))
        else:
            syndromes_df = pandas.DataFrame(columns=[
                "omim_id", "gestalt_score", "combined_score", "feature_score",
                "has_mask", "syndrome_name"
            ])

        # force omim_id to always be a list, required for exploding the df
        syndromes_df['omim_id'] = syndromes_df['omim_id'].apply(
            OMIM_INST.replace_deprecated_all)
        # turn omim_list into multiple rows with other properties duplicated
        syndromes_df = explode_df_column(syndromes_df, 'omim_id')
        syndromes_df['omim_id'] = syndromes_df['omim_id'].astype(int)

        # prepare the confirmed diagnosis for joining with the main syndrome
        # dataframe
        if self._js['case_data']['selected_syndromes']:
            selected_syndromes = [
                dict(s,
                     omim_id=OMIM_INST.replace_deprecated_all(s["omim_id"])
                     or ["0"]) for s in self.convert_lab_selected_syndrome(
                         self._js["case_data"]["selected_syndromes"])
            ]

            selected = pandas.DataFrame.from_dict(selected_syndromes)

            # create multiple rows from list of omim_id entries duplicating
            # other information
            selected = explode_df_column(selected, 'omim_id')
            selected['omim_id'] = selected['omim_id'].astype(int)
            # add a confirmed diagnosis column
            selected.loc[
                selected["diagnosis"].isin(constants.CONFIRMED_DIAGNOSIS),
                'confirmed'] = True
            selected.loc[
                selected["diagnosis"].isin(constants.DIFFERENTIAL_DIAGNOSIS),
                'differential'] = True

            # outer join of the syndrome and the confirmed diagnosis
            # pandas.merge has to be used instead of join, because the latter
            # only joins on indices
            syndromes_df = syndromes_df.merge(selected,
                                              on=['omim_id', 'syndrome_name'],
                                              how='outer')
            # entries not present in the selected syndromes are neither
            # confirmed nor differential diagnoses
            syndromes_df = syndromes_df.fillna({
                'confirmed': False,
                'differential': False
            })
            # merge has_mask
            syndromes_df["has_mask"] = \
                syndromes_df["has_mask_x"].astype(bool) \
                | syndromes_df["has_mask_y"].astype(bool)
            syndromes_df.drop(["has_mask_x", "has_mask_y"],
                              inplace=True,
                              axis=1)
            # reset index for continuous indexing after the join and explode
            # operations
            syndromes_df = syndromes_df.reset_index(drop=True)
        else:
            # if no syndromes selected, everything is false
            syndromes_df["confirmed"] = False
            syndromes_df["differential"] = False

        syndromes_df['omim_id'] = syndromes_df['omim_id'].astype(int)
        return syndromes_df
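The method above calls explode_df_column twice to turn a column of lists into one row per element while duplicating the remaining columns. The helper's definition is not part of this listing; a minimal sketch of the assumed behavior, built on pandas.DataFrame.explode (available since pandas 0.25):

    import pandas

    def explode_df_column(df: pandas.DataFrame,
                          column: str) -> pandas.DataFrame:
        '''Duplicate each row once per element of df[column] (assumed).'''
        return df.explode(column).reset_index(drop=True)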