def __init__(self, args: Union["Namespace", None] = None, conf_file: str = "config.ini"): self.path = conf_file super().__init__() self.read(self.path) self.logfile_path = self["general"]["logfile"] self.dump_intermediate = self["general"].getboolean( "dump_intermediate") self.input = self.parse_input(args) self.output = self.parse_output(args) if args.single: self.dump_intermediate = False if args.lab: LAB_INST.configure(**self.lab_options(args.lab)) elif not (args.lab or args.single): LAB_INST.configure(**self.pedia_lab_options) # configure api components ERRORFIXER_INST.configure(**self.errorfixer_options) JANNOVAR_INST.configure(**self.jannovar_options) OMIM_INST.configure(**self.omim_options) PHENOMIZER_INST.configure(**self.phenomizer_options)
def __init__(self, args: Union["Namespace", None] = None, conf_file: str = "config.ini"): self.path = args.config super().__init__() self.read(self.path) self.dump_intermediate = self["general"].getboolean( "dump_intermediate") self.data_path = self["general"]["data_path"] if self["general"][ "data_path"] else "data" self.train_pickle = args.train_pickle_path if self["classifier"]["train_pickle_path"]: self.train_pickle = self["classifier"]["train_pickle_path"] self.param_c = args.param_c if self["classifier"]["param_c"]: self.param_c = self["classifier"]["param_c"] self.input = self.parse_input(args) self.output = self.parse_output(args) if args.output and args.single: filename = os.path.basename(args.single) case_id = filename.split('.json')[0] log_dir = os.path.join(args.output, 'logs/{}/'.format(case_id)) if not os.path.exists(log_dir): os.makedirs(log_dir, exist_ok=True) self.logfile_path = os.path.join(log_dir, 'preprocess.log') else: self.logfile_path = self["general"]["logfile"] if args.single: self.dump_intermediate = False if args.lab: LAB_INST.configure(**self.lab_options(args.lab)) elif not (args.lab or args.single): LAB_INST.configure(**self.pedia_lab_options) # check phenomizer if self["phenomizer"]["user"] and self["phenomizer"]["password"]: self.use_phenomizer = True else: self.use_phenomizer = False # configure api components ERRORFIXER_INST.configure(**self.errorfixer_options) JANNOVAR_INST.configure(**self.jannovar_options) OMIM_INST.configure(**self.omim_options) PHENOMIZER_INST.configure(**self.phenomizer_options)
def check(self) -> Tuple[bool, list]:
    '''Check whether the case fulfills all provided criteria.

    The criteria are:
        - picture has been provided: gestalt_score in detected_syndromes
          should be greater than 0
        - clinical diagnosis: selected_syndromes should not be empty
        - single monogenetic disease: not multiple syndromes selected and
          not multiple pathogenic mutations in different genes
        - SNP mutations: no microdel/dup or other large scale aberrations

    Note that this function is more definitive than the json level check,
    as the validity of hgvs parsing has already been established.
    '''
    valid = True
    issues = []

    # check that there is at least one feature (HPO)
    features = self.features
    if len(features) < 1:
        issues.append({
            "type": "NO_FEATURES",
        })
        valid = False

    scores = ["gestalt_score"]
    max_scores = {s: max(self.phenomized[s]) for s in scores}
    zero_scores = [s for s, n in max_scores.items() if n <= 0]

    # check maximum gestalt score
    if zero_scores:
        issues.append({"type": "MISSING_SCORES", "data": max_scores})
        valid = False

    # check that at least one syndrome has been selected
    diagnosis = self.phenomized.loc[self.phenomized['confirmed']]
    if len(diagnosis) < 1:
        issues.append({
            "type": "NO_DIAGNOSIS",
        })
        valid = False

    diagnosis = pandas.concat(
        [diagnosis, self.phenomized.loc[self.phenomized["differential"]]])

    # check whether multiple diagnoses are in the same phenotypic series
    ps_dict = {}
    for _, diag in diagnosis.iterrows():
        ps_res = OMIM_INST.omim_id_to_phenotypic_series(
            str(diag["omim_id"])) or str(diag["omim_id"])
        # ignore entries without omim id
        if ps_res == '0':
            continue
        if diag["syndrome_name"] in ps_dict:
            ps_dict[diag["syndrome_name"]].add(ps_res)
        else:
            ps_dict[diag["syndrome_name"]] = {ps_res}

    # compact ps_dict based on omim ids
    reduced_ps_dict = {}
    for key, series in ps_dict.items():
        contained = False
        for other_key, other_series in ps_dict.items():
            if other_key == key:
                continue
            if series <= other_series:
                contained = True
        if not contained:
            reduced_ps_dict[key] = series

    if len(reduced_ps_dict) > 1:
        issues.append({
            "type": "MULTI_DIAGNOSIS",
            "data": {
                "orig": list(diagnosis["omim_id"]),
                "names": list(reduced_ps_dict.keys()),
                "converted_ids": [
                    e for v in reduced_ps_dict.values() for e in v
                ]
            }
        })
        valid = False

    if not self.get_variants():
        raw_entries = self.data.get_genomic_entries()
        if not raw_entries:
            issues.append({
                "type": "NO_GENOMIC",
            })
        else:
            issues.append({"type": "MALFORMED_HGVS", "data": raw_entries})
        valid = False
    else:
        # check whether there are multiple different disease-causing genes
        raw_entries = self.data.get_genomic_entries()
        entries = self.hgvs_models
        if len(entries) > 1:
            genes = [
                entry.gene['gene_id'] for entry in entries
                if entry.gene['gene_id']
            ]
            if len(set(genes)) > 1:
                issues.append({
                    "type": "MULTI_DIFFERENT_DISEASE_CAUSING_GENE",
                    "data": raw_entries
                })
                valid = False
    return valid, issues
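# A standalone sketch of the subset-reduction step in check(): a syndrome
# whose phenotypic-series set is contained in another syndrome's set is
# dropped, so only independent diagnoses count toward MULTI_DIAGNOSIS.
# The series IDs below are made up for illustration.
_ps_dict = {
    "Syndrome A": {"PS100000"},
    "Syndrome B": {"PS100000", "PS200000"},
    "Syndrome C": {"PS300000"},
}
_reduced = {}
for _key, _series in _ps_dict.items():
    _contained = any(
        _series <= _other
        for _other_key, _other in _ps_dict.items()
        if _other_key != _key
    )
    if not _contained:
        _reduced[_key] = _series
# Syndrome A is dropped because {'PS100000'} is a subset of Syndrome B's set
print(_reduced)  # {'Syndrome B': {...}, 'Syndrome C': {'PS300000'}}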
def _create_gene_list(self) -> List[dict]:
    '''Get a list of genes from the detected syndromes by inferring
    gene-phenotype mappings from the phenomizer and OMIM.
    '''
    syndromes = self.phenomized.to_dict("records")
    phenotypic_series_mapping = {}
    for syndrome in syndromes:
        disease_id = syndrome["omim_id"]
        phenotypic_series = OMIM_INST.omim_id_to_phenotypic_series(
            str(disease_id)) or str(disease_id)
        syndrome_name = (syndrome["syndrome_name"]
                         or syndrome["disease-name_pheno"]
                         or syndrome["disease-name_boqa"])
        genes = list(OMIM_INST.mim_pheno_to_gene(disease_id).values())
        if "gene-id" in syndrome and syndrome["gene-id"]:
            genes += [{
                "gene_id": eid,
                "gene_symbol": OMIM_INST.entrez_id_to_symbol(eid),
                "gene_omim_id": OMIM_INST.entrez_id_to_mim_gene(eid)
            } for eid in syndrome["gene-id"].split(", ")
                if eid not in [g["gene_id"] for g in genes]]
        for gene in genes:
            if not gene["gene_id"]:
                continue
            # uniqueness constraint on syndrome_name and gene_id
            key = "{}|{}".format(syndrome_name, gene["gene_id"])
            pheno_score = syndrome["pheno_score"] \
                if "pheno_score" in syndrome else 0.0
            boqa_score = syndrome["boqa_score"] \
                if "boqa_score" in syndrome else 0.0
            update_data = dict(
                {
                    "disease_id": disease_id,
                    "phenotypic_series": phenotypic_series,
                    "syndrome_name": syndrome_name,
                    "gestalt_score": syndrome["gestalt_score"],
                    "feature_score": syndrome["feature_score"],
                    "combined_score": syndrome["combined_score"],
                    "pheno_score": pheno_score,
                    "boqa_score": boqa_score,
                    "has_mask": syndrome["has_mask"],
                }, **gene)
            if key in phenotypic_series_mapping:
                # use the largest scores of two identical mappings
                phenotypic_series_mapping[key] = {
                    k: (max(v, update_data[k])
                        if not isinstance(v, str)
                        else update_data[k] or v)
                    for k, v in phenotypic_series_mapping[key].items()
                }
            else:
                phenotypic_series_mapping[key] = update_data
    return list(phenotypic_series_mapping.values())
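# A standalone sketch of the merge rule in _create_gene_list(): when two
# rows share a syndrome|gene key, numeric fields keep the maximum while
# string fields prefer the newer non-empty value. The values are made up.
_existing = {"gestalt_score": 0.4, "syndrome_name": "", "gene_id": "2200"}
_update = {"gestalt_score": 0.7, "syndrome_name": "Marfan", "gene_id": "2200"}
_merged = {
    k: (max(v, _update[k]) if not isinstance(v, str) else _update[k] or v)
    for k, v in _existing.items()
}
print(_merged)
# {'gestalt_score': 0.7, 'syndrome_name': 'Marfan', 'gene_id': '2200'}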
def get_syndrome_suggestions_and_diagnosis(self) -> pandas.DataFrame:
    '''Return a pandas dataframe containing all suggested syndromes and
    the selected syndromes, joined into one table with the confirmed
    column marking the clinically confirmed entries.
    '''
    # create a dataframe from the list of detected syndromes
    if self._js["case_data"]["suggested_syndromes"]:
        syndromes_df = pandas.DataFrame.from_dict(
            self.convert_lab_suggested_syndrome(
                self._js["case_data"]['suggested_syndromes']))
    else:
        syndromes_df = pandas.DataFrame(columns=[
            "omim_id", "gestalt_score", "combined_score",
            "feature_score", "has_mask", "syndrome_name"
        ])
    # force omim_id to always be a list, required for exploding the df
    syndromes_df['omim_id'] = syndromes_df['omim_id'].apply(
        OMIM_INST.replace_deprecated_all)
    # turn the omim_id list into multiple rows with the other properties
    # duplicated
    syndromes_df = explode_df_column(syndromes_df, 'omim_id')
    syndromes_df['omim_id'] = syndromes_df['omim_id'].astype(int)

    # prepare the confirmed diagnosis for joining with the main syndrome
    # dataframe
    if self._js['case_data']['selected_syndromes']:
        selected_syndromes = [
            dict(s,
                 omim_id=OMIM_INST.replace_deprecated_all(s["omim_id"])
                 or ["0"])
            for s in self.convert_lab_selected_syndrome(
                self._js["case_data"]["selected_syndromes"])
        ]
        selected = pandas.DataFrame.from_dict(selected_syndromes)
        # create multiple rows from the list of omim_id entries,
        # duplicating the other information
        selected = explode_df_column(selected, 'omim_id')
        selected['omim_id'] = selected['omim_id'].astype(int)
        # add confirmed and differential diagnosis columns
        selected.loc[
            selected["diagnosis"].isin(constants.CONFIRMED_DIAGNOSIS),
            'confirmed'] = True
        selected.loc[
            selected["diagnosis"].isin(constants.DIFFERENTIAL_DIAGNOSIS),
            'differential'] = True
        # outer join of the syndromes and the confirmed diagnosis;
        # pandas.merge has to be used instead of join, because the latter
        # only joins on indices
        syndromes_df = syndromes_df.merge(
            selected, on=['omim_id', 'syndrome_name'], how='outer')
        # set all entries not present in the selected syndromes to
        # neither confirmed nor differential
        syndromes_df = syndromes_df.fillna(
            {'confirmed': False, 'differential': False})
        # merge has_mask
        syndromes_df["has_mask"] = \
            syndromes_df["has_mask_x"].astype(bool) \
            | syndromes_df["has_mask_y"].astype(bool)
        syndromes_df.drop(
            ["has_mask_x", "has_mask_y"], inplace=True, axis=1)
        # reset the index for continuous indexing after the join and
        # explode operations
        syndromes_df = syndromes_df.reset_index(drop=True)
    else:
        # if no syndromes were selected, everything is unconfirmed
        syndromes_df["confirmed"] = False
        syndromes_df["differential"] = False

    syndromes_df['omim_id'] = syndromes_df['omim_id'].astype(int)
    return syndromes_df
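# A standalone sketch of what explode_df_column() is assumed to do, based
# on its use above: emit one row per omim_id list entry, duplicating the
# remaining columns. pandas (>= 0.25) ships this as DataFrame.explode;
# the sample values below are made up for illustration.
import pandas

_df = pandas.DataFrame({
    "syndrome_name": ["Alpha", "Beta"],
    "omim_id": [["100000", "100001"], ["200000"]],
})
_exploded = _df.explode("omim_id").reset_index(drop=True)
print(_exploded)
#   syndrome_name omim_id
# 0         Alpha  100000
# 1         Alpha  100001
# 2          Beta  200000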