def get_entity(q):  # noqa: E501
    """Look up CURIE and synonym information for a search term.

    # noqa: E501

    :param q: Strings to search by (name, abbreviation, CURIE, etc.). The
        parameter may be repeated to supply several search strings.
    :type q: List[str]

    :rtype: object
    """
    # The request is handed straight to the NodeSynonymizer; its result is
    # returned to the caller unmodified.
    return NodeSynonymizer().get_normalizer_results(q)
def get_curie_names(curie: Union[str, List[str]], log: ARAXResponse) -> Dict[str, str]:
    """Map each input curie to the label NodeSynonymizer holds for it.

    :param curie: a single curie or a list of curies
    :param log: response object used for debug/warning/error messages
    :return: dict of input curie -> value of the matching node's "label"
        entry; curies that are unrecognized, or that have no node with a
        matching identifier, are left out (a warning is logged instead)
    """
    curies = convert_to_list(curie)
    synonymizer = NodeSynonymizer()
    log.debug(f"Looking up names for {len(curies)} input curies using NodeSynonymizer")
    synonymizer_info = synonymizer.get_normalizer_results(curies)
    curie_to_name_map = dict()

    # Nothing usable came back at all: report it and hand back the empty map.
    if not synonymizer_info:
        log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
        return curie_to_name_map

    # An input counts as recognized only when its entry is non-empty.
    recognized_input_curies = {
        candidate for candidate in synonymizer_info
        if synonymizer_info.get(candidate)
    }
    unrecognized_curies = set(curies).difference(recognized_input_curies)
    if unrecognized_curies:
        log.warning(f"NodeSynonymizer did not recognize: {unrecognized_curies}")

    input_curies_without_matching_node = set()
    for input_curie in recognized_input_curies:
        equivalent_nodes = synonymizer_info[input_curie]["nodes"]
        # Try the exact identifier first, then a slight variation
        # (KG2 vs. SRI discrepancy): "KEGG:C02700" vs. "KEGG.COMPOUND:C02700"
        identifiers_to_try = (input_curie, input_curie.replace(".COMPOUND", ""))
        matching_nodes = []
        for identifier in identifiers_to_try:
            matching_nodes = [
                node for node in equivalent_nodes
                if node["identifier"] == identifier
            ]
            if matching_nodes:
                break

        if not matching_nodes:
            input_curies_without_matching_node.add(input_curie)
            continue
        # Record the name of the first node that matched this input curie
        curie_to_name_map[input_curie] = matching_nodes[0].get("label")

    if input_curies_without_matching_node:
        log.warning(
            f"No matching nodes found in NodeSynonymizer for these input curies: "
            f"{input_curies_without_matching_node}. Cannot determine their specific names."
        )
    return curie_to_name_map
def post_entity(body):  # noqa: E501
    """Look up CURIE and synonym information for a list of search terms.

    # noqa: E501

    :param body: List of terms to get information about
    :type body:

    :rtype: EntityQuery
    """
    # The body is handed straight to the NodeSynonymizer; its result is
    # returned to the caller unmodified.
    return NodeSynonymizer().get_normalizer_results(body)
class MapCurieToOMOP:
    """This class is used to map the curie name to OMOP concept ids.

    Workflow: construct the instance for a knowledge graph, call
    ``get_synonyms`` to collect the equivalent identifiers of the nodes of
    interest, then call ``map_curie_to_OMOP`` to attach OMOP concept ids to
    every collected node.
    """

    # Vocabulary prefix as found in a synonym -> spelling produced by
    # `change_format`.
    _VOCABULARY_RENAMES = {
        "ICD-10": "ICD10",
        "ICD-9": "ICD9CM",
        "MESH": "MeSH",
        "RXNORM": "RxNorm",
        "SNOMEDCT": "SNOMED",
        "SNOMEDCT_VET": "SNOMED",
        "MEDDRA": "MedDRA",
    }

    # "OMIM" and "Orphanet" will cause 500 return status which is
    # 'Internal Server Error' and "CHEMBL.COMPOUND" is not accepted by API.
    _OXO_SKIPPED_PREFIXES = ("OMIM", "Orphanet", "CHEMBL.COMPOUND")

    # Mapping distances tried against OxO, nearest first.
    _OXO_DISTANCES = (1, 2, 3)

    #### Constructor
    def __init__(self, kg="KG1"):
        """Initialize the class instance.

        Args:
            kg (str, optional): the name of knowledge provider e.g. "KG1" or "KG2". Defaults to "KG1".

        Raises:
            ValueError: if 'kg' is neither "KG1" nor "KG2" (case-insensitive).
            FileNotFoundError: if the KGmetadata file for 'kg' does not exist.
        """
        kg = kg.upper()
        self.kg = kg
        self.get_synonyms_done = False
        self.synonymizer = NodeSynonymizer()

        ## set up the path of KGmetadata
        pre_path = os.path.sep.join(
            [*pathlist[:(RTXindex + 1)], 'data', 'KGmetadata'])
        if kg not in ("KG1", "KG2"):
            raise ValueError("The parameter 'kg' only accepts 'KG1' or 'KG2'")
        fpath = pre_path + f"/NodeNamesDescriptions_{kg}.tsv"

        ## read KGmetadata
        try:
            self.kpdata = pd.read_csv(fpath,
                                      sep="\t",
                                      header=None,
                                      names=['curie', 'name', 'type'])
        except FileNotFoundError as err:
            raise FileNotFoundError(
                "Please go to $RTX/data/KGmetadata and run 'python3 KGNodeIndex.py -b' first"
            ) from err

        ## group every name and type seen for a curie into sets
        self.kpdata_dict = dict()
        rows = zip(self.kpdata['curie'], self.kpdata['name'],
                   self.kpdata['type'])
        for curie, name, node_type in rows:
            entry = self.kpdata_dict.setdefault(curie, {
                'name': set(),
                'type': set()
            })
            entry['name'].add(name)
            entry['type'].add(node_type)

    def get_synonyms(self, curie_type):
        """Get the synonyms of nodes with certain type.

        The result is also stored in ``self.synonyms_dict`` and
        ``self.get_synonyms_done`` is set to True.

        Args:
            curie_type (str or list): the type of curie nodes in specified knowledge provider (Required)
                e.g., "disease" or "phenotypic_feature" or ['disease', 'phenotypic_feature']

        Returns:
            dict: a dict containing synonym list, name and type for each node

        Raises:
            ValueError: if 'curie_type' is not a str or list, or names a type
                that is absent from the loaded KGmetadata.
        """
        ## check the input parameters
        if isinstance(curie_type, str):
            requested_types = [curie_type]
        elif isinstance(curie_type, list):
            requested_types = curie_type
        else:
            raise ValueError(
                "The parameter 'curie_type' should be str or list")

        # Build the set of known types once rather than once per requested type.
        known_types = set(self.kpdata['type'])
        for requested in requested_types:
            if requested not in known_types:
                raise ValueError(
                    f"The curie type '{requested}' is not a category in {self.kg}. Please check your spelling."
                )

        ## use NodeSynonymizer to find the node synonyms and their biomedical vocabularies
        wanted_types = set(requested_types)
        self.synonyms_dict = dict()
        for curie, info in self.kpdata_dict.items():
            if info['type'].isdisjoint(wanted_types):
                continue
            res = self.synonymizer.get_normalizer_results(curie,
                                                          kg_name=self.kg)
            synonym_list = list({
                row['identifier']
                for row in res[curie]['equivalent_identifiers']
            })
            self.synonyms_dict[curie] = {
                'name': info['name'],
                'type': info['type'],
                'synonyms': synonym_list
            }

        self.get_synonyms_done = True
        return self.synonyms_dict

    @staticmethod
    def change_format(synonym):
        """Change the format of synonym in order to match OMOP concept id.

        The vocabulary prefix is the first part of a "VOCAB:CODE" synonym, or
        the middle part of a three-part "X:VOCAB:CODE" synonym. A synonym with
        any other number of ':'-separated parts is returned unchanged.

        Args:
            synonym (str): a synonym of a curie name

        Returns:
            str: a new format of synonym of a curie name
        """
        parts = synonym.split(':')
        if len(parts) == 2:
            vocabulary_id = parts[0]
        elif len(parts) == 3:
            vocabulary_id = parts[1]
        else:
            # Malformed synonym: nothing sensible to rename.
            return synonym

        renamed = MapCurieToOMOP._VOCABULARY_RENAMES.get(vocabulary_id)
        if renamed is None:
            return synonym
        return synonym.replace(vocabulary_id, renamed)

    def _get_OMOP(self, synonym):
        """Return the OMOP concept ids whose 'curie_name' equals the synonym.

        Reads ``self.concept_table_select`` (columns 'concept_id' and
        'curie_name'), which `map_curie_to_OMOP` builds.
        """
        # 'curie_name' is built fully upper-cased by map_curie_to_OMOP, so the
        # reformatted synonym (e.g. "MeSH:...") must be upper-cased as well or
        # mixed-case vocabularies can never match.
        curie_name = MapCurieToOMOP.change_format(synonym).upper()
        table = self.concept_table_select
        return list(table.loc[table['curie_name'] == curie_name,
                              'concept_id'])

    def call_oxo_API(self, key):
        """Call OxO (the EMBL-EBI Ontology Xref Service) API to find ontology mapping.

        Mapping distances 1, 2 and 3 are tried in turn; the first distance
        that yields a curie present in ``self.concept_table_select`` wins. A
        non-200 response is printed and ends the search.

        Args:
            key (str): the curie name e.g. "DRUGBANK:DB05024", "CUI:C0876032", "CUI:C0908863"

        Returns:
            tuple: a tuple containing curie name and its corresponding OMOP concept ids if it has mapping.
        """
        synonyms = [
            MapCurieToOMOP.change_format(synonym)
            for synonym in self.synonyms_dict[key]['synonyms']
            if synonym.split(":")[0] not in self._OXO_SKIPPED_PREFIXES
        ]
        if not synonyms:
            return (key, [])

        query_ids = ",".join(synonyms)
        table = self.concept_table_select
        for dist in self._OXO_DISTANCES:
            url = f"https://www.ebi.ac.uk/spot/oxo/api/search?format=json&ids={query_ids}&distance={dist}"
            res = requests.get(url)
            if res.status_code != 200:
                print(f"{key}\tError {res.status_code}: {url}", flush=True)
                return (key, [])

            # A set makes the membership test below O(1) per table row.
            res_curies = {
                mapping['curie'].upper()
                for item in res.json()['_embedded']['searchResults']
                for mapping in item['mappingResponseList']
            }
            matched = table['curie_name'].isin(res_curies)
            if matched.any():
                return (key, list(table.loc[matched, 'concept_id']))

        return (key, [])

    def map_curie_to_OMOP(self, pre_run_dict=None):
        """Map curies to OMOP ids based on the synonyms returned from NodeSynonymizer.

        Each entry of ``self.synonyms_dict`` gains a "concept_ids" list. When
        no concept id is found directly through the synonyms, the OxO API is
        queried as a fallback.

        Args:
            pre_run_dict (str or dict, optional): the path of result saved as pickle file returned from 'get_synonyms' method
                or the dict object returned from 'get_synonyms' method e.g. 'synonyms_kg1.pkl' or kg2_synonyms. Defaults to None.

        Returns:
            dict: a dict containing OMOP id list,synonym list, name and type for each node.
                An empty dict is returned when a precondition fails (a message is printed).
        """
        if pre_run_dict is None:
            if not self.get_synonyms_done:
                print(
                    "Please run 'get_synonyms' method first before run this method."
                )
                return {}
        elif isinstance(pre_run_dict, str):
            if not os.path.exists(pre_run_dict):
                print(f"Can't find {pre_run_dict}")
                return {}
            # NOTE: pickle.load executes arbitrary code from the file; only
            # pass paths to pickles produced by a trusted source.
            with open(pre_run_dict, 'rb') as file:
                self.synonyms_dict = pickle.load(file)
        elif isinstance(pre_run_dict, dict):
            self.synonyms_dict = pre_run_dict
        else:
            print("The parameter 'pre_run_dict' is not a str or a dict.")
            return {}

        infolder = os.path.sep.join([
            *pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'KnowledgeSources',
            'COHD_local', 'data'
        ])
        try:
            infile_path = infolder + '/Athena_tables/ALL_CONCEPT_filtered.txt'
            concept_table = pd.read_csv(infile_path, sep='\t', index_col=None)
        except FileNotFoundError:
            print(f"Can't find {infile_path}")
            return {}

        # Lookup table: concept id next to an upper-cased
        # "VOCABULARY_ID:CONCEPT_CODE" name. Built as a fresh frame so no
        # column is ever assigned onto a slice of 'concept_table'.
        curie_names = [
            str(vocabulary_id).upper() + ":" + str(concept_code).upper()
            for vocabulary_id, concept_code in zip(
                concept_table['vocabulary_id'], concept_table['concept_code'])
        ]
        self.concept_table_select = pd.DataFrame({
            'concept_id': concept_table['concept_id'],
            'curie_name': curie_names,
        })

        for key in self.synonyms_dict:
            print(key, flush=True)
            synonym_list = self.synonyms_dict[key]['synonyms']
            concept_ids = set(
                itertools.chain.from_iterable(
                    map(self._get_OMOP, synonym_list)))
            if not concept_ids:
                # No direct hit: fall back to the OxO cross-reference service.
                key, oxo_concept_ids = self.call_oxo_API(key)
                concept_ids = set(oxo_concept_ids)
            self.synonyms_dict[key]["concept_ids"] = list(concept_ids)

        return self.synonyms_dict