def __init__(self, pubmed_directory_path, is_test):
    """Set up state for the NGD database build.

    :param pubmed_directory_path: directory containing the pubmed files to process
    :param is_test: flag stored for downstream build steps to consult
    """
    self.pubmed_directory_path = pubmed_directory_path
    # Output database file names (relative paths — presumably created in the
    # current working directory; confirm against the build steps)
    self.conceptname_to_pmids_db_path = "conceptname_to_pmids.db"
    self.curie_to_pmids_db_path = "curie_to_pmids.sqlite"
    self.status = 'OK'
    self.synonymizer = NodeSynonymizer()
    self.is_test = is_test
def get_canonical_curies_list(curie: Union[str, List[str]], log: ARAXResponse) -> List[str]:
    """Map the given curie(s) to their canonical ('preferred') curies.

    Input curies the NodeSynonymizer can't resolve are passed through unchanged,
    so every input is represented in the output. Returns [] if the synonymizer
    call fails or returns None.
    """
    curies = convert_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(curies)} curies")
        canonical_curies_dict = synonymizer.get_canonical_curies(curies)
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}",
                  error_code=error_type.__name__)
        return []
    # Guard clause: a None response means normalization failed outright
    if canonical_curies_dict is None:
        log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
        return []
    recognized_input_curies = set()
    for input_curie in canonical_curies_dict:
        if canonical_curies_dict.get(input_curie):
            recognized_input_curies.add(input_curie)
    unrecognized_curies = set(curies) - recognized_input_curies
    if unrecognized_curies:
        log.warning(f"NodeSynonymizer did not return canonical info for: {unrecognized_curies}")
    canonical_curies = {canonical_curies_dict[recognized_curie].get('preferred_curie')
                        for recognized_curie in recognized_input_curies}
    # Include any original curies we weren't able to find a canonical version for
    canonical_curies.update(unrecognized_curies)
    if not canonical_curies:
        log.error(f"Final list of canonical curies is empty. This shouldn't happen!",
                  error_code="CanonicalCurieIssue")
    return list(canonical_curies)
def estimate_percent_nodes_with_mesh_mapping_via_synonymizer(kg: str):
    """Estimate (by random sampling) what percent of nodes in the given KG can be
    mapped to at least one MESH curie via the NodeSynonymizer; prints the results."""
    print(f"Estimating the percent of {kg} nodes mappable to a MESH curie via NodeSynonymizer")
    num_batches = 20
    batch_size = 4000
    percentages_with_mesh = []
    for batch_number in range(num_batches):
        print(f" Batch {batch_number + 1}")
        # Get random selection of node IDs from the KG
        random_node_ids = _get_random_node_ids(batch_size, kg)
        # Use synonymizer to get their equivalent curies and check for a MESH term
        print(f" Getting equivalent curies for those random node IDs..")
        synonymizer = NodeSynonymizer()
        curie_synonym_info = synonymizer.get_equivalent_curies(list(random_node_ids), kg_name='KG2')
        # A node counts as covered when any of its synonyms starts with 'MESH'
        num_curies_with_mesh_term = sum(
            1 for synonym_curies in curie_synonym_info.values()
            if synonym_curies and any(synonym.startswith('MESH') for synonym in synonym_curies))
        percentage_with_mesh = (num_curies_with_mesh_term / len(random_node_ids)) * 100
        print(f" {percentage_with_mesh}% of nodes had a synonym MESH term in this batch.")
        percentages_with_mesh.append(percentage_with_mesh)
    print(f" Percentages for all batches: {percentages_with_mesh}.")
    average = sum(percentages_with_mesh) / len(percentages_with_mesh)
    print(f"Final estimate of {kg} nodes mappable to a MESH term via NodeSynonymizer: {round(average)}%")
def get_preferred_categories(curie: Union[str, List[str]], log: ARAXResponse) -> Optional[List[str]]:
    """Return the preferred Biolink categories for the given curie(s).

    Falls back to ["biolink:NamedThing"] when no preferred categories can be
    determined, and returns [] if the NodeSynonymizer returns None.
    """
    curies = convert_to_list(curie)
    synonymizer = NodeSynonymizer()
    log.debug(f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(curies)} curies")
    canonical_curies_dict = synonymizer.get_canonical_curies(curies)
    log.debug(f"Got response back from NodeSynonymizer")
    if canonical_curies_dict is None:
        log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
        return []
    recognized_input_curies = {input_curie for input_curie in canonical_curies_dict
                               if canonical_curies_dict.get(input_curie)}
    unrecognized_curies = set(curies).difference(recognized_input_curies)
    if unrecognized_curies:
        log.warning(f"NodeSynonymizer did not recognize: {unrecognized_curies}")
    preferred_categories = {canonical_curies_dict[recognized_curie].get('preferred_category')
                            for recognized_curie in recognized_input_curies}
    # Bug fix: entries lacking a 'preferred_category' yield None from .get(),
    # which previously could be returned inside the list; drop them here
    preferred_categories.discard(None)
    if preferred_categories:
        return list(preferred_categories)
    else:
        log.warning(f"Unable to find any preferred categories; will default to biolink:NamedThing")
        return ["biolink:NamedThing"]
def get_canonical_curies_dict(curie: Union[str, List[str]], log: ARAXResponse) -> Dict[str, Dict[str, str]]:
    """Return the NodeSynonymizer's canonical-curie info dict for the given curie(s).

    Returns {} if the synonymizer call fails or returns None; logs a warning for
    any input curies that came back without canonical info.
    """
    curies = convert_string_or_list_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(curies)} curies")
        canonical_curies_dict = synonymizer.get_canonical_curies(curies)
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}",
                  error_code=error_type.__name__)
        return {}
    # Guard clause: None means normalization failed outright
    if canonical_curies_dict is None:
        log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
        return {}
    unrecognized_curies = {input_curie for input_curie, info in canonical_curies_dict.items()
                           if not info}
    if unrecognized_curies:
        log.warning(f"NodeSynonymizer did not return canonical info for: {unrecognized_curies}")
    return canonical_curies_dict
def _get_node_synonyms(knowledge_graph):
    """Return a map from each node key in the knowledge graph to the set of its
    equivalent curies, as reported by the NodeSynonymizer."""
    synonymizer = NodeSynonymizer()
    # Idiom fix: set(dict) iterates keys directly; the old
    # `{key for key in knowledge_graph.nodes.keys()}` was a redundant comprehension
    node_keys = set(knowledge_graph.nodes)
    equivalent_curie_info = synonymizer.get_equivalent_nodes(node_keys)
    return {node_key: set(equivalent_curies_dict)
            for node_key, equivalent_curies_dict in equivalent_curie_info.items()}
def estimate_percent_nodes_covered_by_backup_method(kg: str):
    """Estimate (by random sampling) what percent of nodes in the given KG the
    'backup' NGD method can map to PMIDs (via eUtils); prints the results.

    NOTE(review): this performs live neo4j queries and remote eUtils calls, so
    runtimes and results depend on external services.
    """
    print(f"Estimating the percent of {kg} nodes mappable by the 'backup' NGD method (uses eUtils)")
    backup_ngd = NormGoogleDistance()
    synonymizer = NodeSynonymizer()
    percentages_mapped = []
    num_batches = 10
    batch_size = 10
    for number in range(num_batches):
        print(f" Batch {number + 1}")
        # Get random selection of nodes from the KG
        query = f"match (a) return a.id, a.name, rand() as r order by r limit {batch_size}"
        results = _run_cypher_query(query, kg)
        canonical_curie_info = synonymizer.get_canonical_curies(
            [result['a.id'] for result in results])
        recognized_curies = {
            input_curie
            for input_curie in canonical_curie_info
            if canonical_curie_info.get(input_curie)
        }
        # Use the back-up NGD method to try to grab PMIDs for each
        num_with_pmids = 0
        for curie in recognized_curies:
            # Try to map this to a MESH term using the backup method (the chokepoint)
            node_id = canonical_curie_info[curie].get('preferred_curie')
            node_name = canonical_curie_info[curie].get('preferred_name')
            node_type = canonical_curie_info[curie].get('preferred_type')  # NOTE(review): unused below
            try:
                pmids = backup_ngd.get_pmids_for_all([node_id], [node_name])
            except Exception:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                print(f"ERROR using back-up method: {tb}")
            else:
                # Counted as covered when at least one non-empty PMID list came back
                if len(pmids) and ([pmid_list for pmid_list in pmids if pmid_list]):
                    num_with_pmids += 1
                    print(f" Found {len(pmids[0])} PMIDs for {node_id}, {node_name}.")
                else:
                    print(f" Not found. ({node_id}, {node_name})")
        # Denominator is recognized curies only (unrecognized ones are excluded)
        percentage_with_pmids = (num_with_pmids / len(recognized_curies)) * 100
        print(f" {percentage_with_pmids}% of nodes were mapped to PMIDs using backup method.")
        percentages_mapped.append(percentage_with_pmids)
    print(f" Percentages for all batches: {percentages_mapped}.")
    average = sum(percentages_mapped) / len(percentages_mapped)
    print(f"Final estimate of backup method's coverage of {kg} nodes: {round(average)}%")
def _get_node_synonyms(knowledge_graph):
    """Return a map from each node ID in the knowledge graph to the set of its
    equivalent curies, as reported by the NodeSynonymizer (against KG2)."""
    synonymizer = NodeSynonymizer()
    ids_to_look_up = {node.id for node in knowledge_graph.nodes}
    synonym_info = synonymizer.get_equivalent_nodes(ids_to_look_up, kg_name='KG2')
    synonyms_by_node_id = dict()
    for node_id, equivalent_curies_dict in synonym_info.items():
        synonyms_by_node_id[node_id] = set(equivalent_curies_dict)
    return synonyms_by_node_id
def _canonicalize_nodes(kg2pre_nodes: List[Dict[str, any]]) -> Tuple[Dict[str, Dict[str, any]], Dict[str, str]]:
    """Collapse KG2pre nodes into canonicalized nodes keyed by preferred curie.

    Returns a tuple of:
      1. canonicalized_nodes: preferred curie -> merged canonical node dict
      2. curie_map: original KG2pre curie -> its canonicalized curie

    Side effect: dumps the equivalent-curies mapping to
    {KG2C_DIR}/equivalent_curies.pickle for use by a downstream script.
    """
    logging.info(f"Canonicalizing nodes..")
    synonymizer = NodeSynonymizer()
    node_ids = [node.get('id') for node in kg2pre_nodes if node.get('id')]
    logging.info(f" Sending NodeSynonymizer.get_canonical_curies() {len(node_ids)} curies..")
    canonicalized_info = synonymizer.get_canonical_curies(curies=node_ids, return_all_categories=True)
    all_canonical_curies = {canonical_info['preferred_curie']
                            for canonical_info in canonicalized_info.values() if canonical_info}
    logging.info(f" Sending NodeSynonymizer.get_equivalent_nodes() {len(all_canonical_curies)} curies..")
    equivalent_curies_info = synonymizer.get_equivalent_nodes(all_canonical_curies)
    recognized_curies = {curie for curie in equivalent_curies_info if equivalent_curies_info.get(curie)}
    equivalent_curies_dict = {curie: list(equivalent_curies_info.get(curie)) for curie in recognized_curies}
    # Save these for use by downstream script
    with open(f"{KG2C_DIR}/equivalent_curies.pickle", "wb") as equiv_curies_dump:
        pickle.dump(equivalent_curies_dict, equiv_curies_dump, protocol=pickle.HIGHEST_PROTOCOL)
    logging.info(f" Creating canonicalized nodes..")
    curie_map = dict()
    canonicalized_nodes = dict()
    for kg2pre_node in kg2pre_nodes:
        # Grab relevant info for this node and its canonical version; fall back
        # to the node's own id when no canonical info was returned for it
        canonical_info = canonicalized_info.get(kg2pre_node['id'])
        canonicalized_curie = canonical_info.get('preferred_curie', kg2pre_node['id']) if canonical_info else kg2pre_node['id']
        publications = kg2pre_node['publications'] if kg2pre_node.get('publications') else []
        descriptions_list = [kg2pre_node['description']] if kg2pre_node.get('description') else []
        if canonicalized_curie in canonicalized_nodes:
            # Merge this node into its corresponding canonical node
            existing_canonical_node = canonicalized_nodes[canonicalized_curie]
            existing_canonical_node['publications'] = _merge_two_lists(existing_canonical_node['publications'], publications)
            existing_canonical_node['all_names'] = _merge_two_lists(existing_canonical_node['all_names'], [kg2pre_node['name']])
            existing_canonical_node['descriptions_list'] = _merge_two_lists(existing_canonical_node['descriptions_list'], descriptions_list)
            # Make sure any nodes subject to #1074-like problems still appear in equivalent curies
            existing_canonical_node['equivalent_curies'] = _merge_two_lists(existing_canonical_node['equivalent_curies'], [kg2pre_node['id']])
            # Add the IRI for the 'preferred' curie, if we've found that node
            if kg2pre_node['id'] == canonicalized_curie:
                existing_canonical_node['iri'] = kg2pre_node.get('iri')
        else:
            # Initiate the canonical node for this synonym group
            name = canonical_info['preferred_name'] if canonical_info else kg2pre_node['name']
            category = canonical_info['preferred_category'] if canonical_info else kg2pre_node['category']
            all_categories = list(canonical_info['all_categories']) if canonical_info else [kg2pre_node['category']]
            iri = kg2pre_node['iri'] if kg2pre_node['id'] == canonicalized_curie else None
            all_names = [kg2pre_node['name']]
            canonicalized_node = _create_node(preferred_curie=canonicalized_curie,
                                              name=name,
                                              category=category,
                                              all_categories=all_categories,
                                              publications=publications,
                                              equivalent_curies=equivalent_curies_dict.get(canonicalized_curie, [canonicalized_curie]),
                                              iri=iri,
                                              description=None,
                                              descriptions_list=descriptions_list,
                                              all_names=all_names)
            canonicalized_nodes[canonicalized_node['id']] = canonicalized_node
        curie_map[kg2pre_node['id']] = canonicalized_curie  # Record this mapping for easy lookup later
    logging.info(f"Number of KG2pre nodes was reduced to {len(canonicalized_nodes)} "
                 f"({round((len(canonicalized_nodes) / len(kg2pre_nodes)) * 100)}%)")
    return canonicalized_nodes, curie_map
def estimate_percent_nodes_covered_by_ultrafast_ngd(kg: str):
    """Estimate (by random sampling) what percent of nodes in the given KG have
    an entry in the local curie->PMIDs sqlite database; prints the overall
    coverage and a per-node-type breakdown."""
    print(f"Estimating the percent of {kg} nodes covered by the local NGD system..")
    curie_to_pmid_db = SqliteDict(f"./curie_to_pmids.sqlite")
    percentages_mapped = []
    num_batches = 20
    batch_size = 4000
    all_nodes_mapped_by_type = dict()
    for number in range(num_batches):
        # Get random selection of node IDs from the KG
        random_node_ids = _get_random_node_ids(batch_size, kg)
        # Use synonymizer to get their canonicalized info
        synonymizer = NodeSynonymizer()
        canonical_curie_info = synonymizer.get_canonical_curies(list(random_node_ids))
        recognized_curies = {
            input_curie
            for input_curie in canonical_curie_info
            if canonical_curie_info.get(input_curie)
        }
        # See if those canonical curies are in our local database
        num_mapped_to_pmids = 0
        for input_curie in recognized_curies:
            canonical_curie = canonical_curie_info[input_curie].get('preferred_curie')
            preferred_type = canonical_curie_info[input_curie].get('preferred_type')
            if preferred_type not in all_nodes_mapped_by_type:
                all_nodes_mapped_by_type[preferred_type] = {'covered': 0, 'not_covered': 0}
            if canonical_curie and canonical_curie in curie_to_pmid_db:
                num_mapped_to_pmids += 1
                all_nodes_mapped_by_type[preferred_type]['covered'] += 1
            else:
                all_nodes_mapped_by_type[preferred_type]['not_covered'] += 1
        # NOTE: denominator is the whole random sample, so unrecognized curies
        # count against coverage
        percentage_mapped = (num_mapped_to_pmids / len(random_node_ids)) * 100
        percentages_mapped.append(percentage_mapped)
    average = sum(percentages_mapped) / len(percentages_mapped)
    print(f"Estimated coverage of {kg} nodes: {round(average)}%.")
    # Summarize coverage per preferred node type, highest coverage first
    node_type_percentages_dict = dict()
    for node_type, coverage_info in all_nodes_mapped_by_type.items():
        num_covered = coverage_info['covered']
        num_total = coverage_info['covered'] + coverage_info['not_covered']
        percentage = round((num_covered / num_total) * 100)
        node_type_percentages_dict[node_type] = percentage
    for node_type, percentage in sorted(node_type_percentages_dict.items(),
                                        key=lambda item: item[1], reverse=True):
        print(f" {node_type}: {percentage}%")
def __init__(self, kg="KG1"):
    """Initialize the class instance.

    Args:
        kg (str, optional): the name of knowledge provider e.g. "KG1" or "KG2". Defaults to "KG1".

    Raises:
        ValueError: if kg is not 'KG1' or 'KG2'.
        FileNotFoundError: if the KGmetadata TSV has not been built yet.
    """
    kg = kg.upper()
    self.kg = kg
    self.get_synonyms_done = False
    self.synonymizer = NodeSynonymizer()
    ## set up the path of KGmetadata
    pre_path = os.path.sep.join([*pathlist[:(RTXindex + 1)], 'data', 'KGmetadata'])
    if kg == "KG1":
        fpath = pre_path + "/NodeNamesDescriptions_KG1.tsv"
    elif kg == "KG2":
        fpath = pre_path + "/NodeNamesDescriptions_KG2.tsv"
    else:
        raise ValueError("The parameter 'kg' only accepts 'KG1' or 'KG2'")
    ## read KGmetadata
    try:
        self.kpdata = pd.read_csv(fpath, sep="\t", header=None,
                                  names=['curie', 'name', 'type'])
    except FileNotFoundError:
        raise FileNotFoundError(
            "Please go to $RTX/data/KGmetadata and run 'python3 KGNodeIndex.py -b' first"
        )
    # Build curie -> {'name': set, 'type': set}. Iterating the columns in one
    # zipped pass replaces the original per-row scalar .loc lookups, which did
    # several O(1)-but-expensive DataFrame indexing calls for every row.
    self.kpdata_dict = dict()
    for curie, name, node_type in zip(self.kpdata['curie'],
                                      self.kpdata['name'],
                                      self.kpdata['type']):
        entry = self.kpdata_dict.setdefault(curie, {'name': set(), 'type': set()})
        entry['name'].add(name)
        entry['type'].add(node_type)
def __init__(self, pubmed_directory_path, is_test, live="Production"):
    """Set up paths and helpers for building the NGD databases.

    :param pubmed_directory_path: directory containing the pubmed files to process
    :param is_test: flag stored for downstream build steps to consult
    :param live: which RTX configuration to use (e.g. "Production")
    """
    self.RTXConfig = RTXConfiguration()
    self.RTXConfig.live = live
    # Location of the NormalizedGoogleDistance knowledge-source dir inside the RTX repo
    ngd_filepath = os.path.sep.join([
        *pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'KnowledgeSources',
        'NormalizedGoogleDistance'
    ])
    self.pubmed_directory_path = pubmed_directory_path
    self.conceptname_to_pmids_db_path = "conceptname_to_pmids.db"
    # The curie-to-pmids DB file name comes from the RTX config (basename of its configured path)
    self.curie_to_pmids_db_path = f"{ngd_filepath}{os.path.sep}{self.RTXConfig.curie_to_pmids_path.split('/')[-1]}"
    self.status = 'OK'
    self.synonymizer = NodeSynonymizer()
    self.is_test = is_test
def get_entity(q):  # noqa: E501
    """Obtain CURIE and synonym information about a search term  # noqa: E501

    :param q: A string to search by (name, abbreviation, CURIE, etc.). The parameter may be repeated for multiple search strings.
    :type q: List[str]

    :rtype: object
    """
    return NodeSynonymizer().get_normalizer_results(q)
def get_curie_names(curie: Union[str, List[str]], log: ARAXResponse) -> Dict[str, str]:
    """Look up a human-readable name for each input curie via the NodeSynonymizer.

    Only curies for which a matching synonymizer node (or its '.COMPOUND'-stripped
    variant) is found appear in the returned map.
    """
    curies = convert_to_list(curie)
    synonymizer = NodeSynonymizer()
    log.debug(f"Looking up names for {len(curies)} input curies using NodeSynonymizer")
    synonymizer_info = synonymizer.get_normalizer_results(curies)
    curie_to_name_map = dict()
    if not synonymizer_info:
        log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
        return curie_to_name_map
    recognized_input_curies = {c for c in synonymizer_info if synonymizer_info.get(c)}
    unrecognized_curies = set(curies) - recognized_input_curies
    if unrecognized_curies:
        log.warning(f"NodeSynonymizer did not recognize: {unrecognized_curies}")
    input_curies_without_matching_node = set()
    for input_curie in recognized_input_curies:
        equivalent_nodes = synonymizer_info[input_curie]["nodes"]
        # Find the 'node' in the synonymizer corresponding to this curie
        matching_nodes = [node for node in equivalent_nodes
                          if node["identifier"] == input_curie]
        if not matching_nodes:
            # Try looking for slight variation (KG2 vs. SRI discrepancy): "KEGG:C02700" vs. "KEGG.COMPOUND:C02700"
            stripped_curie = input_curie.replace(".COMPOUND", "")
            matching_nodes = [node for node in equivalent_nodes
                              if node["identifier"] == stripped_curie]
        # Record the name for this input curie
        if matching_nodes:
            curie_to_name_map[input_curie] = matching_nodes[0].get("label")
        else:
            input_curies_without_matching_node.add(input_curie)
    if input_curies_without_matching_node:
        log.warning(f"No matching nodes found in NodeSynonymizer for these input curies: "
                    f"{input_curies_without_matching_node}. Cannot determine their specific names.")
    return curie_to_name_map
def post_entity(body):  # noqa: E501
    """Obtain CURIE and synonym information about search terms  # noqa: E501

    :param body: List of terms to get information about
    :type body:

    :rtype: EntityQuery
    """
    return NodeSynonymizer().get_normalizer_results(body)
def __init__(self, is_test):
    """Configure logging and set up file paths/state for the NGD database build.

    :param is_test: flag stored for downstream build steps to consult
    """
    # Log both to a file and to the console
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s: %(message)s',
                        handlers=[
                            logging.FileHandler("ngdbuild.log"),
                            logging.StreamHandler()
                        ])
    self.pubmed_directory_path = f"{NGD_DIR}/pubmed_xml_files"
    self.conceptname_to_pmids_db_name = "conceptname_to_pmids.db"
    self.conceptname_to_pmids_db_path = f"{NGD_DIR}/{self.conceptname_to_pmids_db_name}"
    self.curie_to_pmids_db_name = "curie_to_pmids.sqlite"
    self.curie_to_pmids_db_path = f"{NGD_DIR}/{self.curie_to_pmids_db_name}"
    self.status = 'OK'
    self.synonymizer = NodeSynonymizer()
    self.is_test = is_test
def _get_canonical_curies_map(self, curies):
    """Map each input curie to its canonical ('preferred') curie.

    Curies without canonical info map to themselves; returns {} if the
    NodeSynonymizer call fails.
    """
    self.response.debug(f"Canonicalizing curies of relevant nodes using NodeSynonymizer")
    synonymizer = NodeSynonymizer()
    try:
        canonicalized_node_info = synonymizer.get_canonical_curies(curies)
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        self.response.error(f"Encountered a problem using NodeSynonymizer: {tb}",
                            error_code=error_type.__name__)
        return {}
    # Fall back to the input curie itself when no canonical info was found
    return {input_curie: (node_info.get('preferred_curie', input_curie) if node_info else input_curie)
            for input_curie, node_info in canonicalized_node_info.items()}
def get_entity_by_string(search_string):  # noqa: E501
    """Obtain the CURIE and type of some entity by name  # noqa: E501

    :param search_string: Some string to search by (name, abbreviation, CURIE, etc.)
    :type search_string: str

    :rtype: List[object]
    """
    synonymizer = NodeSynonymizer()
    result = synonymizer.get_canonical_curies(curies=search_string, names=search_string)
    entry = result[search_string]
    if entry is None:
        # No canonical info found for this search string
        return {}
    return {
        'curie': entry['preferred_curie'],
        'name': entry['preferred_name'],
        'type': entry['preferred_type']
    }
def __init__(self, response, message, parameters):
    """Set up the drug-repurposing predictor: make sure the model pickle and
    graph sqlite files exist locally (fetching them from arax.rtx.ai via scp if
    missing), then load them.

    :param response: response/log object stored for later use
    :param message: the message being operated on
    :param parameters: operation parameters
    """
    self.response = response
    self.message = message
    self.parameters = parameters
    self.global_iter = 0
    ## check if the new model files exists in /predictor/retrain_data. If not, scp it from arax.rtx.ai
    pathlist = os.path.realpath(__file__).split(os.path.sep)
    RTXindex = pathlist.index("RTX")
    filepath = os.path.sep.join([*pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'ARAXQuery',
                                 'Overlay', 'predictor', 'retrain_data'])

    ## check if there is LogModel.pkl
    pkl_file = f"{filepath}/LogModel.pkl"
    if os.path.exists(pkl_file):
        pass
    else:
        # NOTE(review): shelling out to scp assumes working ssh access to
        # arax.rtx.ai; consider subprocess.run([...]) (no shell) if revisited
        os.system("scp [email protected]:/home/ubuntu/drug_repurposing_model_retrain/LogModel.pkl " + pkl_file)

    ## check if there is GRAPH.sqlite
    db_file = f"{filepath}/GRAPH.sqlite"
    if os.path.exists(db_file):
        pass
    else:
        os.system("scp [email protected]:/home/ubuntu/drug_repurposing_model_retrain/GRAPH.sqlite " + db_file)

    # use NodeSynonymizer to replace map.txt
    # check if there is map.txt
    # map_file = f"{filepath}/map.txt"
    # if os.path.exists(map_file):
    #     pass
    # else:
    #     os.system("scp [email protected]:/home/ubuntu/drug_repurposing_model_retrain/map.txt " + map_file)

    self.pred = predictor(model_file=pkl_file)
    self.pred.import_file(None, graph_database=db_file)
    # with open(map_file, 'r') as infile:
    #     map_file_content = infile.readlines()
    #     map_file_content.pop(0)  ## remove title
    #     self.known_curies = set(line.strip().split('\t')[0] for line in map_file_content)
    self.synonymizer = NodeSynonymizer()
def get_curie_synonyms(curie: Union[str, List[str]], log: ARAXResponse) -> List[str]:
    """Return a sorted list of all curies equivalent to the given curie(s) (per KG2).

    The input curies themselves are always included, even when the synonymizer
    found no equivalents for them. Returns [] on synonymizer failure.
    """
    curies = convert_string_or_list_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(f"Sending NodeSynonymizer.get_equivalent_nodes() a list of {len(curies)} curies")
        equivalent_curies_dict = synonymizer.get_equivalent_nodes(curies, kg_name="KG2")
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}",
                  error_code=error_type.__name__)
        return []
    # Guard clause: None means normalization failed outright
    if equivalent_curies_dict is None:
        log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
        return []
    curies_missing_info = {c for c in equivalent_curies_dict
                           if not equivalent_curies_dict.get(c)}
    if curies_missing_info:
        log.warning(f"NodeSynonymizer did not find any equivalent curies for: {curies_missing_info}")
    # Make sure even curies without synonyms are included
    all_curies = set(curies)
    for curie_dict in equivalent_curies_dict.values():
        if curie_dict:
            all_curies.update(curie_dict)
    return sorted(all_curies)
def get_curie_synonyms_dict(curie: Union[str, List[str]],
                            log: Optional[ARAXResponse] = None) -> Dict[str, List[str]]:
    """Return a map from each input curie to its list of equivalent curies.

    Inputs the NodeSynonymizer can't resolve map to a one-item list containing
    the input curie itself. Returns an empty dict on synonymizer failure.

    Bug fix: the previous default of `log = ARAXResponse()` was evaluated once
    at function-definition time, so every caller that omitted `log` shared (and
    mutated) a single response object. A fresh ARAXResponse is now created per
    call when none is supplied (the mutable-default-argument anti-pattern).
    """
    if log is None:
        log = ARAXResponse()
    curies = convert_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(f"Sending NodeSynonymizer.get_equivalent_nodes() a list of {len(curies)} curies")
        equivalent_curies_dict = synonymizer.get_equivalent_nodes(curies)
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}",
                  error_code=error_type.__name__)
        return dict()
    else:
        if equivalent_curies_dict is not None:
            curies_missing_info = {curie for curie in equivalent_curies_dict
                                   if not equivalent_curies_dict.get(curie)}
            if curies_missing_info:
                log.warning(f"NodeSynonymizer did not find any equivalent curies for: {curies_missing_info}")
            final_curie_dict = dict()
            for input_curie in curies:
                curie_dict = equivalent_curies_dict.get(input_curie)
                # Fall back to the input curie itself when no synonyms were found
                final_curie_dict[input_curie] = list(curie_dict) if curie_dict else [input_curie]
            return final_curie_dict
        else:
            log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
            return dict()
def size_of_given_type_in_KP(self, node_type, use_cypher_command=False, kg='KG1'):
    """
    find all nodes of a certain type in KP
    :param node_type: the query node type
    :param use_cypher_command: Boolean (True or False). If True, it used cypher command to query all nodes otherwise used NodeSynonymizer
    :param kg: only allowed for choosing 'KG1' or 'KG2' now. Will extend to BTE later
    :return: the total node count, or None if kg is invalid
    """
    # TODO: extend this to KG2, BTE, and other KP's we know of
    if kg not in ('KG1', 'KG2'):
        self.response.error(f"Only KG1 or KG2 is allowable to calculate the Fisher's exact test temporally")
        return None
    if kg == 'KG1' and use_cypher_command:
        rtxConfig = RTXConfiguration()
        # Connection information for the neo4j server, populated with orangeboard
        driver = GraphDatabase.driver(rtxConfig.neo4j_bolt,
                                      auth=basic_auth(rtxConfig.neo4j_username, rtxConfig.neo4j_password))
        session = driver.session()
        try:
            query = "MATCH (n:%s) return count(distinct n)" % (node_type)
            res = session.run(query)
            size_of_total = res.single()["count(distinct n)"]
        finally:
            # Bug fix: the session/driver were previously never closed (resource leak)
            session.close()
            driver.close()
        return size_of_total
    if kg == 'KG2' and use_cypher_command:
        # Cypher is not supported for KG2 here; warn and fall through to the synonymizer
        self.response.warning(f"KG2 is only allowable to use NodeSynonymizer to query the total number of node with query type. It was set to use kgNodeIndex")
    # All remaining cases (KG1 without cypher, and KG2) use the NodeSynonymizer;
    # this collapses three previously duplicated branches into one
    nodesynonymizer = NodeSynonymizer()
    return nodesynonymizer.get_total_entity_count(node_type, kg_name=kg)
default= '~/RTX/code/ARAX/KnowledgeSources/COHD_local/data/preferred_synonyms_kg2_5_0.pkl' ) args = parser.parse_args() curie_type = eval(args.CurieType) NodeNamesDescriptions = pd.read_csv( args.NodeDescriptionFile, sep='\t', header=None, names=['curie', 'name', 'full_name', 'type']) NodeNamesDescriptions = NodeNamesDescriptions.loc[ NodeNamesDescriptions.type.isin(curie_type), :].reset_index(drop=True) preferred_synonyms = dict() synonymizer = NodeSynonymizer() for curie in NodeNamesDescriptions['curie']: preferred_curie = synonymizer.get_canonical_curies(curies=curie)[curie] if preferred_curie is None: print(f"{curie} doesn't have preferred curies", flush=True) else: if preferred_curie['preferred_curie'] not in preferred_synonyms: preferred_synonyms[preferred_curie['preferred_curie']] = dict() preferred_synonyms[preferred_curie['preferred_curie']][ 'preferred_name'] = preferred_curie['preferred_name'] preferred_synonyms[preferred_curie['preferred_curie']][ 'preferred_type'] = preferred_curie['preferred_category'] preferred_synonyms[ preferred_curie['preferred_curie']]['synonyms'] = [curie] else:
def assess(self, message):
    """Assess the QueryGraph in `message` for basic structural information.

    Computes per-node and per-edge statistics (node_info / edge_info),
    identifies a start node, walks the graph to establish a linear
    node_order / edge_order, and renders a text "template" of the geometry
    (e.g. ``n00(id)-e00()-n01(category)``) used later for template matching.
    Results are stored on ``self`` (node_info, edge_info, start_node,
    node_order, edge_order, node_category_map, edge_predicate_map,
    query_graph_templates).

    :param message: a Message object whose .query_graph will be assessed
    :return: ARAXResponse; .error is set for impossible graph geometries
    """

    #### Define a default response
    response = ARAXResponse()
    self.response = response
    self.message = message
    response.debug(f"Assessing the QueryGraph for basic information")

    #### Get shorter handles
    query_graph = message.query_graph
    nodes = query_graph.nodes
    edges = query_graph.edges

    #### Store number of nodes and edges
    self.n_nodes = len(nodes)
    self.n_edges = len(edges)
    response.debug(f"Found {self.n_nodes} nodes and {self.n_edges} edges")

    #### Handle impossible cases
    if self.n_nodes == 0:
        response.error("QueryGraph has 0 nodes. At least 1 node is required", error_code="QueryGraphZeroNodes")
        return response
    if self.n_nodes == 1 and self.n_edges > 0:
        response.error("QueryGraph may not have edges if there is only one node", error_code="QueryGraphTooManyEdges")
        return response
    #if self.n_nodes == 2 and self.n_edges > 1:
    #    response.error("QueryGraph may not have more than 1 edge if there are only 2 nodes", error_code="QueryGraphTooManyEdges")
    #    return response

    #### Loop through nodes computing some stats
    node_info = {}
    self.node_category_map = {}
    for key, qnode in nodes.items():
        # Baseline stats record for this qnode; filled in below and by the edge loop
        node_info[key] = {'key': key, 'node_object': qnode, 'has_id': False, 'category': qnode.category,
                          'has_category': False, 'is_set': False, 'n_edges': 0, 'n_links': 0,
                          'is_connected': False, 'edges': [], 'edge_dict': {}}
        if qnode.id is not None:
            node_info[key]['has_id'] = True

            #### If the user did not specify a category, but there is a curie, try to figure out the category
            if node_info[key]['category'] is None:
                synonymizer = NodeSynonymizer()
                curie = qnode.id
                curies_list = qnode.id
                if isinstance(qnode.id, list):
                    curie = qnode.id[0]
                else:
                    curies_list = [qnode.id]
                # Only the first curie's category is used when qnode.id is a list
                canonical_curies = synonymizer.get_canonical_curies(curies=curies_list, return_all_categories=True)
                if curie in canonical_curies and 'preferred_type' in canonical_curies[curie]:
                    node_info[key]['has_category'] = True
                    node_info[key]['category'] = canonical_curies[curie]['preferred_type']

        if qnode.category is not None:
            node_info[key]['has_category'] = True

        # NOTE(review): the line below is commented out, so node_info[key]['is_set']
        # stays False regardless of qnode.is_set — the is_set handling further down
        # therefore always sees False. Confirm whether this is intentional.
        #if qnode.is_set is not None: node_info[key]['is_set'] = True

        if key is None:
            response.error("QueryGraph has a node with null key. This is not permitted", error_code="QueryGraphNodeWithNoId")
            return response

        #### Remap the node categorys from unsupported to supported
        if qnode.category is not None:
            qnode.category = self.remap_node_category(qnode.category)

        #### Store lookup of categorys
        # NOTE(review): warning_counter is reset to 0 on every iteration, so the
        # "warn only once" intent is not actually achieved.
        warning_counter = 0
        if qnode.category is None or (isinstance(qnode.category, list) and len(qnode.category) == 0):
            if warning_counter == 0:
                #response.debug("QueryGraph has nodes with no category. This may cause problems with results inference later")
                pass
            warning_counter += 1
            self.node_category_map['unknown'] = key
        else:
            category = qnode.category
            if isinstance(qnode.category, list):
                category = qnode.category[0]  # FIXME this is a hack prior to proper list handling
            self.node_category_map[category] = key

    #### Loop through edges computing some stats
    edge_info = {}
    self.edge_predicate_map = {}
    unique_links = {}

    #### Ignore special informationational edges for now.
    virtual_edge_predicates = {'has_normalized_google_distance_with': 1, 'has_fisher_exact_test_p-value_with': 1,
                               'has_jaccard_index_with': 1, 'probably_treats': 1,
                               'has_paired_concept_frequency_with': 1,
                               'has_observed_expected_ratio_with': 1, 'has_chi_square_with': 1}

    for key, qedge in edges.items():

        predicate = qedge.predicate
        if isinstance(predicate, list):
            if len(predicate) == 0:
                predicate = None
            else:
                predicate = predicate[0]  # FIXME Hack before dealing with predicates as lists!

        # Skip ARAX-internal "virtual" edges entirely
        if predicate is not None and predicate in virtual_edge_predicates:
            continue

        edge_info[key] = {'key': key, 'has_predicate': False, 'subject': qedge.subject,
                          'object': qedge.object, 'predicate': None}
        if predicate is not None:
            edge_info[key]['has_predicate'] = True
            edge_info[key]['predicate'] = predicate

        if key is None:
            response.error("QueryGraph has a edge with null key. This is not permitted", error_code="QueryGraphEdgeWithNoKey")
            return response

        #### Create a unique node link string
        # n_links counts distinct neighbor pairs; n_edges counts every edge (parallel edges included)
        link_string = ','.join(sorted([qedge.subject, qedge.object]))
        if link_string not in unique_links:
            node_info[qedge.subject]['n_links'] += 1
            node_info[qedge.object]['n_links'] += 1
            unique_links[link_string] = 1
            #print(link_string)

        node_info[qedge.subject]['n_edges'] += 1
        node_info[qedge.object]['n_edges'] += 1
        node_info[qedge.subject]['is_connected'] = True
        node_info[qedge.object]['is_connected'] = True
        #node_info[qedge.subject]['edges'].append(edge_info[key])
        #node_info[qedge.object]['edges'].append(edge_info[key])
        node_info[qedge.subject]['edges'].append(edge_info[key])
        node_info[qedge.object]['edges'].append(edge_info[key])
        node_info[qedge.subject]['edge_dict'][key] = edge_info[key]
        node_info[qedge.object]['edge_dict'][key] = edge_info[key]

        #### Store lookup of predicates
        # NOTE(review): warning_counter is likewise reset each iteration here,
        # so this debug message fires for every predicate-less edge.
        warning_counter = 0
        edge_predicate = 'any'
        if predicate is None:
            if warning_counter == 0:
                response.debug("QueryGraph has edges with no predicate. This may cause problems with results inference later")
            warning_counter += 1
        else:
            edge_predicate = predicate

        #### It's not clear yet whether we need to store the whole sentence or just the predicate
        #predicate_encoding = f"{node_info[qedge.subject]['predicate']}---{edge_predicate}---{node_info[qedge.object]['predicate']}"
        predicate_encoding = edge_predicate
        self.edge_predicate_map[predicate_encoding] = key

    #### Loop through the nodes again, trying to identify the start_node and the end_node
    singletons = []
    for node_id, node_data in node_info.items():
        if node_data['n_links'] < 2:
            singletons.append(node_data)
        elif node_data['n_links'] > 2:
            self.is_bifurcated_graph = True
            response.warning("QueryGraph appears to have a fork in it. This might cause trouble")

    #### If this doesn't produce any singletons, then try curie based selection
    if len(singletons) == 0:
        for node_id, node_data in node_info.items():
            if node_data['has_id']:
                singletons.append(node_data)

    #### If this doesn't produce any singletons, then we don't know how to continue
    if len(singletons) == 0:
        response.error("Unable to understand the query graph", error_code="QueryGraphCircular")
        return response

    #### Try to identify the start_node and the end_node
    # Prefer an endpoint that has a curie id over one that does not
    start_node = singletons[0]
    if len(nodes) == 1:
        # Just a single node, fine
        pass
    elif len(singletons) < 2:
        response.warning("QueryGraph appears to be circular or has a strange geometry. This might cause trouble")
    elif len(singletons) > 2:
        response.warning("QueryGraph appears to have a fork in it. This might cause trouble")
    else:
        if singletons[0]['has_id'] is True and singletons[1]['has_id'] is False:
            start_node = singletons[0]
        elif singletons[0]['has_id'] is False and singletons[1]['has_id'] is True:
            start_node = singletons[1]
        else:
            start_node = singletons[0]
    #### Hmm, that's not very robust against odd graphs. This needs work. FIXME

    self.node_info = node_info
    self.edge_info = edge_info
    self.start_node = start_node

    #### Walk the graph from start_node, building a linear node/edge ordering
    current_node = start_node
    node_order = [start_node]
    edge_order = []
    edges = current_node['edges']
    debug = False
    while 1:
        if debug:
            tmp = {'astate': '1', 'current_node': current_node, 'node_order': node_order,
                   'edge_order': edge_order, 'edges': edges}
            print(json.dumps(ast.literal_eval(repr(tmp)), sort_keys=True, indent=2))
            print('==================================================================================')
            tmp = input()

        if len(edges) == 0:
            break

        #if len(edges) > 1:
        if current_node['n_links'] > 1:
            response.error(f"Help, two edges at A583. Don't know what to do: {current_node['n_links']}", error_code="InteralErrorA583")
            return response

        edge_order.append(edges[0])
        previous_node = current_node
        # Follow the edge to the node on its other end
        if edges[0]['subject'] == current_node['key']:
            current_node = node_info[edges[0]['object']]
        elif edges[0]['object'] == current_node['key']:
            current_node = node_info[edges[0]['subject']]
        else:
            response.error("Help, edge error A584. Don't know what to do", error_code="InteralErrorA584")
            return response
        node_order.append(current_node)

        #tmp = { 'astate': '2', 'current_node': current_node, 'node_order': node_order, 'edge_order': edge_order, 'edges': edges }
        #print(json.dumps(ast.literal_eval(repr(tmp)),sort_keys=True,indent=2))
        #print('==================================================================================')
        #tmp = input()

        # Keep only edges we have not already traversed (i.e. not touching the previous node)
        edges = current_node['edges']
        new_edges = []
        for edge in edges:
            key = edge['key']
            if key not in previous_node['edge_dict']:
                new_edges.append(edge)
        edges = new_edges
        if len(edges) == 0:
            break

        #tmp = { 'astate': '3', 'current_node': current_node, 'node_order': node_order, 'edge_order': edge_order, 'edges': edges }
        #print(json.dumps(ast.literal_eval(repr(tmp)),sort_keys=True,indent=2))
        #print('==================================================================================')
        #tmp = input()

    self.node_order = node_order
    self.edge_order = edge_order

    # Create a text rendering of the QueryGraph geometry for matching against a template
    self.query_graph_templates = {'simple': '', 'detailed': {'n_nodes': len(node_order), 'components': []}}
    node_index = 0
    edge_index = 0
    #print(json.dumps(ast.literal_eval(repr(node_order)),sort_keys=True,indent=2))
    for node in node_order:
        component_id = f"n{node_index:02}"
        content = ''
        component = {'component_type': 'node', 'component_id': component_id, 'has_id': node['has_id'],
                     'has_category': node['has_category'], 'category_value': None}
        self.query_graph_templates['detailed']['components'].append(component)
        if node['has_id']:
            content = 'id'
        elif node['has_category'] and node['node_object'].category is not None:
            content = f"category={node['node_object'].category}"
            component['category_value'] = node['node_object'].category
        elif node['has_category']:
            content = 'category'
        template_part = f"{component_id}({content})"
        self.query_graph_templates['simple'] += template_part

        # Since queries with intermediate nodes that are not is_set=true tend to blow up, for now, make them is_set=true unless explicitly set to false
        # NOTE(review): node is a node_info dict which always contains 'is_set' (set to
        # False above and never updated from qnode.is_set), so the first and second
        # branches below appear unreachable — confirm intent.
        if node_index > 0 and node_index < (self.n_nodes - 1):
            if 'is_set' not in node or node['is_set'] is None:
                node['node_object'].is_set = True
                response.warning(f"Setting unspecified is_set to true for {node['key']} because this will probably lead to a happier result")
            elif node['is_set'] is True:
                response.debug(f"Value for is_set is already true for {node['key']} so that's good")
            elif node['is_set'] is False:
                #response.info(f"Value for is_set is set to false for intermediate node {node['key']}. This could lead to weird results. Consider setting it to true")
                response.info(f"Value for is_set is false for intermediate node {node['key']}. Setting to true because this will probably lead to a happier result")
                node['node_object'].is_set = True
            #else:
            #    response.error(f"Unrecognized value is_set='{node['is_set']}' for {node['key']}. This should be true or false")

        node_index += 1

        if node_index < self.n_nodes:
            #print(json.dumps(ast.literal_eval(repr(node)),sort_keys=True,indent=2))

            #### Extract the has_predicate and predicate_value from the edges of the node
            #### This could fail if there are two edges coming out of the node FIXME
            has_predicate = False
            predicate_value = None
            if 'edges' in node:
                for related_edge in node['edges']:
                    if related_edge['subject'] == node['key']:
                        has_predicate = related_edge['has_predicate']
                        if has_predicate is True and 'predicate' in related_edge:
                            predicate_value = related_edge['predicate']

            component_id = f"e{edge_index:02}"
            template_part = f"-{component_id}()-"
            self.query_graph_templates['simple'] += template_part
            component = {'component_type': 'edge', 'component_id': component_id, 'has_id': False,
                         'has_predicate': has_predicate, 'predicate_value': predicate_value}
            self.query_graph_templates['detailed']['components'].append(component)
            edge_index += 1

    response.debug(f"The QueryGraph reference template is: {self.query_graph_templates['simple']}")

    #tmp = { 'node_info': node_info, 'edge_info': edge_info, 'start_node': start_node, 'n_nodes': self.n_nodes, 'n_edges': self.n_edges,
    #        'is_bifurcated_graph': self.is_bifurcated_graph, 'node_order': node_order, 'edge_order': edge_order }
    #print(json.dumps(ast.literal_eval(repr(tmp)),sort_keys=True,indent=2))
    #sys.exit(0)

    #### Return the response
    return response
def __init__(self, response, message, parameters):
    """Set up the drug-treats-disease predictor.

    Ensures the model/database files exist under predictor/retrain_data
    (fetching each from arax.ncats.io via scp if missing), then instantiates
    the predictor (probability-database backed by default) and a
    NodeSynonymizer.

    :param response: ARAXResponse used for error/status reporting
    :param message: the Message being operated on
    :param parameters: dict of overlay parameters
    """
    self.response = response
    self.message = message
    self.parameters = parameters
    self.global_iter = 0

    ## check if the new model files exists in /predictor/retrain_data. If not, scp it from arax.ncats.io
    pathlist = os.path.realpath(__file__).split(os.path.sep)
    RTXindex = pathlist.index("RTX")
    filepath = os.path.sep.join([*pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'ARAXQuery', 'Overlay', 'predictor', 'retrain_data'])

    def _fetch_if_missing(local_file):
        # Download from the ARAX server only when the file isn't already cached locally.
        # NOTE(review): os.system+scp assumes key-based auth is configured — confirm.
        if not os.path.exists(local_file):
            remote_name = os.path.basename(local_file)
            os.system("scp [email protected]:/data/orangeboard/databases/KG2.3.4/" + remote_name + " " + local_file)

    ## check if there is LogModel.pkl
    pkl_file = f"{filepath}/LogModel.pkl"
    _fetch_if_missing(pkl_file)

    ## check if there is GRAPH.sqlite
    db_file = f"{filepath}/GRAPH.sqlite"
    _fetch_if_missing(db_file)

    ## check if there is DTD_probability_database.db
    DTD_prob_db_file = f"{filepath}/DTD_probability_database_v1.0.db"
    _fetch_if_missing(DTD_prob_db_file)

    # NodeSynonymizer replaces the old map.txt lookup
    self.use_prob_db = True
    if self.use_prob_db is True:
        try:
            self.pred = predictor(DTD_prob_file=DTD_prob_db_file, use_prob_db=True)
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Internal Error encountered connecting to the local DTD prediction database.")
    else:
        try:
            self.pred = predictor(model_file=pkl_file, use_prob_db=False)
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Internal Error encountered connecting to the local LogModel.pkl file.")
    try:
        self.pred.import_file(None, graph_database=db_file)
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        self.response.error(tb, error_code=error_type.__name__)
        self.response.error(f"Internal Error encountered connecting to the local graph database file.")

    self.synonymizer = NodeSynonymizer()
class PredictDrugTreatsDisease:
    """Overlay that decorates knowledge-graph edges (or adds virtual edges) with
    the model-predicted probability that a drug treats a disease.
    See https://doi.org/10.1101/765305 for details of the underlying model."""

    # Category spellings accepted for each side of the drug-treats-disease relation.
    # Used both for substring/membership checks against node.category and for exact
    # equality checks against the synonymizer's preferred_type.
    _DRUG_CATEGORIES = ("drug", "chemical_substance", "biolink:Drug", "biolink:ChemicalSubstance")
    _DISEASE_CATEGORIES = ("disease", "phenotypic_feature", "biolink:Disease", "biolink:PhenotypicFeature")

    #### Constructor
    def __init__(self, response, message, parameters):
        """Ensure the model/database files are present locally (scp'ing them from
        arax.ncats.io if missing), then load the predictor and a NodeSynonymizer."""
        self.response = response
        self.message = message
        self.parameters = parameters
        self.global_iter = 0

        ## check if the new model files exists in /predictor/retrain_data. If not, scp it from arax.ncats.io
        pathlist = os.path.realpath(__file__).split(os.path.sep)
        RTXindex = pathlist.index("RTX")
        filepath = os.path.sep.join([*pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'ARAXQuery', 'Overlay', 'predictor', 'retrain_data'])

        def _fetch_if_missing(local_file):
            # Download from the ARAX server only when the file isn't already cached locally
            if not os.path.exists(local_file):
                remote_name = os.path.basename(local_file)
                os.system("scp [email protected]:/data/orangeboard/databases/KG2.3.4/" + remote_name + " " + local_file)

        ## check if there is LogModel.pkl
        pkl_file = f"{filepath}/LogModel.pkl"
        _fetch_if_missing(pkl_file)

        ## check if there is GRAPH.sqlite
        db_file = f"{filepath}/GRAPH.sqlite"
        _fetch_if_missing(db_file)

        ## check if there is DTD_probability_database.db
        DTD_prob_db_file = f"{filepath}/DTD_probability_database_v1.0.db"
        _fetch_if_missing(DTD_prob_db_file)

        # NodeSynonymizer replaces the old map.txt lookup
        self.use_prob_db = True
        if self.use_prob_db is True:
            try:
                self.pred = predictor(DTD_prob_file=DTD_prob_db_file, use_prob_db=True)
            except Exception:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Internal Error encountered connecting to the local DTD prediction database.")
        else:
            try:
                self.pred = predictor(model_file=pkl_file, use_prob_db=False)
            except Exception:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Internal Error encountered connecting to the local LogModel.pkl file.")
        try:
            self.pred.import_file(None, graph_database=db_file)
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Internal Error encountered connecting to the local graph database file.")

        self.synonymizer = NodeSynonymizer()

    def convert_to_trained_curies(self, input_curie):
        """
        Takes an input curie from the KG, uses the synonymizer, and then returns something that the map.csv can handle
        """
        # FIXME (from original): the model was trained on ChEMBL:123-style curies,
        # not CHEMBL.COMPOUND:CHEMBL123 — revisit when re-training the ML model.
        normalizer_result = self.synonymizer.get_canonical_curies(input_curie)
        # None when the synonymizer doesn't recognize the curie
        return normalizer_result[input_curie]

    def _preferred_curie_if_category(self, curie, allowed_categories):
        """Canonicalize `curie` and return its preferred curie, or None if it cannot
        be canonicalized or its preferred category is not one of `allowed_categories`."""
        converted = self.convert_to_trained_curies(curie)
        if converted is None:
            return None
        if converted['preferred_type'] in allowed_categories:
            return converted['preferred_curie']
        return None

    def _max_treat_probability(self, drug_curie, disease_curie):
        """Ask the model for P(drug treats disease); returns 0 when the model
        has no finite answer (0 is treated as 'do not decorate' by callers)."""
        if self.use_prob_db is True:
            probability = self.pred.get_prob_from_DTD_db(drug_curie, disease_curie)
            if probability is not None and np.isfinite(probability):
                return probability
        else:
            probability = self.pred.prob_single(drug_curie, disease_curie)
            if probability is not None:
                probability = probability[0]
                if np.isfinite(probability):
                    return probability
        return 0

    def predict_drug_treats_disease(self):
        """
        Iterate over all the edges in the knowledge graph, add the drug-disease treatment probability for appropriate edges
        on the edge_attributes
        :return: response
        """
        parameters = self.parameters
        self.response.debug(f"Computing drug disease treatment probability based on a machine learning model")
        self.response.info(f"Computing drug disease treatment probability based on a machine learning model: See [this publication](https://doi.org/10.1101/765305) for more details about how this is accomplished.")

        attribute_name = "probability_treats"
        attribute_type = "EDAM:data_0951"
        value = 0  # this will be the default value. If the model returns 0, or the default is there, don't include that edge
        url = "https://doi.org/10.1101/765305"

        # if you want to add virtual edges, identify the source/targets, decorate the edges, add them to the KG, and then add one to the QG corresponding to them
        if 'virtual_relation_label' in parameters:
            source_curies_to_decorate = set()
            target_curies_to_decorate = set()
            curie_to_name = dict()
            # identify the nodes that we should be adding virtual edges for
            for node_key, node in self.message.knowledge_graph.nodes.items():
                if hasattr(node, 'qnode_keys'):
                    # this is now NOT checked by ARAX_overlay, hence the category screens here
                    if parameters['subject_qnode_key'] in node.qnode_keys:
                        if any(cat in node.category for cat in self._DRUG_CATEGORIES):
                            source_curies_to_decorate.add(node_key)
                            curie_to_name[node_key] = node.name
                    if parameters['object_qnode_key'] in node.qnode_keys:
                        if any(cat in node.category for cat in self._DISEASE_CATEGORIES):
                            target_curies_to_decorate.add(node_key)
                            curie_to_name[node_key] = node.name

            added_flag = False  # check to see if any edges where added
            # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
            for (source_curie, target_curie) in itertools.product(source_curies_to_decorate, target_curies_to_decorate):
                self.response.debug(f"Predicting probability that {curie_to_name[source_curie]} treats {curie_to_name[target_curie]}")
                # canonicalize both ends; skip the pair if either end fails or has the wrong category
                drug_curie = self._preferred_curie_if_category(source_curie, self._DRUG_CATEGORIES)
                if drug_curie is None:
                    continue
                disease_curie = self._preferred_curie_if_category(target_curie, self._DISEASE_CATEGORIES)
                if disease_curie is None:
                    continue

                value = self._max_treat_probability(drug_curie, disease_curie)
                edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the edge attribute
                if edge_attribute and value != 0:
                    added_flag = True
                    # make the edge, add the attribute
                    now = datetime.now()
                    edge_type = "biolink:probably_treats"
                    qedge_keys = [parameters['virtual_relation_label']]
                    relation = parameters['virtual_relation_label']
                    is_defined_by = "ARAX"
                    defined_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
                    provided_by = "ARAX"
                    confidence = None
                    weight = None  # TODO: could make the actual value of the attribute
                    subject_key = source_curie
                    object_key = target_curie

                    # now actually add the virtual edges in
                    id = f"{relation}_{self.global_iter}"
                    self.global_iter += 1
                    edge_attribute_list = [
                        edge_attribute,
                        EdgeAttribute(name="is_defined_by", value=is_defined_by, type="ARAX_TYPE_PLACEHOLDER"),
                        EdgeAttribute(name="defined_datetime", value=defined_datetime, type="metatype:Datetime"),
                        EdgeAttribute(name="provided_by", value=provided_by, type="biolink:provided_by"),
                        #EdgeAttribute(name="confidence", value=confidence, type="biolink:ConfidenceLevel"),
                        #EdgeAttribute(name="weight", value=weight, type="metatype:Float")
                    ]
                    edge = Edge(predicate=edge_type, subject=subject_key, object=object_key, relation=relation,
                                attributes=edge_attribute_list)
                    edge.qedge_keys = qedge_keys
                    self.message.knowledge_graph.edges[id] = edge

            # Now add a q_edge the query_graph since I've added an extra edge to the KG
            if added_flag:
                edge_type = "biolink:probably_treats"
                relation = parameters['virtual_relation_label']
                subject_qnode_key = parameters['subject_qnode_key']
                object_qnode_key = parameters['object_qnode_key']
                option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key,
                                                                          self.message.query_graph, self.response)
                q_edge = QEdge(predicate=edge_type, relation=relation, subject=subject_qnode_key,
                               object=object_qnode_key, option_group_id=option_group_id)
                self.message.query_graph.edges[relation] = q_edge
            return self.response

        else:  # you want to add it for each edge in the KG
            # iterate over KG edges, add the information
            try:
                # map curies to types and names
                curie_to_type = dict()
                curie_to_name = dict()
                for node_key, node in self.message.knowledge_graph.nodes.items():
                    curie_to_type[node_key] = node.category
                    curie_to_name[node_key] = node.name

                # then iterate over the edges and decorate if appropriate
                for edge_key, edge in self.message.knowledge_graph.edges.items():
                    # Make sure the edge_attributes are not None
                    if not edge.attributes:
                        edge.attributes = []  # should be an array, but why not a list?

                    # now go and actually get the probability
                    source_curie = edge.subject
                    target_curie = edge.object
                    source_types = curie_to_type[source_curie]
                    target_types = curie_to_type[target_curie]

                    if any(c in source_types for c in self._DRUG_CATEGORIES) and any(c in target_types for c in self._DISEASE_CATEGORIES):
                        # subject is the drug, object is the disease
                        self.response.debug(f"Predicting treatment probability between {curie_to_name[source_curie]} and {curie_to_name[target_curie]}")
                        drug_curie = self._preferred_curie_if_category(source_curie, self._DRUG_CATEGORIES)
                        if drug_curie is None:
                            continue
                        disease_curie = self._preferred_curie_if_category(target_curie, self._DISEASE_CATEGORIES)
                        if disease_curie is None:
                            continue
                        value = self._max_treat_probability(drug_curie, disease_curie)
                    elif any(c in target_types for c in self._DRUG_CATEGORIES) and any(c in source_types for c in self._DISEASE_CATEGORIES):
                        # reversed orientation: object is the drug, subject is the disease
                        self.response.debug(f"Predicting treatment probability between {curie_to_name[source_curie]} and {curie_to_name[target_curie]}")
                        disease_curie = self._preferred_curie_if_category(source_curie, self._DISEASE_CATEGORIES)
                        if disease_curie is None:
                            continue
                        drug_curie = self._preferred_curie_if_category(target_curie, self._DRUG_CATEGORIES)
                        if drug_curie is None:
                            continue
                        value = self._max_treat_probability(drug_curie, disease_curie)
                    else:
                        continue

                    if value != 0:
                        edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the attribute
                        edge.attributes.append(edge_attribute)  # append it to the list of attributes
            except Exception:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Something went wrong adding the drug disease treatment probability")
            else:
                self.response.info(f"Drug disease treatment probability successfully added to edges")

            return self.response
def add_qnode(self, response, input_parameters, describe=False):
    """
    Adds a new QNode object to the QueryGraph inside the Message object.

    At most one of the 'id', 'name', or 'category' parameters is acted on
    (checked in that priority order); all three may be omitted to create a
    fully generic node. Ids and names are resolved via the NodeSynonymizer.

    :param response: ARAXResponse whose envelope.message.query_graph is modified in place
    :param input_parameters: dict of DSL parameters (key, id, name, category, is_set, option_group_id)
    :param describe: if True, return the command-definition dict instead of executing
    :return: ARAXResponse object with execution information
    :rtype: ARAXResponse
    """

    # #### Command definition for autogenerated documentation
    command_definition = {
        'dsl_command': 'add_qnode()',
        'description': """The `add_qnode` method adds an additional QNode to the QueryGraph in the Message object.""",
        'parameters': {
            'key': {
                'is_required': False,
                'examples': ['n00', 'n01'],
                'default': '',
                'type': 'string',
                'description': """Any string that is unique among all QNode key fields, with recommended format n00, n01, n02, etc. If no value is provided, autoincrementing values beginning for n00 are used.""",
            },
            'id': {
                'is_required': False,
                'examples': ['DOID:9281', '[UniProtKB:P12345,UniProtKB:Q54321]'],
                'type': 'string',
                'description': 'Any compact URI (CURIE) (e.g. DOID:9281) (May also be a list like [UniProtKB:P12345,UniProtKB:Q54321])',
            },
            'name': {
                'is_required': False,
                'examples': ['hypertension', 'insulin'],
                'type': 'string',
                'description': 'Any name of a bioentity that will be resolved into a CURIE if possible or result in an error if not (e.g. hypertension, insulin)',
            },
            'category': {
                'is_required': False,
                'examples': ['protein', 'chemical_substance', 'disease'],
                'type': 'ARAXnode',
                'description': 'Any valid Translator bioentity category (e.g. protein, chemical_substance, disease)',
            },
            'is_set': {
                'is_required': False,
                'enum': ["true", "false", "True", "False", "t", "f", "T", "F"],
                'examples': ['true', 'false'],
                'type': 'boolean',
                'description': 'If set to true, this QNode represents a set of nodes that are all in common between the two other linked QNodes (assumed to be false if not specified or value is not recognized as true/t case insensitive)'
            },
            'option_group_id': {
                'is_required': False,
                'examples': ['1', 'a', 'b2', 'option'],
                'type': 'string',
                'description': 'A group identifier indicating a group of nodes and edges should either all be included or all excluded. An optional match for all elements in this group. If not included Node will be treated as required.'
            },
        }
    }

    if describe:
        return command_definition

    #### Extract the message to work on
    message = response.envelope.message

    #### Basic checks on arguments
    if not isinstance(input_parameters, dict):
        response.error("Provided parameters is not a dict", error_code="ParametersNotDict")
        return response

    #### Define a complete set of allowed parameters and their defaults
    parameters = {
        'key': None,
        'id': None,
        'name': None,
        'category': None,
        'is_set': None,
        'option_group_id': None,
    }

    #### Loop through the input_parameters and override the defaults and make sure they are allowed
    # Unknown keys are reported as errors but the loop keeps going so ALL
    # unknown keys get reported (bail-out happens on the status check below).
    for key, value in input_parameters.items():
        if key not in parameters:
            response.error(f"Supplied parameter {key} is not permitted", error_code="UnknownParameter")
        else:
            parameters[key] = value

    #### Check for option_group_id and is_set:
    # When option_group_id is given without id/name, is_set must be true
    # (it is auto-set with a warning if absent, error if explicitly false).
    if parameters['option_group_id'] is not None and parameters['id'] is None and parameters['name'] is None:
        if parameters['is_set'] is None:
            parameters['is_set'] = 'true'
            response.warning(f"An 'option_group_id' was set to {parameters['option_group_id']}, but 'is_set' was not an included parameter. It must be true when an 'option_group_id' is given, so automatically setting to true. Avoid this warning by explictly setting to true.")
        elif not (parameters['is_set'].lower() == 'true' or parameters['is_set'].lower() == 't'):
            response.error(f"When an 'option_group_id' is given 'is_set' must be set to true. However, supplied input for parameter 'is_set' was {parameters['is_set']}.", error_code="InputMismatch")

    #### Return if any of the parameters generated an error (showing not just the first one)
    if response.status != 'OK':
        return response

    #### Now apply the filters. Order of operations is probably quite important
    #### Scalar value filters probably come first like minimum_confidence, then complex logic filters
    #### based on edge or node properties, and then finally maximum_results
    response.info(f"Adding a QueryNode to Message with input parameters {parameters}")

    #### Make sure there's a query_graph already here
    if message.query_graph is None:
        message.query_graph = QueryGraph()
        message.query_graph.nodes = {}
        message.query_graph.edges = {}
    if message.query_graph.nodes is None:
        message.query_graph.nodes = {}

    #### Set up the NodeSynonymizer to find curies and names
    synonymizer = NodeSynonymizer()

    # Create the QNode and set the key (autogenerated n00, n01, ... if absent)
    qnode = QNode()
    if parameters['key'] is not None:
        key = parameters['key']
    else:
        key = self.__get_next_free_node_key()

    if parameters['option_group_id'] is not None:
        qnode.option_group_id = parameters['option_group_id']

    # Set the is_set parameter to what the user selected
    # (any value other than true/t, case-insensitive, is treated as false)
    if parameters['is_set'] is not None:
        qnode.is_set = (parameters['is_set'].lower() == 'true' or parameters['is_set'].lower() == 't')

    #### If the id is specified, try to find that
    if parameters['id'] is not None:

        # If the id is a scalar then treat it here as a list of one
        if isinstance(parameters['id'], str):
            id_list = [parameters['id']]
            is_id_a_list = False
            if parameters['is_set'] is not None and qnode.is_set is True:
                response.error(f"Specified id '{parameters['id']}' is a scalar, but is_set=true, which doesn't make sense", error_code="IdScalarButIsSetTrue")
                return response

        # Or else set it up as a list
        elif isinstance(parameters['id'], list):
            id_list = parameters['id']
            is_id_a_list = True
            qnode.id = []
            if parameters['is_set'] is None:
                response.warning(f"Specified id '{parameters['id']}' is a list, but is_set was not set to true. It must be true in this context, so automatically setting to true. Avoid this warning by explictly setting to true.")
                qnode.is_set = True
            else:
                if qnode.is_set == False:
                    response.warning(f"Specified id '{parameters['id']}' is a list, but is_set=false, which doesn't make sense, so automatically setting to true. Avoid this warning by explictly setting to true.")
                    qnode.is_set = True

        # Or if it's neither a list or a string, then error out. This cannot be handled at present
        else:
            response.error(f"Specified id '{parameters['id']}' is neither a string nor a list. This cannot to handled", error_code="IdNotListOrScalar")
            return response

        # Loop over the available ids and create the list
        # NOTE(review): both branches below keep the ORIGINAL id (not the
        # synonymizer's preferred curie); the lookup only drives logging.
        for id in id_list:
            response.debug(f"Looking up id {id} in NodeSynonymizer")
            synonymizer_results = synonymizer.get_canonical_curies(curies=[id])

            # If nothing was found, we won't bail out, but rather just issue a warning that this id is suspect
            if synonymizer_results[id] is None:
                response.warning(f"A node with id {id} is not in our knowledge graph KG2, but will continue with it")
                if is_id_a_list:
                    qnode.id.append(id)
                else:
                    qnode.id = id

            # And if it is found, keep the same id but report the preferred id
            else:
                response.info(f"id {id} is found. Adding it to the qnode")
                if is_id_a_list:
                    qnode.id.append(id)
                else:
                    qnode.id = id

        # For a string category, use it directly; for a list, only the first entry is used
        if 'category' in parameters and parameters['category'] is not None:
            if isinstance(parameters['category'], str):
                qnode.category = parameters['category']
            else:
                qnode.category = parameters['category'][0]
        message.query_graph.nodes[key] = qnode
        return response

    #### If the name is specified, try to find that
    if parameters['name'] is not None:
        name = parameters['name']
        response.debug(f"Looking up id for name '{name}' in NodeSynonymizer")
        synonymizer_results = synonymizer.get_canonical_curies(curies=[name], names=[name])

        # Unlike the id path above, an unresolvable name is a hard error
        if synonymizer_results[name] is None:
            response.error(f"A node with name '{name}' is not in our knowledge graph", error_code="UnresolvableNodeName")
            return response

        qnode.id = synonymizer_results[name]['preferred_curie']
        response.info(f"Creating QueryNode with id '{qnode.id}' for name '{name}'")
        if parameters['category'] is not None:
            qnode.category = parameters['category']
        message.query_graph.nodes[key] = qnode
        return response

    #### If the category is specified, just add that category. There should be checking that it is legal. FIXME
    if parameters['category'] is not None:
        qnode.category = parameters['category']
        # NOTE(review): this re-parse of is_set only accepts 'true' (not 't'),
        # unlike the earlier parse — looks inconsistent; confirm intent.
        if parameters['is_set'] is not None:
            qnode.is_set = (parameters['is_set'].lower() == 'true')
        message.query_graph.nodes[key] = qnode
        return response

    #### If we get here, it means that all three main parameters are null. Just a generic node with no category or anything. This is okay.
    message.query_graph.nodes[key] = qnode
    return response
def __init__(self, response, message, parameters):
    """
    Sets up the DTD (drug-treats-disease) predictor.

    Locates the model/database files under RTX/code/ARAX/KnowledgeSources/Prediction,
    fetching any missing file via scp from the host configured in RTXConfig,
    then connects the `predictor` either to the precomputed DTD probability
    database (use_prob_db=True, the current default) or to the LogModel.pkl +
    GRAPH.sqlite pair. Errors are reported on self.response rather than raised.

    :param response: ARAXResponse used for error/info logging
    :param message: the Message object being operated on
    :param parameters: dict of DSL parameters for this action
    """
    self.response = response
    self.message = message
    self.parameters = parameters
    self.global_iter = 0

    ## check if the new model files exists in /predictor/retrain_data. If not, scp it from arax.ncats.io
    # Derive the RTX repo root from this file's own path (raises ValueError
    # if the file does not live under a directory named "RTX").
    pathlist = os.path.realpath(__file__).split(os.path.sep)
    RTXindex = pathlist.index("RTX")
    filepath = os.path.sep.join([*pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'KnowledgeSources', 'Prediction'])

    # Node label sets used for drug/disease classification (lowercased, no underscores)
    self.drug_label_list = ['chemicalsubstance','drug']
    self.disease_label_list = ['disease','phenotypicfeature','diseaseorphenotypicfeature']

    ## check if there is LogModel.pkl
    log_model_name = RTXConfig.log_model_path.split("/")[-1]
    pkl_file = f"{filepath}{os.path.sep}{log_model_name}"
    if os.path.exists(pkl_file):
        pass
    else:
        # Fetch the model from the configured remote host if absent locally
        #os.system("scp [email protected]:/data/orangeboard/databases/KG2.3.4/LogModel.pkl " + pkl_file)
        os.system(f"scp {RTXConfig.log_model_username}@{RTXConfig.log_model_host}:{RTXConfig.log_model_path} {pkl_file}")

    ## check if there is GRAPH.sqlite
    graph_database_name = RTXConfig.graph_database_path.split("/")[-1]
    db_file = f"{filepath}{os.path.sep}{graph_database_name}"
    if os.path.exists(db_file):
        pass
    else:
        #os.system("scp [email protected]:/data/orangeboard/databases/KG2.3.4/GRAPH.sqlite " + db_file)
        os.system(f"scp {RTXConfig.graph_database_username}@{RTXConfig.graph_database_host}:{RTXConfig.graph_database_path} {db_file}")

    ## check if there is DTD_probability_database.db
    DTD_prob_db_file = f"{filepath}{os.path.sep}{RTXConfig.dtd_prob_path.split('/')[-1]}"
    if os.path.exists(DTD_prob_db_file):
        pass
    else:
        #os.system("scp [email protected]:/data/orangeboard/databases/KG2.3.4/DTD_probability_database_v1.0.db " + DTD_prob_db_file)
        os.system(f"scp {RTXConfig.dtd_prob_username}@{RTXConfig.dtd_prob_host}:{RTXConfig.dtd_prob_path} {DTD_prob_db_file}")

    # use NodeSynonymizer to replace map.txt
    # check if there is map.txt
    # map_file = f"{filepath}/map.txt"
    # if os.path.exists(map_file):
    #     pass
    # else:
    #     os.system("scp [email protected]:/home/ubuntu/drug_repurposing_model_retrain/map.txt " + map_file)

    # Hard-coded switch: True -> use the precomputed probability DB;
    # False -> use the logistic-regression model + graph embedding DB.
    self.use_prob_db = True
    if self.use_prob_db is True:
        try:
            self.pred = predictor(DTD_prob_file=DTD_prob_db_file, use_prob_db=True)
        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Internal Error encountered connecting to the local DTD prediction database.")
    else:
        try:
            self.pred = predictor(model_file=pkl_file, use_prob_db=False)
        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Internal Error encountered connecting to the local LogModel.pkl file.")
        try:
            self.pred.import_file(None, graph_database=db_file)
        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Internal Error encountered connecting to the local graph database file.")

    # with open(map_file, 'r') as infile:
    #     map_file_content = infile.readlines()
    #     map_file_content.pop(0) ## remove title
    #     self.known_curies = set(line.strip().split('\t')[0] for line in map_file_content)

    self.synonymizer = NodeSynonymizer()
def main():
    """Refresh the DTD probability database and the DTD model graph database.

    Re-canonicalizes the curies stored in an existing DTD probability sqlite
    database via the NodeSynonymizer, then rebuilds the GRAPH sqlite database
    from the model embedding + mapping files. Outputs are written to
    --output_folder as 'DTD_probability_database_refreshed.db' and
    'GRAPH_refreshed.sqlite'.

    Exits with status 1 if a required input folder/file is missing
    (previously exit(0), which wrongly signaled success to the shell).
    """
    parser = argparse.ArgumentParser(description="Refresh DTD model and database",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # NOTE: '--synoymizer_folder' is misspelled ('synonymizer') but kept as-is
    # for backward compatibility with existing invocations.
    parser.add_argument('--synoymizer_folder', type=str,
                        help="Full path of folder containing NodeSynonymizer",
                        default='~/RTX/code/ARAX/NodeSynonymizer/')
    parser.add_argument('--DTD_prob_db_file', type=str,
                        help="Full path of DTD probability database file",
                        default='~/work/RTX/code/ARAX/KnowledgeSources/Prediction/DTD_probability_database_v1.0_KG2.3.4.db')
    parser.add_argument('--emb_file', type=str,
                        help="Full path of DTD model embedding file",
                        default='~/work/RTX/code/ARAX/KnowledgeSources/Prediction/rel_max_v1.0_KG2.3.4.emb.gz')
    parser.add_argument('--map_file', type=str,
                        help="Full path of DTD model mapping file",
                        default='~/work/RTX/code/ARAX/KnowledgeSources/Prediction/map_v1.0_KG2.3.4.txt')
    parser.add_argument('--output_folder', type=str,
                        help="Full path of output folder",
                        default='~/work/RTX/code/ARAX/KnowledgeSources/Prediction/')
    args = parser.parse_args()

    if os.path.isdir(args.synoymizer_folder):
        sys.path.append(args.synoymizer_folder)
        from node_synonymizer import NodeSynonymizer
        synonymizer = NodeSynonymizer()
    else:
        print(f"Error: Not found this folder: {args.synoymizer_folder}")
        sys.exit(1)  # BUGFIX: was exit(0), masking the failure

    if os.path.isfile(args.DTD_prob_db_file):
        print(f'Start to refresh DTD_probability_database.db', flush=True)
        con = sqlite3.connect(args.DTD_prob_db_file)
        DTD_prob_table = pd.read_sql_query("SELECT * from DTD_PROBABILITY", con)
        con.close()
        # Re-map each (disease, drug) pair to current canonical curies; rows
        # whose curies cannot be refreshed become NaN and are dropped below.
        DTD_prob_table = DTD_prob_table.apply(lambda row: [
            refresh_disease(row[0], synonymizer),
            refresh_drug(row[1], synonymizer), row[2]
        ], axis=1, result_type='expand')
        DTD_prob_table = DTD_prob_table.dropna().reset_index(drop=True)
        con = sqlite3.connect(
            os.path.join(args.output_folder, 'DTD_probability_database_refreshed.db'))
        con.execute(
            f"CREATE TABLE DTD_PROBABILITY( disease VARCHAR(255), drug VARCHAR(255), probability FLOAT )")
        databasefile = list(DTD_prob_table.to_records(index=False))
        print(f"INFO: Populating table", flush=True)
        insert_command = "INSERT INTO DTD_PROBABILITY VALUES (?, ?, ?)"
        # Insert in batches of 5000 rows, committing after each batch.
        # (zip over adjacent boundaries replaces the old index-juggling loop;
        # a duplicated insert_command assignment was also removed.)
        boundaries = list(range(0, len(databasefile), 5000))
        boundaries.append(len(databasefile))
        count = 0
        for batch_start, batch_end in zip(boundaries, boundaries[1:]):
            rows = databasefile[batch_start:batch_end]
            con.executemany(insert_command, rows)
            con.commit()
            count = count + len(rows)
            percentage = round((count * 100.0 / len(databasefile)), 2)
            print(str(percentage) + "%..", end='', flush=True)
        print(f"INFO: Populating tables is completed", flush=True)
        print(f"INFO: Creating INDEXes on DTD_PROBABILITY", flush=True)
        con.execute(
            f"CREATE INDEX idx_DTD_PROBABILITY_disease ON DTD_PROBABILITY(disease)")
        con.execute(
            f"CREATE INDEX idx_DTD_PROBABILITY_drug ON DTD_PROBABILITY(drug)")
        con.commit()
        con.close()
        print(f"INFO: Creating INDEXes is completed", flush=True)
    else:
        print(f"Error: Not found this file: {args.DTD_prob_db_file}")
        sys.exit(1)  # BUGFIX: was exit(0), masking the failure

    if os.path.isfile(args.emb_file) and os.path.isfile(args.map_file):
        # Embedding file: space-separated, first line is a header to skip;
        # map file joins embedding row ids to curies.
        rel_max = pd.read_csv(args.emb_file, sep=' ', skiprows=1, header=None)
        mapfile = pd.read_csv(args.map_file, sep='\t', header=0)
        merged_table = mapfile.merge(rel_max, left_on='id', right_on=0)
        merged_table = merged_table.loc[:, ['curie'] + list(merged_table.columns)[3:]]
        # Refresh each curie to its current canonical form (None if unknown)
        new_curie_ids = [
            synonymizer.get_canonical_curies(curie)[curie]['preferred_curie']
            if synonymizer.get_canonical_curies(curie)[curie] is not None else None
            for curie in list(merged_table.curie)
        ]
        graph = pd.concat([pd.DataFrame(new_curie_ids), merged_table.iloc[:, 1:]], axis=1)
        graph = graph.dropna().reset_index(drop=True)
        con = sqlite3.connect(os.path.join(args.output_folder, 'GRAPH_refreshed.sqlite'))
        con.execute(f"DROP TABLE IF EXISTs GRAPH")
        # Build CREATE TABLE with one INT column per embedding dimension
        insert_command1 = f"CREATE TABLE GRAPH(curie VARCHAR(255)"
        for num in range(1, graph.shape[1]):
            insert_command1 = insert_command1 + f", col{num} INT"
        insert_command1 = insert_command1 + ")"
        con.execute(insert_command1)
        con.commit()
        count = 0
        print(f"Insert data into database", flush=True)
        for row in range(graph.shape[0]):
            count = count + 1
            insert_command1 = f"INSERT INTO GRAPH"
            insert_command2 = f" values ("
            for _ in range(graph.shape[1]):
                insert_command2 = insert_command2 + f"?,"
            insert_command = insert_command1 + insert_command2 + ")"
            insert_command = insert_command.replace(',)', ')')
            line = tuple(graph.loc[row, :])
            con.execute(insert_command, line)
            if count % 5000 == 0:
                con.commit()
                percentage = int(count * 100.0 / graph.shape[0])
                print(str(percentage) + "%..", end='', flush=True)
        con.commit()
        if graph.shape[0]:  # BUGFIX: guard against ZeroDivisionError on an empty table
            percentage = int(count * 100.0 / graph.shape[0])
            print(str(percentage) + "%..", end='', flush=True)
        con.execute(f"CREATE INDEX idx_GRAPH_curie ON GRAPH(curie)")
        con.commit()
        con.close()
        print(f"INFO: Database created successfully", flush=True)
class NGDDatabaseBuilder:
    """Builds the databases needed for Normalized Google Distance (NGD) scoring.

    Two artifacts are produced, in order:
      1. conceptname_to_pmids.db (pickledb): concept name -> PMID curies,
         scraped from a local PubMed XML download (build_conceptname_to_pmids_db).
      2. curie_to_pmids sqlite DB: canonical curie -> PMID list, merging the
         PubMed scrape with publication info on KG2 edges and nodes
         (build_curie_to_pmids_db).
    """

    def __init__(self, pubmed_directory_path, is_test, live="Production"):
        """
        :param pubmed_directory_path: directory containing the pubmed*.xml.gz download
        :param is_test: if True, only a small subset of files/results is processed
        :param live: RTXConfiguration 'live' setting (e.g. "Production")
        """
        self.RTXConfig = RTXConfiguration()
        self.RTXConfig.live = live
        # NOTE(review): 'pathlist' and 'RTXindex' appear to be module-level
        # names (RTX repo root path components) defined elsewhere in this file.
        ngd_filepath = os.path.sep.join([
            *pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'KnowledgeSources',
            'NormalizedGoogleDistance'
        ])
        self.pubmed_directory_path = pubmed_directory_path
        # Intermediate pickledb lives in the current working directory
        self.conceptname_to_pmids_db_path = "conceptname_to_pmids.db"
        self.curie_to_pmids_db_path = f"{ngd_filepath}{os.path.sep}{self.RTXConfig.curie_to_pmids_path.split('/')[-1]}"
        self.status = 'OK'
        self.synonymizer = NodeSynonymizer()
        self.is_test = is_test

    def build_conceptname_to_pmids_db(self):
        """Scrapes a PubMed XML download into a concept-name -> PMIDs pickledb.

        Sets self.status to 'ERROR' (instead of raising) when no pubmed*.xml.gz
        files are found. In test mode only the first file is processed.
        """
        # This function extracts curie -> PMIDs mappings from a Pubmed XML download (saves data in a pickledb)
        print(f"Starting to build {self.conceptname_to_pmids_db_path} from pubmed files..")
        start = time.time()
        pubmed_directory = os.fsencode(self.pubmed_directory_path)
        all_file_names = [os.fsdecode(file) for file in os.listdir(pubmed_directory)]
        pubmed_file_names = [
            file_name for file_name in all_file_names
            if file_name.startswith('pubmed') and file_name.endswith('.xml.gz')
        ]
        if not pubmed_file_names:
            print(f"ERROR: Couldn't find any PubMed XML files to scrape. Provide the path to the directory "
                  f"containing your PubMed download as a command line argument.")
            self.status = 'ERROR'
        else:
            conceptname_to_pmids_map = dict()
            # Go through each downloaded pubmed file and build our dictionary of mappings
            pubmed_file_names_to_process = pubmed_file_names if not self.is_test else pubmed_file_names[:1]
            for file_name in pubmed_file_names_to_process:
                print(f" Starting to process file '{file_name}'.. ({pubmed_file_names_to_process.index(file_name) + 1}"
                      f" of {len(pubmed_file_names_to_process)})")
                file_start_time = time.time()
                with gzip.open(f"{self.pubmed_directory_path}/{file_name}") as pubmed_file:
                    file_contents_tree = etree.parse(pubmed_file)
                pubmed_articles = file_contents_tree.xpath("//PubmedArticle")
                for article in pubmed_articles:
                    # Link each concept name to the PMID of this article
                    current_pmid = article.xpath(".//MedlineCitation/PMID/text()")[0]
                    descriptor_names = article.xpath(
                        ".//MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName/text()")
                    qualifier_names = article.xpath(
                        ".//MedlineCitation/MeshHeadingList/MeshHeading/QualifierName/text()")
                    chemical_names = article.xpath(
                        ".//MedlineCitation/ChemicalList/Chemical/NameOfSubstance/text()")
                    gene_symbols = article.xpath(
                        ".//MedlineCitation/GeneSymbolList/GeneSymbol/text()")
                    keywords = article.xpath(
                        ".//MedlineCitation/KeywordList/Keyword/text()")
                    all_concept_names = descriptor_names + qualifier_names + chemical_names + gene_symbols + keywords
                    # Deduplicate and drop empty strings before recording mappings
                    unique_concept_names = {
                        concept_name for concept_name in all_concept_names if concept_name
                    }
                    for concept_name in unique_concept_names:
                        self._add_pmids_mapping(concept_name, current_pmid, conceptname_to_pmids_map)
                self._destroy_etree(file_contents_tree)  # Hack around lxml memory leak
                print(f" took {round((time.time() - file_start_time) / 60, 2)} minutes")

            # Save the data to the PickleDB after we're done
            print(" Loading data into PickleDB..")
            conceptname_to_pmids_db = pickledb.load(self.conceptname_to_pmids_db_path, False)
            for concept_name, pmid_list in conceptname_to_pmids_map.items():
                # Store deduplicated PMIDs in curie form (e.g. "PMID:123")
                conceptname_to_pmids_db.set(
                    concept_name,
                    list({self._create_pmid_curie_from_local_id(pmid) for pmid in pmid_list}))
            print(" Saving PickleDB file..")
            conceptname_to_pmids_db.dump()
            print(f"Done! Building {self.conceptname_to_pmids_db_path} took {round(((time.time() - start) / 60) / 60, 3)} hours")

    def build_curie_to_pmids_db(self):
        """Builds the final curie -> PMIDs sqlite DB from the pubmed scrape plus KG2.

        Requires conceptname_to_pmids.db to already exist; returns early if
        loading it set self.status to 'ERROR'.
        """
        # This function creates a final sqlite database of curie->PMIDs mappings using data scraped from Pubmed AND KG2
        print(f"Starting to build {self.curie_to_pmids_db_path.split(os.path.sep)[-1]}..")
        start = time.time()
        curie_to_pmids_map = dict()
        self._add_pmids_from_pubmed_scrape(curie_to_pmids_map)
        if self.status != 'OK':
            return
        self._add_pmids_from_kg2_edges(curie_to_pmids_map)
        self._add_pmids_from_kg2_nodes(curie_to_pmids_map)
        print(f" In the end, found PMID lists for {len(curie_to_pmids_map)} (canonical) curies")
        self._save_data_in_sqlite_db(curie_to_pmids_map)
        print(f"Done! Building {self.curie_to_pmids_db_path.split(os.path.sep)[-1]} took {round((time.time() - start) / 60)} minutes.")

    # Helper methods

    def _add_pmids_from_kg2_edges(self, curie_to_pmids_map):
        """Maps PMIDs from KG2 edge 'publications' lists to both (canonicalized) endpoint curies."""
        print(f" Getting PMIDs from edges in KG2 neo4j..")
        edge_query = f"match (n)-[e]->(m) where e.publications is not null and e.publications <> '[]' " \
                     f"return distinct n.id, m.id, e.publications{' limit 100' if self.is_test else ''}"
        edge_results = self._run_cypher_query(edge_query, 'KG2')
        print(f" Processing results..")
        node_ids = {result['n.id'] for result in edge_results}.union(
            result['m.id'] for result in edge_results)
        canonicalized_curies_dict = self._get_canonicalized_curies_dict(list(node_ids))
        for result in edge_results:
            canonicalized_node_ids = {
                canonicalized_curies_dict[result['n.id']],
                canonicalized_curies_dict[result['m.id']]
            }
            pmids = self._extract_and_format_pmids(result['e.publications'])
            if pmids:  # Sometimes publications list includes only non-PMID identifiers (like ISBN)
                for canonical_curie in canonicalized_node_ids:
                    self._add_pmids_mapping(canonical_curie, pmids, curie_to_pmids_map)

    def _add_pmids_from_kg2_nodes(self, curie_to_pmids_map):
        """Maps PMIDs from KG2 node 'publications' lists to the (canonicalized) node curie."""
        print(f" Getting PMIDs from nodes in KG2 neo4j..")
        node_query = f"match (n) where n.publications is not null and n.publications <> '[]' " \
                     f"return distinct n.id, n.publications{' limit 100' if self.is_test else ''}"
        node_results = self._run_cypher_query(node_query, 'KG2')
        print(f" Processing results..")
        node_ids = {result['n.id'] for result in node_results}
        canonicalized_curies_dict = self._get_canonicalized_curies_dict(list(node_ids))
        for result in node_results:
            canonical_curie = canonicalized_curies_dict[result['n.id']]
            pmids = self._extract_and_format_pmids(result['n.publications'])
            if pmids:  # Sometimes publications list includes only non-PMID identifiers (like ISBN)
                self._add_pmids_mapping(canonical_curie, pmids, curie_to_pmids_map)

    def _add_pmids_from_pubmed_scrape(self, curie_to_pmids_map):
        """Loads the pubmed-scrape pickledb and maps its concept names to canonical curies.

        Sets self.status to 'ERROR' if the pickledb is missing/empty or the
        NodeSynonymizer returns nothing. Unrecognized concept names are dumped
        to 'unrecognized_pubmed_concept_names.txt' for later analysis.
        """
        # Load the data from the first half of the build process (scraping pubmed)
        print(f" Loading pickle DB containing pubmed scrapings ({self.conceptname_to_pmids_db_path})..")
        conceptname_to_pmids_db = pickledb.load(self.conceptname_to_pmids_db_path, False)
        if not conceptname_to_pmids_db.getall():
            print(f"ERROR: {self.conceptname_to_pmids_db_path} must exist to do a partial build. Use --full or locate "
                  f"that file.")
            self.status = 'ERROR'
            return

        # Get canonical curies for all of the concept names in our big pubmed pickleDB using the NodeSynonymizer
        concept_names = list(conceptname_to_pmids_db.getall())
        print(f" Sending NodeSynonymizer.get_canonical_curies() a list of {len(concept_names)} concept names..")
        canonical_curies_dict = self.synonymizer.get_canonical_curies(names=concept_names)
        print(f" Got results back from NodeSynonymizer. (Returned dict contains {len(canonical_curies_dict)} keys.)")

        # Map all of the concept names scraped from pubmed to curies
        if canonical_curies_dict:
            recognized_concepts = {
                concept for concept in canonical_curies_dict
                if canonical_curies_dict.get(concept)
            }
            print(f" NodeSynonymizer recognized {round((len(recognized_concepts) / len(concept_names)) * 100)}% of "
                  f"concept names scraped from pubmed.")
            # Store which concept names the NodeSynonymizer didn't know about, for learning purposes
            unrecognized_concepts = set(canonical_curies_dict).difference(recognized_concepts)
            with open('unrecognized_pubmed_concept_names.txt', 'w+') as unrecognized_concepts_file:
                unrecognized_concepts_file.write(f"{unrecognized_concepts}")
            print(f" Unrecognized concept names were written to 'unrecognized_pubmed_concept_names.txt'.")

            # Map the canonical curie for each recognized concept to the concept's PMID list
            print(f" Mapping canonical curies to PMIDs..")
            for concept_name in recognized_concepts:
                canonical_curie = canonical_curies_dict[concept_name].get('preferred_curie')
                pmids_for_this_concept = conceptname_to_pmids_db.get(concept_name)
                self._add_pmids_mapping(canonical_curie, pmids_for_this_concept, curie_to_pmids_map)
            print(f" Mapped {len(curie_to_pmids_map)} canonical curies to PMIDs based on pubmed scrapings.")
        else:
            print(f"ERROR: NodeSynonymizer didn't return anything!")
            self.status = 'ERROR'

    def _save_data_in_sqlite_db(self, curie_to_pmids_map):
        """Writes the curie -> PMIDs map to the sqlite DB (PMIDs stored as a JSON int list).

        Any preexisting database file at self.curie_to_pmids_db_path is replaced.
        """
        print(" Loading data into sqlite database..")
        # Remove any preexisting version of this database
        if os.path.exists(self.curie_to_pmids_db_path):
            os.remove(self.curie_to_pmids_db_path)
        connection = sqlite3.connect(self.curie_to_pmids_db_path)
        cursor = connection.cursor()
        cursor.execute("CREATE TABLE curie_to_pmids (curie TEXT, pmids TEXT)")
        cursor.execute("CREATE UNIQUE INDEX unique_curie ON curie_to_pmids (curie)")
        print(f" Gathering row data..")
        # PMID curies are stored as deduplicated ints; filter(None, ...) drops
        # entries that couldn't be parsed into an integer local id
        rows = [[
            curie,
            json.dumps(list(filter(None, {self._get_local_id_as_int(pmid) for pmid in pmids})))
        ] for curie, pmids in curie_to_pmids_map.items()]
        rows_in_chunks = self._divide_list_into_chunks(rows, 5000)
        print(f" Inserting row data into database..")
        for chunk in rows_in_chunks:
            cursor.executemany(f"INSERT INTO curie_to_pmids (curie, pmids) VALUES (?, ?)", chunk)
            connection.commit()
        # Log how many rows we've added in the end (for debugging purposes)
        cursor.execute(f"SELECT COUNT(*) FROM curie_to_pmids")
        count = cursor.fetchone()[0]
        print(f" Done saving data in sqlite; database contains {count} rows.")
        cursor.close()

    def _get_canonicalized_curies_dict(self, curies: List[str]) -> Dict[str, str]:
        """Returns input curie -> canonical curie; unrecognized curies map to themselves."""
        print(f" Sending a batch of {len(curies)} curies to NodeSynonymizer.get_canonical_curies()")
        canonicalized_nodes_info = self.synonymizer.get_canonical_curies(curies)
        canonicalized_curies_dict = dict()
        for input_curie, preferred_info_dict in canonicalized_nodes_info.items():
            if preferred_info_dict:
                canonicalized_curies_dict[input_curie] = preferred_info_dict.get('preferred_curie', input_curie)
            else:
                canonicalized_curies_dict[input_curie] = input_curie
        print(f" Got results back from synonymizer")
        return canonicalized_curies_dict

    def _extract_and_format_pmids(self, publications: List[str]) -> List[str]:
        """Filters a publications list down to PMIDs, normalized to 'PMID:<id>' form."""
        pmids = {
            publication_id for publication_id in publications
            if publication_id.upper().startswith('PMID')
        }
        # Make sure all PMIDs are given in same format (e.g., PMID:18299583 rather than PMID18299583)
        formatted_pmids = [
            self._create_pmid_curie_from_local_id(pmid.replace('PMID', '').replace(':', ''))
            for pmid in pmids
        ]
        return formatted_pmids

    @staticmethod
    def _add_pmids_mapping(key: str, value_to_append: Union[str, List[str]],
                           mappings_dict: Dict[str, List[str]]):
        """Appends a PMID (or extends with a PMID list) under `key`, creating the entry if needed."""
        if key not in mappings_dict:
            mappings_dict[key] = []
        if isinstance(value_to_append, list):
            mappings_dict[key] += value_to_append
        else:
            mappings_dict[key].append(value_to_append)

    @staticmethod
    def _create_pmid_curie_from_local_id(pmid):
        """Formats a bare local id (e.g. '1234') as a PMID curie ('PMID:1234')."""
        return f"PMID:{pmid}"

    @staticmethod
    def _get_local_id_as_int(curie):
        # Converts "PMID:1234" to 1234; returns None if no digits remain
        curie_pieces = curie.split(":")
        local_id_str = curie_pieces[-1]
        # Remove any strange characters (like in "PMID:_19960544")
        stripped_id_str = "".join(
            [character for character in local_id_str if character.isdigit()])
        return int(stripped_id_str) if stripped_id_str else None

    @staticmethod
    def _destroy_etree(file_contents_tree):
        # Thank you to https://stackoverflow.com/a/49139904 for this method; important to prevent memory blow-up
        # Detaches every element from its parent, deepest elements first
        root = file_contents_tree.getroot()
        element_tracker = {root: [0, None]}
        for element in root.iterdescendants():
            parent = element.getparent()
            element_tracker[element] = [element_tracker[parent][0] + 1, parent]
        element_tracker = sorted(
            [(depth, parent, child) for child, (depth, parent) in element_tracker.items()],
            key=lambda x: x[0], reverse=True)
        for _, parent, child in element_tracker:
            if parent is None:
                break
            parent.remove(child)
        del file_contents_tree

    @staticmethod
    def _run_cypher_query(cypher_query: str, kg='KG2') -> List[Dict[str, any]]:
        """Runs a cypher query against the configured neo4j; returns [] on any error."""
        rtxc = RTXConfiguration()
        if kg == 'KG2':
            rtxc.live = "KG2"
        try:
            driver = GraphDatabase.driver(rtxc.neo4j_bolt,
                                          auth=(rtxc.neo4j_username, rtxc.neo4j_password))
            with driver.session() as session:
                query_results = session.run(cypher_query).data()
            driver.close()
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            print(f"Encountered an error interacting with {kg} neo4j. {tb}")
            return []
        else:
            return query_results

    @staticmethod
    def _divide_list_into_chunks(input_list: List[any], chunk_size: int) -> List[List[any]]:
        """Splits input_list into consecutive sublists of at most chunk_size items."""
        num_chunks = len(input_list) // chunk_size if len(input_list) % chunk_size == 0 else (len(input_list) // chunk_size) + 1
        start_index = 0
        stop_index = chunk_size
        all_chunks = []
        for num in range(num_chunks):
            chunk = input_list[start_index:stop_index] if stop_index <= len(input_list) else input_list[start_index:]
            all_chunks.append(chunk)
            start_index += chunk_size
            stop_index += chunk_size
        return all_chunks