def remove_overly_general_nodes(canonicalized_nodes_dict: Dict[str, Dict[str, any]], canonicalized_edges_dict: Dict[str, Dict[str, any]], biolink_version: str, is_test: bool) -> Tuple[Dict[str, Dict[str, any]], Dict[str, Dict[str, any]]]:
    """Prune nodes considered 'overly general' (plus any edges orphaned by that pruning).

    A node is overly general if any of its equivalent curies is either a Biolink
    category (any descendant of biolink:NamedThing) or one of a hand-picked set of
    very broad concepts. Mutates and returns both input dicts.
    """
    logging.info(f"Removing overly general nodes from the graph..")
    bh = BiolinkHelper(biolink_version)
    # Remove all nodes that have a biolink category as an equivalent identifier, as well as a few others
    hand_picked_curies = {"MESH:D010361", "SO:0001217", "MONDO:0000001", "FMA:67257",
                          "MESH:D002477", "MESH:D005796", "UMLS:C1257890", "UMLS:C0237401",
                          "PR:000029067", "UMLS:C1457887", "biolink:Cohort", "UMLS:C1550655",
                          "CHEBI:25212", "GO:0008150", "UMLS:C0029235", "LOINC:LP7790-1"}
    overly_general_curies = hand_picked_curies.union(set(bh.get_descendants("biolink:NamedThing")))
    # TODO: Later use some better heuristics to identify such nodes?
    node_ids_to_remove = set()
    for node_id, node in canonicalized_nodes_dict.items():
        if set(node["equivalent_curies"]).intersection(overly_general_curies):
            node_ids_to_remove.add(node_id)
    logging.info(f" Identified {len(node_ids_to_remove)} nodes to remove: {node_ids_to_remove}")
    for doomed_node_id in node_ids_to_remove:
        canonicalized_nodes_dict.pop(doomed_node_id, None)
    # Delete any now orphaned edges
    if not is_test:
        orphaned_edge_ids = {edge_id for edge_id, edge in canonicalized_edges_dict.items()
                             if edge["subject"] not in canonicalized_nodes_dict
                             or edge["object"] not in canonicalized_nodes_dict}
        logging.info(f" Deleting {len(orphaned_edge_ids)} edges that were orphaned by the above steps..")
        for doomed_edge_id in orphaned_edge_ids:
            canonicalized_edges_dict.pop(doomed_edge_id, None)
    logging.info(f"Done removing overly general nodes: resulting KG2c now has {len(canonicalized_nodes_dict)} nodes "
                 f"and {len(canonicalized_edges_dict)} edges")
    return canonicalized_nodes_dict, canonicalized_edges_dict
def record_meta_kg_info(is_test: bool):
    """Load the 'lite' KG2c JSON and record meta-KG artifacts.

    Produces the meta KG JSON, neighbor/category counts in the sqlite file, and
    the FDA-approved drugs pickle. File names get a '_test' suffix in test mode.
    """
    suffix = '_test' if is_test else ''
    kg2c_lite_file_name = f"kg2c_lite{suffix}.json"
    meta_kg_file_name = f"kg2c_meta_kg{suffix}.json"
    sqlite_file_name = f"kg2c{suffix}.sqlite"
    fda_approved_file_name = f"fda_approved_drugs{suffix}.pickle"
    # Initiate a BiolinkHelper for the proper Biolink model version
    with open("kg2c_config.json") as config_file:
        bh = BiolinkHelper(json.load(config_file)["biolink_version"])
    start = time.time()
    # Load the 'lite' KG2c file into node/edge dictionaries
    with open(f"{KG2C_DIR}/{kg2c_lite_file_name}", "r") as input_kg_file:
        logging.info(f"Loading {kg2c_lite_file_name} into memory..")
        lite_kg = json.load(input_kg_file)
    nodes_by_id = {node["id"]: node for node in lite_kg["nodes"]}
    edges_by_id = {edge["id"]: edge for edge in lite_kg["edges"]}
    del lite_kg  # free the raw JSON structure; only the indexed dicts are needed
    # Add the 'expanded' node labels (including category ancestors) into the node dictionary
    expanded_labels_property_name = "expanded_labels"
    for node in nodes_by_id.values():
        node[expanded_labels_property_name] = bh.get_ancestors(node["all_categories"], include_mixins=True)
    build_meta_kg(nodes_by_id, edges_by_id, meta_kg_file_name, bh, is_test)
    add_neighbor_counts_to_sqlite(nodes_by_id, edges_by_id, sqlite_file_name,
                                  expanded_labels_property_name, is_test)
    add_category_counts_to_sqlite(nodes_by_id, sqlite_file_name, expanded_labels_property_name)
    generate_fda_approved_drugs_pickle(edges_by_id, fda_approved_file_name)
    logging.info(f"Recording meta KG info took {round((time.time() - start) / 60, 1)} minutes.")
def create_kg2c_tsv_files(canonicalized_nodes_dict: Dict[str, Dict[str, any]], canonicalized_edges_dict: Dict[str, Dict[str, any]], biolink_version: str, is_test: bool):
    """Do final Neo4j-oriented processing of nodes/edges and dump them to TSV files.

    Expands node labels to include category ancestors, string-encodes array
    properties, and duplicates the edge subject/object/predicate into
    '*_for_conversion' columns. Mutates the input dicts in place.
    """
    bh = BiolinkHelper(biolink_version)
    # Convert array fields into the format neo4j wants and do some final processing
    array_node_columns = _get_array_properties("node").union({"node_labels"})
    array_edge_columns = _get_array_properties("edge")
    node_labels_property = _get_node_labels_property()
    for node in canonicalized_nodes_dict.values():
        node['node_labels'] = bh.get_ancestors(node[node_labels_property], include_mixins=True)
        for node_property in array_node_columns:
            node[node_property] = _convert_list_to_string_encoded_format(node[node_property])
    for edge in canonicalized_edges_dict.values():
        if not is_test:
            # Make sure we don't have any orphan edges
            assert edge['subject'] in canonicalized_nodes_dict
            assert edge['object'] in canonicalized_nodes_dict
        for edge_property in array_edge_columns:
            edge[edge_property] = _convert_list_to_string_encoded_format(edge[edge_property])
        edge['predicate_for_conversion'] = edge['predicate']
        edge['subject_for_conversion'] = edge['subject']
        edge['object_for_conversion'] = edge['object']
    # Finally dump all our nodes/edges into TSVs (formatted for neo4j)
    logging.info(f" Creating TSVs for Neo4j..")
    _write_list_to_neo4j_ready_tsv(list(canonicalized_nodes_dict.values()), "nodes_c", is_test)
    _write_list_to_neo4j_ready_tsv(list(canonicalized_edges_dict.values()), "edges_c", is_test)
def __init__(self, log: ARAXResponse = ARAXResponse()):
    # Set up paths/state for KP selection based on cached meta-map info.
    # NOTE(review): the `ARAXResponse()` default is evaluated once at definition
    # time and shared by every no-arg construction — presumably intentional
    # (a throwaway log), but worth confirming.
    self.meta_map_path = f"{os.path.dirname(os.path.abspath(__file__))}/meta_map_v2.pickle"  # cached KP meta info
    self.timeout_record_path = f"{os.path.dirname(os.path.abspath(__file__))}/kp_timeout_record.pickle"  # cached KP timeout history
    self.log = log
    self.all_kps = eu.get_all_kps()
    self.timeout_record = self._load_timeout_record()
    self.meta_map = self._load_meta_map()
    self.biolink_helper = BiolinkHelper()
def build_meta_kg(nodes_by_id: Dict[str, Dict[str, any]], edges_by_id: Dict[str, Dict[str, any]],
                  meta_kg_file_name: str, biolink_helper: BiolinkHelper, is_test: bool):
    # Build the TRAPI-style meta KG (meta edges + meta nodes with id prefixes)
    # from the loaded KG2c nodes/edges, and save it as JSON under KG2C_DIR.
    logging.info(f"Building meta KG..")
    logging.info(" Gathering all meta triples..")
    # One meta triple per (subject category, predicate, object category) combo
    # observed on any edge, with conflated categories included.
    meta_triples = set()
    for edge in edges_by_id.values():
        subject_node_id = edge["subject"]
        object_node_id = edge["object"]
        # In test mode some edge endpoints may be missing from nodes_by_id, so skip those.
        # NOTE(review): when is_test is False this assumes no orphan edges exist —
        # a missing endpoint would raise KeyError below.
        if not is_test or (subject_node_id in nodes_by_id and object_node_id in nodes_by_id):
            subject_node = nodes_by_id[subject_node_id]
            object_node = nodes_by_id[object_node_id]
            subject_categories = biolink_helper.add_conflations(subject_node["all_categories"])
            object_categories = biolink_helper.add_conflations(object_node["all_categories"])
            predicate = edge["predicate"]
            for subject_category in subject_categories:
                for object_category in object_categories:
                    meta_triples.add((subject_category, predicate, object_category))
    kg2_infores_curie = "infores:rtx-kg2"
    # Every meta edge carries the same two source attributes (KG2 as both
    # knowledge source and aggregator).
    standard_attributes = [{"attribute_type_id": "biolink:knowledge_source",
                            "attribute_source": kg2_infores_curie},
                           {"attribute_type_id": "biolink:aggregator_knowledge_source",
                            "attribute_source": kg2_infores_curie}]
    meta_edges = [{"subject": triple[0],
                   "predicate": triple[1],
                   "object": triple[2],
                   "attributes": standard_attributes} for triple in meta_triples]
    logging.info(f" Created {len(meta_edges)} meta edges")
    logging.info(" Gathering all meta nodes..")
    with open(f"{KG2C_DIR}/equivalent_curies.pickle", "rb") as equiv_curies_file:
        equivalent_curies_dict = pickle.load(equiv_curies_file)
    # category -> {"id_prefixes": set of curie prefixes seen for nodes of that category}
    meta_nodes = defaultdict(lambda: defaultdict(lambda: set()))
    for node_id, node in nodes_by_id.items():
        # Fall back to the node's own id if it has no recorded equivalent curies
        equivalent_curies = equivalent_curies_dict.get(node_id, [node_id])
        prefixes = {curie.split(":")[0] for curie in equivalent_curies}
        # NOTE(review): uses node["category"] (singular) here, while the meta-edge
        # pass above uses "all_categories" — confirm that asymmetry is intended.
        categories = biolink_helper.add_conflations(node["category"])
        for category in categories:
            meta_nodes[category]["id_prefixes"].update(prefixes)
    logging.info(f" Created {len(meta_nodes)} meta nodes")
    logging.info(" Saving meta KG to JSON file..")
    meta_kg = {"nodes": meta_nodes, "edges": meta_edges}
    with open(f"{KG2C_DIR}/{meta_kg_file_name}", "w+") as meta_kg_file:
        # serialize_with_sets handles the set values inside meta_nodes
        json.dump(meta_kg, meta_kg_file, default=serialize_with_sets, indent=2)
def _get_supported_prefixes(self, categories: List[str], kp: str) -> Set[str]:
    """Return the (uppercased) curie prefixes that the given KP supports for any of
    the given categories or their descendants (mixins excluded), per the meta map."""
    bh = BiolinkHelper()
    all_relevant_categories = bh.get_descendants(eu.convert_to_list(categories),
                                                 include_mixins=False)
    kp_prefix_map = self.meta_map[kp]["prefixes"]
    supported_prefixes = set()
    for category in all_relevant_categories:
        for prefix in kp_prefix_map.get(category, set()):
            supported_prefixes.add(prefix.upper())
    return supported_prefixes
def __init__(self, response_object: ARAXResponse):
    """Store the shared response object and set up KG2-querying configuration."""
    self.response = response_object
    self.biolink_helper = BiolinkHelper()
    # Infores curie identifying RTX-KG2 as the source
    self.kg2_infores_curie = "infores:rtx-kg2"
    # Batching and size limits for KG2 queries
    self.curie_batch_size = 100
    self.max_edges_per_input_curie = 1000
    self.max_allowed_edges = 1_000_000
def check_for_canonical_predicates(kg: QGOrganizedKnowledgeGraph, kp_name: str,
                                   log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """Ensure every edge in the KG uses its canonical predicate, flipping any that
    don't and logging a warning listing the offending predicates."""
    biolink_helper = BiolinkHelper()
    corrected_predicates = set()
    for edges in kg.edges_by_qg_id.values():
        for edge in edges.values():
            canonical_predicate = biolink_helper.get_canonical_predicates(edge.predicate)[0]
            if edge.predicate != canonical_predicate:
                corrected_predicates.add(edge.predicate)
                _ = flip_edge(edge, canonical_predicate)
    if corrected_predicates:
        log.warning(f"{kp_name}: Found edges in {kp_name}'s answer that use non-canonical "
                    f"predicates: {corrected_predicates}. I corrected these.")
    return kg
def __init__(self, response, message, params):
    """Set up overlay state and connect to the local COHD database.

    :param response: shared ARAX response object (used for logging/errors)
    :param message: TRAPI message whose knowledge graph will be decorated
    :param params: overlay parameters for this call
    """
    self.response = response
    self.message = message
    self.parameters = params
    # Which node types/categories each KP can say something about
    self.who_knows_about_what = {
        'COHD': ['small_molecule', 'phenotypic_feature', 'disease', 'drug',
                 'biolink:SmallMolecule', 'biolink:PhenotypicFeature',
                 'biolink:Disease', 'biolink:Drug']
    }  # FIXME: replace this with information about the KP's, KS's, and their API's
    self.node_curie_to_type = dict()
    self.biolink_helper = BiolinkHelper()
    self.global_iter = 0  # used to generate unique virtual edge ids
    try:
        self.cohdIndex = COHDIndex()
    # Fix: was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit;
    # only genuine errors should be caught and reported here.
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        self.response.error(tb, error_code=error_type.__name__)
        self.response.error(f"Internal Error encountered connecting to the local COHD database.")
def __init__(self):
    # Set up an HTTP request cache next to this module and the approved
    # category-conflation map.
    self.location = os.path.dirname(os.path.abspath(__file__))
    # Cache outgoing HTTP requests on disk to avoid repeated remote lookups
    requests_cache.install_cache(self.location + '/category_manager.cache')
    self.bh = BiolinkHelper()
    # Holders for 'ancestors' and 'relevant_categories' lookups (empty at start)
    self.categories = {'ancestors': {}, 'relevant_categories': {}}
    # Categories we deliberately treat as interchangeable with one another
    self.approved_conflations = {
        'biolink:Gene': ['biolink:Protein'],
        'biolink:Protein': ['biolink:Gene'],
        # Decided 2021-07-28 mini-hackathon that we may be best off NOT doing conflation here. Just use
        # ChemicalEntity to refer to everything
        #
        #'biolink:Drug': [ 'biolink:ChemicalEntity', 'biolink:MolecularEntity', 'biolink:SmallMolecule' ],
        #'biolink:ChemicalEntity': [ 'biolink:Drug', 'biolink:MolecularEntity', 'biolink:SmallMolecule' ],
        #'biolink:SmallMolecule': [ 'biolink:Drug', 'biolink:MolecularEntity', 'biolink:ChemicalEntity' ],
        #'biolink:MolecularEntity': [ 'biolink:Drug', 'biolink:SmallMolecule', 'biolink:ChemicalEntity' ],
        'biolink:Disease': ['biolink:PhenotypicFeature'],
        'biolink:PhenotypicFeature': ['biolink:Disease'],
        'biolink:DiseaseOrPhenotypicFeature': ['biolink:Disease', 'biolink:PhenotypicFeature'],
    }
class OverlayClinicalInfo:
    """Overlays clinical co-occurrence information from COHD (Columbia Open Health
    Data) onto edges of the knowledge graph, either as attributes on existing
    edges or as new 'virtual' edges between qnode pairs."""

    #### Constructor
    def __init__(self, response, message, params):
        self.response = response
        self.message = message
        self.parameters = params
        # Which node types/categories each KP can say something about
        self.who_knows_about_what = {'COHD': ['small_molecule', 'phenotypic_feature', 'disease', 'drug', 'biolink:SmallMolecule', 'biolink:PhenotypicFeature', 'biolink:Disease', 'biolink:Drug']}  # FIXME: replace this with information about the KP's, KS's, and their API's
        self.node_curie_to_type = dict()
        self.biolink_helper = BiolinkHelper()
        self.global_iter = 0  # used to generate unique virtual edge ids
        try:
            self.cohdIndex = COHDIndex()
        except:  # NOTE(review): bare except — also catches KeyboardInterrupt/SystemExit
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Internal Error encountered connecting to the local COHD database.")

    def decorate(self):
        """
        Main decorator: looks at parameters and figures out which subroutine to farm out to
        :param parameters:
        :return: response object
        """
        # First, make a dictionary between node curie and type to make sure we're only looking at edges we can handle
        self.response.info("Converting CURIE identifiers to human readable names")
        try:
            for key, node in self.message.knowledge_graph.nodes.items():
                self.node_curie_to_type[key] = node.categories  # WARNING: this is a list
        except:  # NOTE(review): bare except
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Something went wrong when converting names")
            return self.response

        # Dispatch to whichever COHD computations the parameters enabled
        parameters = self.parameters
        if 'paired_concept_frequency' in parameters:
            if parameters['paired_concept_frequency'] == 'true':
                self.paired_concept_frequency()  # TODO: should I return the response and merge, or is it passed by reference and just return at the end?
        if 'associated_concept_freq' in parameters:
            if parameters['associated_concept_freq'] == 'true':
                #self.associated_concept_freq()  # TODO: make this function, and all the other COHD functions too
                pass
        if 'chi_square' in parameters:
            if parameters['chi_square'] == 'true':
                self.chi_square()  # TODO: make this function, and all the other COHD functions too
                pass
        if 'observed_expected_ratio' in parameters:
            if parameters['observed_expected_ratio'] == 'true':
                self.observed_expected_ratio()  # TODO: make this function, and all the other COHD functions too
                pass
        if 'relative_frequency' in parameters:
            if parameters['relative_frequency'] == 'true':
                #self.associated_concept_freq()  # TODO: make this function, and all the other COHD functions too
                pass
        return self.response

    def in_common(self, list1, list2):
        """
        Helper function that returns true iff list1 and list2 have any elements in common
        :param list1: a list of strings (intended to be biolink node types)
        :param list2: another list of strings (intended to be biolink node types)
        :return: True/False if they share an element in common
        """
        if set(list1).intersection(set(list2)):
            return True
        else:
            return False

    def make_edge_attribute_from_curies(self, subject_curie, object_curie, subject_name="", object_name="", default=0., name=""):
        """
        Generic function to make an edge attribute
        :subject_curie: CURIE of the subject node for the edge under consideration
        :object_curie: CURIE of the object node for the edge under consideration
        :subject_name: text name of the subject node (in case the KP doesn't understand the CURIE)
        :object_name: text name of the object node (in case the KP doesn't understand the CURIE)
        :default: default value of the edge attribute
        :name: name of the KP functionality you want to apply
        :return: an EdgeAttribute, or None if no KP covers both node types
        """
        try:
            # edge attributes
            name = name
            type = "EDAM:data_0951"  # NOTE(review): shadows the builtin `type` inside this method
            url = "http://cohd.smart-api.info/"
            value = default
            node_curie_to_type = self.node_curie_to_type
            subject_type = node_curie_to_type[subject_curie]
            object_type = node_curie_to_type[object_curie]
            # figure out which knowledge provider to use
            # TODO: should handle this in a more structured fashion, does there exist a standardized KP API format?
            KP_to_use = None
            for KP in self.who_knows_about_what:
                # see which KP's can label both subjects of information
                if self.in_common(self.biolink_helper.get_descendants(subject_type, include_mixins=False), self.who_knows_about_what[KP]) and self.in_common(self.biolink_helper.get_descendants(object_type, include_mixins=False), self.who_knows_about_what[KP]):
                    KP_to_use = KP
            if KP_to_use == 'COHD':
                self.response.debug(f"Querying Columbia Open Health data for info about {subject_name} and {object_name}")
                # convert CURIE to OMOP identifiers, using the batch mapping computed
                # earlier by the caller (self.mapping_curie_to_omop_ids)
                res = self.mapping_curie_to_omop_ids.get(subject_curie, [])
                if len(res) != 0:
                    subject_OMOPs = res
                else:
                    subject_OMOPs = []
                res = self.mapping_curie_to_omop_ids.get(object_curie, [])
                if len(res) != 0:
                    object_OMOPs = res
                else:
                    object_OMOPs = []
                # (Older commented-out code that queried COHD directly per curie — including a
                # name-based workaround for CHEMBL drugs — removed for clarity; see git history.)

                # Decide how to handle the response from the KP
                if name == 'paired_concept_frequency':
                    # take the largest paired frequency over all OMOP id pairs
                    # TODO check with COHD people to see if this is kosher
                    frequency = default
                    omop_pairs = [f"{omop1}_{omop2}" for (omop1, omop2) in itertools.product(subject_OMOPs, object_OMOPs)]
                    if len(omop_pairs) != 0:
                        res = self.cohdIndex.get_paired_concept_freq(concept_id_pair=omop_pairs, dataset_id=3)  # use the hierarchical dataset
                        if len(res) != 0:
                            maximum_concept_frequency = res[0]['concept_frequency']  # the result returned from get_paired_concept_freq was sorted by decreasing order
                            frequency = maximum_concept_frequency
                    # decorate the edges
                    value = frequency
                elif name == 'observed_expected_ratio':
                    # should probably take the largest obs/exp ratio
                    # TODO: check with COHD people to see if this is kosher
                    # FIXME: the ln_ratio can be negative, so I should probably account for this, but the object model doesn't like -np.inf
                    value = float("-inf")  # FIXME: unclear in object model if attribute type dictates value type, or if value always needs to be a string
                    # (Older commented-out per-OMOP-pair loop and a speed experiment removed; see git history.)
                    omop_pairs = [f"{omop1}_{omop2}" for (omop1, omop2) in itertools.product(subject_OMOPs, object_OMOPs)]
                    if len(omop_pairs) != 0:
                        res = self.cohdIndex.get_obs_exp_ratio(concept_id_pair=omop_pairs, domain="", dataset_id=3)  # use the hierarchical dataset
                        if len(res) != 0:
                            maximum_ln_ratio = res[0]['ln_ratio']  # the result returned from get_obs_exp_ratio was sorted by decreasing order
                            value = maximum_ln_ratio
                elif name == 'chi_square':
                    # take the smallest p-value over all OMOP id pairs (lower is better)
                    value = float("inf")
                    # (Older commented-out per-OMOP-pair loop removed; see git history.)
                    omop_pairs = [f"{omop1}_{omop2}" for (omop1, omop2) in itertools.product(subject_OMOPs, object_OMOPs)]
                    if len(omop_pairs) != 0:
                        res = self.cohdIndex.get_chi_square(concept_id_pair=omop_pairs, domain="", dataset_id=3)  # use the hierarchical dataset
                        if len(res) != 0:
                            minimum_pvalue = res[0]['p-value']  # the result returned from get_chi_square was sorted appropriately
                            value = minimum_pvalue
                # create the edge attribute
                edge_attribute = EdgeAttribute(attribute_type_id=type, original_attribute_name=name, value=str(value), value_url=url)  # populate the edge attribute  # FIXME: unclear in object model if attribute type dictates value type, or if value always needs to be a string
                return edge_attribute
            else:
                return None
        except:  # NOTE(review): bare except; also, KP_to_use may be unbound here if the failure happened before its assignment
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Something went wrong when adding the edge attribute from {KP_to_use}.")

    def add_virtual_edge(self, name="", default=0.):
        """
        Generic function to add a virtual edge to the KG an QG
        :name: name of the functionality of the KP to use
        """
        parameters = self.parameters
        subject_curies_to_decorate = set()
        object_curies_to_decorate = set()
        curies_to_names = dict()  # FIXME: Super hacky way to get around the fact that COHD can't map CHEMBL drugs
        # identify the nodes that we should be adding virtual edges for
        for key, node in self.message.knowledge_graph.nodes.items():
            if hasattr(node, 'qnode_keys'):
                if parameters['subject_qnode_key'] in node.qnode_keys:
                    subject_curies_to_decorate.add(key)
                    curies_to_names[key] = node.name  # FIXME: Super hacky way to get around the fact that COHD can't map CHEMBL drugs
                if parameters['object_qnode_key'] in node.qnode_keys:
                    object_curies_to_decorate.add(key)
                    curies_to_names[key] = node.name  # FIXME: Super hacky way to get around the fact that COHD can't map CHEMBL drugs
        added_flag = False  # check to see if any edges where added
        # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
        # call COHD api one time (batched) to save time
        curies_to_decorate = set()
        curies_to_decorate.update(subject_curies_to_decorate)
        curies_to_decorate.update(object_curies_to_decorate)
        self.mapping_curie_to_omop_ids = self.cohdIndex.get_concept_ids(curies_to_decorate)
        for (subject_curie, object_curie) in itertools.product(subject_curies_to_decorate, object_curies_to_decorate):
            # create the edge attribute if it can be
            edge_attribute = self.make_edge_attribute_from_curies(subject_curie, object_curie,
                                                                  subject_name=curies_to_names[subject_curie],
                                                                  object_name=curies_to_names[object_curie],
                                                                  default=default,
                                                                  name=name)
            if edge_attribute:
                added_flag = True
                # make the edge, add the attribute
                # edge properties
                now = datetime.now()
                edge_type = f"biolink:has_real_world_evidence_of_association_with"
                qedge_keys = [parameters['virtual_relation_label']]
                relation = parameters['virtual_relation_label']
                is_defined_by = "ARAX"
                defined_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
                provided_by = "infores:arax"
                confidence = None
                weight = None  # TODO: could make the actual value of the attribute
                subject_key = subject_curie
                object_key = object_curie
                # now actually add the virtual edges in
                id = f"{relation}_{self.global_iter}"
                # ensure the id is unique (retry with a random suffix on collision)
                # might need to change after expand is implemented for TRAPI 1.0
                while id in self.message.knowledge_graph.edges:
                    id = f"{relation}_{self.global_iter}.{random.randint(10**(9-1), (10**9)-1)}"
                self.global_iter += 1
                edge_attribute_list = [
                    edge_attribute,
                    EdgeAttribute(original_attribute_name="virtual_relation_label", value=relation, attribute_type_id="biolink:Unknown"),
                    #EdgeAttribute(original_attribute_name="is_defined_by", value=is_defined_by, attribute_type_id="biolink:Unknown"),
                    EdgeAttribute(original_attribute_name="defined_datetime", value=defined_datetime, attribute_type_id="metatype:Datetime"),
                    EdgeAttribute(original_attribute_name="provided_by", value=provided_by, attribute_type_id="biolink:aggregator_knowledge_source", attribute_source=provided_by, value_type_id="biolink:InformationResource"),
                    EdgeAttribute(original_attribute_name=None, value=True, attribute_type_id="biolink:computed_value", attribute_source="infores:arax-reasoner-ara", value_type_id="metatype:Boolean", value_url=None, description="This edge is a container for a computed value between two nodes that is not directly attachable to other edges.")
                    #EdgeAttribute(name="confidence", value=confidence, type="biolink:ConfidenceLevel"),
                    #EdgeAttribute(name="weight", value=weight, type="metatype:Float"),
                    #EdgeAttribute(name="qedge_ids", value=qedge_ids)
                ]
                # (Older pre-TRAPI-1.0 Edge construction kept commented out in git history.)
                edge = Edge(predicate=edge_type, subject=subject_key, object=object_key,
                            attributes=edge_attribute_list)
                edge.qedge_keys = qedge_keys
                self.message.knowledge_graph.edges[id] = edge
                if self.message.results is not None and len(self.message.results) > 0:
                    ou.update_results_with_overlay_edge(subject_knode_key=subject_key, object_knode_key=object_key, kedge_key=id, message=self.message, log=self.response)
        # Now add a q_edge the query_graph since I've added an extra edge to the KG
        if added_flag:
            edge_type = f"biolink:has_real_world_evidence_of_association_with"
            relation = parameters['virtual_relation_label']
            qedge_keys = [parameters['virtual_relation_label']]
            subject_qnode_key = parameters['subject_qnode_key']
            object_qnode_key = parameters['object_qnode_key']
            option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key, self.message.query_graph, self.response)
            # q_edge = QEdge(id=relation, type=edge_type, relation=relation, subject_key=subject_qnode_key, object_key=object_qnode_key, option_group_id=option_group_id)  # TODO: ok to make the id and type the same thing?
            q_edge = QEdge(predicates=edge_type, subject=subject_qnode_key, object=object_qnode_key, option_group_id=option_group_id)
            q_edge.relation = relation
            self.message.query_graph.edges[relation] = q_edge

    def add_all_edges(self, name="", default=0.):
        # Decorate every existing KG edge with the requested COHD attribute.
        curies_to_names = dict()
        all_curie_set = set()
        for key, node in self.message.knowledge_graph.nodes.items():
            curies_to_names[key] = node.name
            all_curie_set.add(key)
        # One batched COHD lookup for all curies, used by make_edge_attribute_from_curies
        self.mapping_curie_to_omop_ids = self.cohdIndex.get_concept_ids(all_curie_set)
        for edge in self.message.knowledge_graph.edges.values():
            if not edge.attributes:  # populate if not already there
                edge.attributes = []
            subject_curie = edge.subject
            object_curie = edge.object
            edge_attribute = self.make_edge_attribute_from_curies(subject_curie, object_curie, subject_name=curies_to_names[subject_curie], object_name=curies_to_names[object_curie], default=default, name=name)  # FIXME: Super hacky way to get around the fact that COHD can't map CHEMBL drugs
            if edge_attribute:  # make sure an edge attribute was actually created
                edge.attributes.append(edge_attribute)

    def paired_concept_frequency(self, default=0):
        """
        Calculate paired concept frequency.
        Retrieves observed clinical frequencies of a pair of concepts.
        :return: response
        """
        parameters = self.parameters
        self.response.debug("Computing paired concept frequencies.")
        self.response.info("Overlaying paired concept frequencies utilizing Columbia Open Health Data. This calls an external knowledge provider and may take a while")
        # Now add the edges or virtual edges
        try:
            if 'virtual_relation_label' in parameters:
                if 'subject_qnode_key' in parameters and 'object_qnode_key' in parameters:
                    self.add_virtual_edge(name="paired_concept_frequency", default=default)
                else:
                    # No qnode pair given: add one virtual edge per distinct qnode pair in the QG
                    seen_node_pairs = set()
                    qgraph_edges = copy.deepcopy(list(self.response.envelope.message.query_graph.edges.values()))
                    for query_edge in qgraph_edges:
                        current_subject_qnode_key = query_edge.subject
                        current_object_qnode_key = query_edge.object
                        # order-normalize the pair so (a, b) and (b, a) dedupe together
                        if current_subject_qnode_key < current_object_qnode_key:
                            qnode_key_pair = (current_subject_qnode_key, current_object_qnode_key)
                        else:
                            qnode_key_pair = (current_object_qnode_key, current_subject_qnode_key)
                        # FW: check if we have already added an edge for this pair
                        if qnode_key_pair in seen_node_pairs:
                            pass
                        else:
                            seen_node_pairs.add(qnode_key_pair)
                            # temporarily inject the pair into parameters for add_virtual_edge
                            parameters['subject_qnode_key'] = current_subject_qnode_key
                            parameters['object_qnode_key'] = current_object_qnode_key
                            self.add_virtual_edge(name="paired_concept_frequency", default=default)
                            parameters.pop('subject_qnode_key')
                            parameters.pop('object_qnode_key')
            else:
                # otherwise, just add to existing edges in the KG
                self.add_all_edges(name="paired_concept_frequency", default=default)
        except:  # NOTE(review): bare except
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Something went wrong when overlaying clinical info")

    def observed_expected_ratio(self, default=0):
        """
        Returns the natural logarithm of the ratio between the observed count and expected count.
        Expected count is calculated from the single concept frequencies and assuming independence between the concepts.
        Results are returned as maximum over all ln_ratios matching to OMOP concept id.
        """
        parameters = self.parameters
        self.response.debug("Computing observed expected ratios.")
        self.response.info("Overlaying observed expected ratios utilizing Columbia Open Health Data. This calls an external knowledge provider and may take a while")
        # Now add the edges or virtual edges
        try:
            if 'virtual_relation_label' in parameters:
                if 'subject_qnode_key' in parameters and 'object_qnode_key' in parameters:
                    self.add_virtual_edge(name="observed_expected_ratio", default=default)
                else:
                    # No qnode pair given: add one virtual edge per distinct qnode pair in the QG
                    seen_node_pairs = set()
                    qgraph_edges = copy.deepcopy(list(self.response.envelope.message.query_graph.edges.values()))
                    for query_edge in qgraph_edges:
                        current_subject_qnode_key = query_edge.subject
                        current_object_qnode_key = query_edge.object
                        # order-normalize the pair so (a, b) and (b, a) dedupe together
                        if current_subject_qnode_key < current_object_qnode_key:
                            qnode_key_pair = (current_subject_qnode_key, current_object_qnode_key)
                        else:
                            qnode_key_pair = (current_object_qnode_key, current_subject_qnode_key)
                        # FW: check if we have already added an edge for this pair
                        if qnode_key_pair in seen_node_pairs:
                            pass
                        else:
                            seen_node_pairs.add(qnode_key_pair)
                            # temporarily inject the pair into parameters for add_virtual_edge
                            parameters['subject_qnode_key'] = current_subject_qnode_key
                            parameters['object_qnode_key'] = current_object_qnode_key
                            self.add_virtual_edge(name="observed_expected_ratio", default=default)
                            parameters.pop('subject_qnode_key')
                            parameters.pop('object_qnode_key')
            else:
                # otherwise, just add to existing edges in the KG
                self.add_all_edges(name="observed_expected_ratio", default=default)
        except:  # NOTE(review): bare except
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Something went wrong when overlaying clinical info")

    def chi_square(self, default=float("inf")):
        """
        Returns the chi-square statistic and p-value between pairs of concepts.
        Results are returned in descending order of the chi-square statistic.
        Note that due to large sample sizes, the chi-square can become very large.
        The expected frequencies for the chi-square analysis are calculated based on the single concept
        frequencies and assuming independence between concepts. P-value is calculated with 1 DOF.
        """
        parameters = self.parameters
        self.response.debug("Computing Chi square p-values.")
        self.response.info("Overlaying Chi square p-values utilizing Columbia Open Health Data. This calls an external knowledge provider and may take a while")
        # Now add the edges or virtual edges
        try:
            if 'virtual_relation_label' in parameters:
                if 'subject_qnode_key' in parameters and 'object_qnode_key' in parameters:
                    self.add_virtual_edge(name="chi_square", default=default)
                else:
                    # No qnode pair given: add one virtual edge per distinct qnode pair in the QG
                    seen_node_pairs = set()
                    qgraph_edges = copy.deepcopy(list(self.response.envelope.message.query_graph.edges.values()))
                    for query_edge in qgraph_edges:
                        current_subject_qnode_key = query_edge.subject
                        current_object_qnode_key = query_edge.object
                        # order-normalize the pair so (a, b) and (b, a) dedupe together
                        if current_subject_qnode_key < current_object_qnode_key:
                            qnode_key_pair = (current_subject_qnode_key, current_object_qnode_key)
                        else:
                            qnode_key_pair = (current_object_qnode_key, current_subject_qnode_key)
                        # FW: check if we have already added an edge for this pair
                        if qnode_key_pair in seen_node_pairs:
                            pass
                        else:
                            seen_node_pairs.add(qnode_key_pair)
                            # temporarily inject the pair into parameters for add_virtual_edge
                            parameters['subject_qnode_key'] = current_subject_qnode_key
                            parameters['object_qnode_key'] = current_object_qnode_key
                            self.add_virtual_edge(name="chi_square", default=default)
                            parameters.pop('subject_qnode_key')
                            parameters.pop('object_qnode_key')
            else:
                # otherwise, just add to existing edges in the KG
                self.add_all_edges(name="chi_square", default=default)
        except:  # NOTE(review): bare except
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Something went wrong when overlaying clinical info")
class KPSelector:
    """
    Selects which knowledge providers (KPs) can answer a given one-hop query graph, based on each KP's
    /meta_knowledge_graph info (cached locally as a pickled "meta map"), and converts query curies into
    prefixes each KP supports. Also maintains a pickled record of recent KP timeouts so unresponsive
    KPs aren't re-queried for 10 minutes.
    """

    def __init__(self, log: Optional["ARAXResponse"] = None):
        """
        :param log: ARAXResponse to log into; a fresh one is created if none is given.

        Fix: the original signature used a mutable default argument (log=ARAXResponse()), which was evaluated
        once at class-definition time and therefore shared a single ARAXResponse across every KPSelector
        constructed without an explicit log.
        """
        self.meta_map_path = f"{os.path.dirname(os.path.abspath(__file__))}/meta_map_v2.pickle"
        self.timeout_record_path = f"{os.path.dirname(os.path.abspath(__file__))}/kp_timeout_record.pickle"
        self.log = log if log is not None else ARAXResponse()
        self.all_kps = eu.get_all_kps()
        self.timeout_record = self._load_timeout_record()
        self.meta_map = self._load_meta_map()
        self.biolink_helper = BiolinkHelper()

    def get_kps_for_single_hop_qg(self, qg: "QueryGraph") -> Optional[Set[str]]:
        """
        This function returns the names of the KPs that say they can answer the given one-hop query graph
        (based on the categories/predicates the QG uses). Returns None if the QG has more than one edge.
        """
        qedge_key = next(iter(qg.edges))
        qedge = qg.edges[qedge_key]
        self.log.debug(f"Selecting KPs to use for qedge {qedge_key}")
        # Confirm that the qg is one hop
        if len(qg.edges) > 1:
            self.log.error(f"Query graph can only have one edge, but instead has {len(qg.edges)}.",
                           error_code="UnexpectedQG")
            return None
        # Isolate possible subject/predicate/object from the qg (expanded to include Biolink descendants)
        sub_categories = set(self.biolink_helper.get_descendants(qg.nodes[qedge.subject].categories))
        obj_categories = set(self.biolink_helper.get_descendants(qg.nodes[qedge.object].categories))
        predicates = set(self.biolink_helper.get_descendants(qedge.predicates))
        symmetrical_predicates = set(filter(self.biolink_helper.is_symmetric, predicates))
        # Use the meta map to check each KP for the subject/predicate/object triple
        accepting_kps = set()
        for kp in self.meta_map:
            if self._triple_is_in_meta_map(kp, sub_categories, predicates, obj_categories):
                accepting_kps.add(kp)
            # Account for symmetrical predicates by checking if the KP accepts with swapped sub/obj categories
            elif self._triple_is_in_meta_map(kp, obj_categories, symmetrical_predicates, sub_categories):
                accepting_kps.add(kp)
            else:
                self.log.update_query_plan(qedge_key, kp, "Skipped", "MetaKG indicates this qedge is unsupported")
        kps_missing_meta_info = self.all_kps.difference(set(self.meta_map))
        for missing_kp in kps_missing_meta_info:
            self.log.update_query_plan(qedge_key, missing_kp, "Skipped", "No MetaKG info available")
        return accepting_kps

    def kp_accepts_single_hop_qg(self, qg: "QueryGraph", kp: str) -> Optional[bool]:
        """
        This function determines whether a KP can answer a given one-hop query based on the categories/predicates
        used in the query graph. Returns None if the QG has more than one edge.
        """
        self.log.debug(f"Verifying that {kp} can answer this kind of one-hop query")
        # Confirm that the qg is one-hop
        if len(qg.edges) > 1:
            self.log.error(f"Query graph can only have one edge, but instead has {len(qg.edges)}.",
                           error_code="UnexpectedQG")
            return None
        qedge = list(qg.edges.values())[0]
        sub_categories = set(self.biolink_helper.get_descendants(qg.nodes[qedge.subject].categories))
        obj_categories = set(self.biolink_helper.get_descendants(qg.nodes[qedge.object].categories))
        predicates = set(self.biolink_helper.get_descendants(qedge.predicates))
        kp_accepts = self._triple_is_in_meta_map(kp, sub_categories, predicates, obj_categories)
        # Account for symmetrical predicates by checking if the KP accepts with swapped sub/obj categories
        symmetrical_predicates = set(filter(self.biolink_helper.is_symmetric, predicates))
        kp_accepts = kp_accepts or self._triple_is_in_meta_map(kp, obj_categories, symmetrical_predicates,
                                                              sub_categories)
        return kp_accepts

    def get_desirable_equivalent_curies(self, curies: List[str], categories: Optional[List[str]], kp: str) -> List[str]:
        """
        For each input curie, this function returns an equivalent curie(s) that uses a prefix the KP supports.
        Input curies with no supported equivalent are dropped (with a warning); if no meta/prefix info is
        available for the KP, the input curies are returned unchanged.
        """
        self.log.debug(f"{kp}: Converting curies in the QG to kinds that {kp} can answer")
        if not self.meta_map.get(kp):
            self.log.warning(f"{kp}: Somehow missing meta info for {kp}. Cannot do curie prefix conversion; will send "
                             f"curies as they are.")
            return curies
        elif not self.meta_map[kp].get("prefixes"):
            self.log.warning(f"{kp}: No supported prefix info is available for {kp}. Will send curies as they are.")
            return curies
        else:
            supported_prefixes = self._get_supported_prefixes(eu.convert_to_list(categories), kp)
            self.log.debug(f"{kp}: Prefixes {kp} supports for categories {categories} (and descendants) are: "
                           f"{supported_prefixes}")
            converted_curies = set()
            unsupported_curies = set()
            synonyms_dict = eu.get_curie_synonyms_dict(curies)
            # Convert each input curie to a preferred, supported prefix
            for input_curie, equivalent_curies in synonyms_dict.items():
                input_curie_prefix = self._get_uppercase_prefix(input_curie)
                supported_equiv_curies_by_prefix = defaultdict(set)
                for curie in equivalent_curies:
                    prefix = self._get_uppercase_prefix(curie)
                    if prefix in supported_prefixes:
                        supported_equiv_curies_by_prefix[prefix].add(curie)
                if supported_equiv_curies_by_prefix:
                    # Grab equivalent curies with the same prefix as the input curie, if available
                    if input_curie_prefix in supported_equiv_curies_by_prefix:
                        curies_to_send = supported_equiv_curies_by_prefix[input_curie_prefix]
                    # Otherwise pick any supported curie prefix present
                    else:
                        curies_to_send = next(iter(supported_equiv_curies_by_prefix.values()))
                    converted_curies = converted_curies.union(curies_to_send)
                else:
                    unsupported_curies.add(input_curie)
            if unsupported_curies:
                self.log.warning(f"{kp}: Could not find curies with prefixes {kp} prefers for these curies: "
                                 f"{unsupported_curies}; will not send these to KP")
            return list(converted_curies)

    # Returns True if at least one possible triple exists in the KP's meta map
    def _triple_is_in_meta_map(self, kp: str, subject_categories: Set[str], predicates: Set[str],
                               object_categories: Set[str]) -> bool:
        kp_meta_map = self.meta_map.get(kp)
        if not kp_meta_map:
            if kp not in self.all_kps:
                self.log.error(f"{kp} does not seem to be a valid KP for ARAX. Valid KPs are: {self.all_kps}",
                               error_code="InvalidKP")
            else:
                self.log.warning(f"Somehow missing meta info for {kp}.")
            return False
        else:
            predicates_map = kp_meta_map["predicates"]
            # Handle potential emptiness of sub, obj, predicate lists
            if not subject_categories:  # any subject
                subject_categories = set(predicates_map.keys())
            if not object_categories:  # any object
                # Fix: was a throwaway list comprehension used only for its .add() side effects
                object_categories = {obj for obj_dict in predicates_map.values() for obj in obj_dict}
            # NOTE(review): "NGD" here does not match the "infores:arax-normalized-google-distance" key used in
            # _refresh_meta_map -- confirm whether this special case is stale
            any_predicate = not (predicates or kp == "NGD")
            # Handle combinations of subjects and objects using the cross product
            qg_sub_obj_dict = defaultdict(set)
            for sub, obj in product(subject_categories, object_categories):
                qg_sub_obj_dict[sub].add(obj)
            # Check for subjects
            kp_allowed_subs = set(predicates_map.keys())
            accepted_subs = kp_allowed_subs.intersection(set(qg_sub_obj_dict.keys()))
            # Check for objects
            for sub in accepted_subs:
                kp_allowed_objs = set(predicates_map[sub].keys())
                accepted_objs = kp_allowed_objs.intersection(qg_sub_obj_dict[sub])
                if accepted_objs:
                    # Check predicates
                    for obj in accepted_objs:
                        if any_predicate or predicates.intersection(predicates_map[sub][obj]):
                            return True
            return False

    def _load_meta_map(self):
        """
        Loads the cached meta map from disk, refreshing it (by querying KPs) when it is missing, stale (older
        than 24 hours), or missing entries for some KPs; also prunes entries for KPs that no longer exist.
        """
        meta_map_file = pathlib.Path(self.meta_map_path)
        one_day_ago = datetime.now() - timedelta(hours=24)
        if not meta_map_file.exists():
            self.log.debug("Creating local copy of meta map for all KPs")
            meta_map = self._refresh_meta_map()
        elif datetime.fromtimestamp(meta_map_file.stat().st_mtime) < one_day_ago:
            self.log.debug("Doing a refresh of local meta map for all KPs")
            meta_map = self._refresh_meta_map()
        else:
            self.log.debug("Loading meta map (already exists and isn't due for a refresh)")
            with open(self.meta_map_path, "rb") as map_file:
                # NOTE(review): pickle.load on a locally-written cache file; fine as long as the file
                # is only ever produced by this module
                meta_map = pickle.load(map_file)
            # Check for any missing KPs
            missing_kps = self.all_kps.difference(set(meta_map))
            if missing_kps:
                self.log.debug(f"Missing meta info for {missing_kps}")
                meta_map = self._refresh_meta_map(missing_kps, meta_map)
        # Make sure the map doesn't contain any 'stale' KPs
        stale_kps = set(meta_map).difference(self.all_kps)
        if stale_kps:
            for stale_kp in stale_kps:
                self.log.debug(f"Detected a stale KP in meta map ({stale_kp}) - deleting it")
                del meta_map[stale_kp]
            with open(self.meta_map_path, "wb") as map_file:
                pickle.dump(meta_map, map_file)  # Save these changes
        return meta_map

    def _refresh_meta_map(self, kps: Optional[Set[str]] = None, meta_map: Optional[Dict[str, dict]] = None):
        """
        Creates an up-to-date version of the meta map by querying each KP's /meta_knowledge_graph endpoint.
        KPs that timed out within the last 10 minutes are skipped. The combined map and the timeout record
        are pickled to disk before returning.

        :param kps: the KPs to update (defaults to all KPs)
        :param meta_map: an existing meta map to update (defaults to whatever is cached on disk, if anything)
        """
        kps_to_update = kps if kps else self.all_kps
        if not meta_map:
            # Load whatever pre-existing meta-map we might already have (could use this info in case an API fails)
            meta_map_file = pathlib.Path(self.meta_map_path)
            if meta_map_file.exists():
                with open(self.meta_map_path, "rb") as existing_meta_map_file:
                    meta_map = pickle.load(existing_meta_map_file)
            else:
                meta_map = dict()
        # Then (try to) get updated meta info from each KP
        ten_minutes_ago = datetime.now() - timedelta(minutes=10)
        non_functioning_kps = [kp for kp in kps_to_update
                               if self.timeout_record.get(kp) and self.timeout_record[kp] > ten_minutes_ago]
        if non_functioning_kps:
            self.log.debug(f"Not trying to grab meta info for {non_functioning_kps} because they timed out "
                           f"within the last 10 minutes")
        functioning_kps_to_update = set(kps_to_update).difference(set(non_functioning_kps))
        for kp in functioning_kps_to_update:
            kp_endpoint = eu.get_kp_endpoint_url(kp)
            if kp_endpoint:
                try:
                    self.log.debug(f"Getting meta info from {kp}")
                    with requests_cache.disabled():
                        kp_response = requests.get(f"{kp_endpoint}/meta_knowledge_graph", timeout=10)
                except requests.exceptions.Timeout:
                    # Remember the timeout so this KP is skipped for the next 10 minutes
                    self.log.warning(f"Timed out when trying to hit {kp}'s /meta_knowledge_graph endpoint "
                                     f"(waited 10 seconds)")
                    self.timeout_record[kp] = datetime.now()
                except Exception:
                    self.log.warning(f"Ran into a problem getting {kp}'s meta info")
                else:
                    if kp_response.status_code == 200:
                        kp_meta_kg = kp_response.json()
                        meta_map[kp] = {"predicates": self._convert_to_meta_map(kp_meta_kg),
                                        "prefixes": {category: meta_node["id_prefixes"]
                                                     for category, meta_node in kp_meta_kg["nodes"].items()}}
                    else:
                        self.log.warning(f"Unable to access {kp}'s /meta_knowledge_graph endpoint (returned status of "
                                         f"{kp_response.status_code})")
            elif kp == "infores:arax-drug-treats-disease":
                # Internal KP with no endpoint; use a hardcoded meta map
                meta_map[kp] = {"predicates": self._get_dtd_meta_map(), "prefixes": dict()}
            elif kp == "infores:arax-normalized-google-distance":
                # This is just a placeholder; not really used for KP selection
                predicates = {"biolink:NamedThing": {"biolink:NamedThing": {"biolink:has_normalized_google_distance_with"}}}
                meta_map[kp] = {"predicates": predicates, "prefixes": dict()}
        # Save our big combined metamap to a local file (pickle)
        with open(self.meta_map_path, "wb") as map_file:
            pickle.dump(meta_map, map_file)
        with open(self.timeout_record_path, "wb") as timeout_file:
            pickle.dump(self.timeout_record, timeout_file)
        return meta_map

    @staticmethod
    def _convert_to_meta_map(kp_meta_kg: dict) -> dict:
        """Converts a TRAPI meta knowledge graph into a nested {subject: {object: {predicates}}} lookup map."""
        kp_meta_map = dict()
        for meta_edge in kp_meta_kg["edges"]:
            subject_category = meta_edge["subject"]
            object_category = meta_edge["object"]
            predicate = meta_edge["predicate"]
            # setdefault replaces the original's two-step 'if key not in dict' initialization
            kp_meta_map.setdefault(subject_category, dict()).setdefault(object_category, set()).add(predicate)
        return kp_meta_map

    @staticmethod
    def _get_dtd_meta_map():
        """Returns the hardcoded meta map for the internal drug-treats-disease KP."""
        dtd_predicates = {"biolink:treats", "biolink:treated_by"}
        drug_ish_dict = {"biolink:Drug": dtd_predicates,
                         "biolink:SmallMolecule": dtd_predicates}
        disease_ish_dict = {"biolink:Disease": dtd_predicates,
                            "biolink:PhenotypicFeature": dtd_predicates,
                            "biolink:DiseaseOrPhenotypicFeature": dtd_predicates}
        dtd_meta_map = {"biolink:Drug": disease_ish_dict,
                        "biolink:SmallMolecule": disease_ish_dict,
                        "biolink:Disease": drug_ish_dict,
                        "biolink:PhenotypicFeature": drug_ish_dict,
                        "biolink:DiseaseOrPhenotypicFeature": drug_ish_dict}
        return dtd_meta_map

    def _load_timeout_record(self) -> Dict[str, datetime]:
        """Loads the pickled record of when each KP last timed out (empty dict if no record exists yet)."""
        self.log.debug("Loading record of KP timeouts")
        timeout_record_file = pathlib.Path(self.timeout_record_path)
        if not timeout_record_file.exists():
            return dict()
        else:
            with open(self.timeout_record_path, "rb") as timeout_file:
                return pickle.load(timeout_file)

    def make_qg_use_supported_prefixes(self, qg: "QueryGraph", kp_name: str,
                                       log: "ARAXResponse") -> Optional["QueryGraph"]:
        """
        Mutates the given QG so that each pinned qnode's curies use prefixes the KP supports (canonical curies
        for KG2). Returns the modified QG, or None if some qnode's curies have no equivalent with a supported
        prefix (meaning the KP cannot answer the query).
        """
        for qnode_key, qnode in qg.nodes.items():
            if qnode.ids:
                if kp_name == "infores:rtx-kg2":
                    # Just convert them into canonical curies
                    qnode.ids = eu.get_canonical_curies_list(qnode.ids, log)
                else:
                    # Otherwise figure out which kind of curies KPs want
                    categories = eu.convert_to_list(qnode.categories)
                    supported_prefixes = self._get_supported_prefixes(categories, kp_name)
                    used_prefixes = {self._get_uppercase_prefix(curie) for curie in qnode.ids}
                    # Only convert curie(s) if any use an unsupported prefix
                    if used_prefixes.issubset(supported_prefixes):
                        self.log.debug(f"{kp_name}: All {qnode_key} curies use prefix(es) {kp_name} supports; no "
                                       f"conversion necessary")
                    else:
                        self.log.debug(f"{kp_name}: One or more {qnode_key} curies use a prefix {kp_name} doesn't "
                                       f"support; will convert these")
                        converted_curies = self.get_desirable_equivalent_curies(qnode.ids, qnode.categories, kp_name)
                        if converted_curies:
                            log.debug(f"{kp_name}: Converted {qnode_key}'s {len(qnode.ids)} curies to a list of "
                                      f"{len(converted_curies)} curies tailored for {kp_name}")
                            qnode.ids = converted_curies
                        else:
                            log.info(f"{kp_name} cannot answer the query because no equivalent curies were found "
                                     f"with prefixes it supports for qnode {qnode_key}. Original curies were: "
                                     f"{qnode.ids}")
                            return None
        return qg

    @staticmethod
    def _get_uppercase_prefix(curie: str) -> str:
        """Returns the prefix portion of a curie (everything before the first colon), uppercased."""
        return curie.split(":")[0].upper()

    def _get_supported_prefixes(self, categories: List[str], kp: str) -> Set[str]:
        """Returns the uppercased curie prefixes the KP supports for the given categories (and their descendants)."""
        # Fix (consistency/perf): reuse the BiolinkHelper created in __init__ instead of constructing a new
        # (identically default-constructed) one on every call
        categories_with_descendants = self.biolink_helper.get_descendants(eu.convert_to_list(categories),
                                                                          include_mixins=False)
        supported_prefixes = {prefix.upper()
                              for category in categories_with_descendants
                              for prefix in self.meta_map[kp]["prefixes"].get(category, set())}
        return supported_prefixes