class BiolinkModel:
    """Programmatic model of Biolink."""

    root_type = 'biolink:NamedThing'

    def __init__(self, bl_version='1.5.0'):
        """Load the Biolink Model Toolkit for the requested model version."""
        self.bl_url = f'https://raw.githubusercontent.com/biolink/biolink-model/{bl_version}/biolink-model.yaml'
        self.toolkit = Toolkit(self.bl_url)

    def to_camel_case(self, snake_str):
        """Convert a snake_case string to CamelCase."""
        return ''.join(part.title() for part in snake_str.split('_'))

    def get_class(self, name):
        """Get a Python class from the biolink.model module by name."""
        return getattr(sys.modules["biolink.model"], name)

    def is_derived(self, a_class_name, classes):
        """Return True if the named class derives from any of the provided classes.

        Bug fix: the original used isinstance() on a class object, which is
        always False for ordinary (non-metaclass) classes; subclass
        relationships must be tested with issubclass().
        """
        a_class = self.get_class(self.to_camel_case(a_class_name))
        return any(issubclass(a_class, c) for c in classes)

    def find_biolink_leaves(self, biolink_concepts):
        """
        Given a list of biolink concepts, returns the leaves removing any parent concepts.
        :param biolink_concepts: list of biolink concepts
        :return: leaf concepts.
        """
        ancestry_set = set()
        all_mixins_in_tree = set()
        all_concepts = set(biolink_concepts)
        # Keep track of things like "MacromolecularMachine" in current datasets
        # @TODO remove this and mark such nodes as errors
        unknown_elements = set()
        for x in all_concepts:
            current_element = self.toolkit.get_element(x)
            mixins = set()
            if current_element:
                # Mixins are not part of the is_a ancestry, so collect them
                # separately and subtract them from the candidate leaves too.
                if 'mixins' in current_element and len(current_element['mixins']):
                    for m in current_element['mixins']:
                        mixins.add(self.toolkit.get_element(m).class_uri)
            else:
                unknown_elements.add(x)
            ancestors = set(
                self.toolkit.get_ancestors(x, reflexive=False, formatted=True))
            ancestry_set = ancestry_set.union(ancestors)
            all_mixins_in_tree = all_mixins_in_tree.union(mixins)
        leaf_set = all_concepts - ancestry_set - all_mixins_in_tree - unknown_elements
        return leaf_set

    def get_leaf_class(self, names):
        """Return one leaf class from the provided list of names.

        NOTE(review): find_biolink_leaves returns a set, so when several
        leaves remain the one returned here is not deterministic.
        """
        leaves = list(self.find_biolink_leaves(names))
        return leaves[0]
def test_ancestors():
    """Exercise Toolkit.get_ancestors: hierarchy climbing, `formatted`
    output, and reflexivity (ancestors include the query term by default)."""
    tk = Toolkit()
    anc = tk.get_ancestors
    assert 'related to' in anc('causes')
    assert 'biolink:related_to' in anc('causes', formatted=True)
    assert 'named thing' in anc('gene')
    assert 'biolink:NamedThing' in anc('gene', formatted=True)
    # Reflexive by default: a term is its own ancestor unless reflexive=False.
    assert 'causes' in anc('causes')
    assert 'causes' in anc('causes', reflexive=True)
    assert 'causes' not in anc('causes', reflexive=False)
    assert 'biolink:causes' in anc('causes', reflexive=True, formatted=True)
    assert 'drug exposure' in anc('drug intake', reflexive=True)
def test_descendants():
    """Check Toolkit.get_descendants, plus reflexivity of get_ancestors on
    the same toolkit instance."""
    tk = Toolkit()
    # Descendant lookups walk down the hierarchy.
    assert 'causes' in tk.get_descendants('related to')
    assert 'interacts with' in tk.get_descendants('related to')
    assert 'gene' in tk.get_descendants('named thing')
    assert 'phenotypic feature' in tk.get_descendants('named thing')
    assert 'biolink:PhenotypicFeature' in tk.get_descendants('named thing', formatted=True)
    # Ancestor lookups are reflexive unless reflexive=False is passed.
    assert 'genomic entity' in tk.get_ancestors('genomic entity')
    assert 'genomic entity' in tk.get_ancestors('genomic entity', reflexive=True)
    assert 'genomic entity' not in tk.get_ancestors('genomic entity', reflexive=False)
    assert 'biolink:GenomicEntity' in tk.get_ancestors('gene', formatted=True)
    assert 'gross anatomical structure' in tk.get_ancestors('tissue', reflexive=True)
    # Internal slot-usage names must never leak into descendant lists.
    bogus = 'molecular activity_has output'
    assert bogus not in tk.get_descendants('molecular activity', reflexive=True)
    assert bogus not in tk.get_descendants('has output', reflexive=True)
class NodeFactory:
    """Builds normalized node records: chooses the best (preferred-prefix)
    identifier, attaches labels, and expands the node type to its Biolink
    ancestors."""

    def __init__(self, label_dir):
        #self.url_base = 'http://arrival.edc.renci.org:32511/bl'
        self.url_base = 'https://bl-lookup-sri.renci.org/bl'
        self.toolkit = Toolkit(
            'https://raw.githubusercontent.com/biolink/biolink-model/1.6.1/biolink-model.yaml'
        )
        # Per-type caches so toolkit / lookup-service calls happen once.
        self.ancestor_map = {}
        self.prefix_map = {}
        # (prefix, node_type) pairs we've already warned about.
        self.ignored_prefixes = set()
        # prefix -> {curie: label} loaded lazily from label files.
        self.extra_labels = {}
        self.label_dir = label_dir

    def get_ancestors(self, input_type):
        """Return the Biolink class URIs of input_type and all of its
        ancestors, input_type first. Memoized in self.ancestor_map."""
        if input_type in self.ancestor_map:
            return self.ancestor_map[input_type]
        a = self.toolkit.get_ancestors(input_type)
        ancs = [self.toolkit.get_element(ai)['class_uri'] for ai in a]
        if input_type not in ancs:
            ancs = [input_type] + ancs
        self.ancestor_map[input_type] = ancs
        return ancs

    def get_prefixes(self, input_type):
        """Return the ordered, de-duplicated list of preferred CURIE
        prefixes for a node type, as served by the bl-lookup service.
        Memoized in self.prefix_map."""
        if input_type in self.prefix_map:
            return self.prefix_map[input_type]
        url = f'{self.url_base}/{input_type}'
        response = requests.get(url)
        try:
            j = response.json()
            prefs = j['id_prefixes']
        # Bug fix: was a bare `except:`, which also swallows
        # KeyboardInterrupt/SystemExit; narrowed to Exception.
        except Exception:
            #this is a mega hack to deal with the taxon change
            prefs = ['NCBITaxon', 'MESH']
        # The prefs are in a particular order, but apparently it can have
        # dups (ugh). Collapse *adjacent* duplicates only, preserving the
        # service's preference ordering.
        newprefs = ['']
        for pref in prefs:
            if not pref == newprefs[-1]:
                newprefs.append(pref)
        prefs = newprefs[1:]
        self.prefix_map[input_type] = prefs
        return prefs

    def make_json_id(self, input):
        """Convert an identifier (LabeledID or plain curie string) to the
        {'identifier': ..., 'label': ...} dict form; the label key is only
        present when a non-empty label exists."""
        if isinstance(input, LabeledID):
            if input.label is not None and input.label != '':
                return {'identifier': input.identifier, 'label': input.label}
            return {'identifier': input.identifier}
        return {'identifier': input}

    def clean_list(self, input_identifiers):
        """De-duplicate a mixed list of LabeledIDs and plain curies,
        preferring the LabeledID form when both are present."""
        #Sometimes we end up with something like [(HP:123,'name'),HP:123,UMLS:3445] Clean up
        cleanup = defaultdict(list)
        for x in list(input_identifiers):
            if isinstance(x, LabeledID):
                cleanup[x.identifier].append(x)
            else:
                cleanup[x].append(x)
        cleaned = []
        for v in cleanup.values():
            if len(v) == 1:
                cleaned.append(v[0])
            else:
                #Originally, we were just trying to get the LabeledID. But sometimes we get more than one,
                # so len(v) can be more than two.
                wrote = False
                for vi in v:
                    if isinstance(vi, LabeledID):
                        cleaned.append(vi)
                        wrote = True
                        break
                if not wrote:
                    # NOTE(review): killing the process here is drastic;
                    # consider raising instead. Behavior kept as-is.
                    print(input_identifiers)
                    exit()
        return cleaned

    def load_extra_labels(self, prefix):
        """Load the tab-separated curie->label file for a prefix into
        self.extra_labels[prefix] (empty dict when the file is absent)."""
        labelfname = os.path.join(self.label_dir, prefix, 'labels')
        lbs = {}
        if os.path.exists(labelfname):
            with open(labelfname, 'r') as inf:
                for line in inf:
                    x = line.strip().split('\t')
                    lbs[x[0]] = x[1]
        self.extra_labels[prefix] = lbs

    def apply_labels(self, input_identifiers, labels):
        """Turn plain curies into LabeledIDs using `labels` first, then the
        per-prefix label files; curies with no label pass through as-is."""
        #Originally we needed to clean up the identifer lists, because there would be both labeledids and
        # string ids and we had to reconcile them.
        # But now, we only allow regular ids in the list, and now we need to turn some of them into
        # labeled ids for output
        labeled_list = []
        for iid in input_identifiers:
            if isinstance(iid, LabeledID):
                print('LabeledID dont belong here, pass in labels seperately', iid)
                exit()
            if iid in labels:
                labeled_list.append(
                    LabeledID(identifier=iid, label=labels[iid]))
            else:
                prefix = Text.get_prefix(iid)
                if prefix not in self.extra_labels:
                    self.load_extra_labels(prefix)
                if iid in self.extra_labels[prefix]:
                    labeled_list.append(
                        LabeledID(identifier=iid,
                                  label=self.extra_labels[prefix][iid]))
                else:
                    labeled_list.append(iid)
        return labeled_list

    def create_node(self, input_identifiers, node_type, labels=None):
        """Normalize a set of equivalent identifiers into a node dict:
        pick the best id per the Biolink prefix order, keep all equivalent
        identifiers, and attach the ancestor type list.

        Returns None when there are no identifiers or none with an
        acceptable prefix.
        """
        # Bug fix: `labels={}` was a shared mutable default argument.
        if labels is None:
            labels = {}
        #This is where we will normalize, i.e. choose the best id, and add types in accord with BL.
        #we should also include provenance and version information for the node set build.
        ancestors = self.get_ancestors(node_type)
        #ancestors.reverse()
        prefixes = self.get_prefixes(node_type)
        if len(input_identifiers) == 0:
            return None
        if len(input_identifiers) > 1000:
            print('this seems like a lot')
            print(len(input_identifiers))
        cleaned = self.apply_labels(input_identifiers, labels)
        try:
            # Group identifiers by their (upper-cased) curie prefix.
            idmap = defaultdict(list)
            for i in list(cleaned):
                idmap[Text.get_curie(i).upper()].append(i)
        except AttributeError:
            print('something very bad')
            print(input_identifiers)
            print(len(input_identifiers))
            for i in list(input_identifiers):
                print(i)
                print(type(i))
                print(Text.get_curie(i))
                print(Text.get_curie(i).upper())
            exit()
        identifiers = []
        accepted_ids = set()
        #Converting identifiers from LabeledID to dicts
        #In order to be consistent from run to run, we need to worry about the case where e.g.
        # there are 2 UMLS id's and UMLS is the preferred prefix. We're going to choose the
        # canonical ID here just by sorting.
        for p in prefixes:
            pupper = p.upper()
            if pupper in idmap:
                newids = []
                for v in idmap[pupper]:
                    newid = Text.recurie(v, p)
                    jid = self.make_json_id(newid)
                    newids.append((jid['identifier'], jid))
                    accepted_ids.add(v)
                newids.sort()
                identifiers += [nid[1] for nid in newids]
        #Warn if we have prefixes that we're ignoring
        for k, vals in idmap.items():
            for v in vals:
                if v not in accepted_ids and (
                        k, node_type) not in self.ignored_prefixes:
                    print(
                        f'Ignoring prefix {k} for type {node_type}, identifier {v}'
                    )
                    self.ignored_prefixes.add((k, node_type))
        if len(identifiers) == 0:
            return None
        best_id = identifiers[0]['identifier']
        # identifiers is in preferred order, so choose the first non-empty label to be the node label
        labels = list(
            filter(lambda x: len(x) > 0,
                   [l['label'] for l in identifiers if 'label' in l]))
        label = None
        if len(labels) > 0:
            label = labels[0]
        node = {
            'id': {
                'identifier': best_id,
            },
            'equivalent_identifiers': identifiers,
            'type': ancestors
        }
        if label is not None:
            node['id']['label'] = label
        return node
class _GraphInterface:
    """Facade over a Neo4j HTTP driver plus the Biolink Model Toolkit.

    Lazily computes and caches the graph schema (source label -> target
    label -> predicates), a node/edge count summary, and a TRAPI-style
    meta knowledge graph.
    """

    def __init__(self, host, port, auth):
        self.driver = Neo4jHTTPDriver(host=host, port=port, auth=auth)
        # Lazily-populated caches (see get_schema / get_meta_kg).
        self.schema = None
        self.summary = None
        self.meta_kg = None
        self.bl_version = config.get('BL_VERSION', '1.5.0')
        self.bl_url = f'https://raw.githubusercontent.com/biolink/biolink-model/{self.bl_version}/biolink-model.yaml'
        self.toolkit = Toolkit(self.bl_url)

    def find_biolink_leaves(self, biolink_concepts: list):
        """
        Given a list of biolink concepts, returns the leaves removing any parent concepts.
        :param biolink_concepts: list of biolink concepts
        :return: leaf concepts.
        """
        ancestry_set = set()
        all_mixins_in_tree = set()
        all_concepts = set(biolink_concepts)
        # Keep track of things like "MacromolecularMachine" in current datasets.
        unknown_elements = set()
        for x in all_concepts:
            current_element = self.toolkit.get_element(x)
            mixins = set()
            if current_element:
                # Mixins are outside the is_a ancestry, so collect them
                # separately; they are subtracted from the leaves below.
                if 'mixins' in current_element and len(
                        current_element['mixins']):
                    for m in current_element['mixins']:
                        mixins.add(self.toolkit.get_element(m).class_uri)
            else:
                unknown_elements.add(x)
            ancestors = set(
                self.toolkit.get_ancestors(x, reflexive=False, formatted=True))
            ancestry_set = ancestry_set.union(ancestors)
            all_mixins_in_tree = all_mixins_in_tree.union(mixins)
        leaf_set = all_concepts - ancestry_set - all_mixins_in_tree - unknown_elements
        return leaf_set

    def invert_predicate(self, biolink_predicate):
        """Given a biolink predicate, find its inverse.

        Returns the predicate itself when it is symmetric, the inverse's
        slot URI when one is declared, and None otherwise (including when
        the predicate is unknown to the model).
        """
        element = self.toolkit.get_element(biolink_predicate)
        if element is None:
            return None
        # If its symmetric
        if 'symmetric' in element and element.symmetric:
            return biolink_predicate
        # if neither symmetric nor an inverse is found
        if 'inverse' not in element or not element['inverse']:
            return None
        # if an inverse is found
        return self.toolkit.get_element(element['inverse']).slot_uri

    def get_schema(self):
        """
        Gets the schema of the graph; also generates the graph summary on
        first call.
        :return: Dict of structure source label as outer most keys, target labels as inner keys and list of predicates as value.
        :rtype: dict
        """
        self.schema_raw_result = {}
        if self.schema is None:
            query = """
            MATCH (a)-[x]->(b)
            WHERE not a:Concept and not b:Concept
            RETURN DISTINCT labels(a) as source_labels, type(x) as predicate, labels(b) as target_labels
            """
            logger.info(
                f"starting query {query} on graph... this might take a few"
            )
            result = self.driver.run_sync(query)
            logger.info(f"completed query, preparing initial schema")
            structured = self.convert_to_dict(result)
            self.schema_raw_result = structured
            schema_bag = {}
            # permute source labels and target labels array
            # replacement for unwind for previous cypher
            structured_expanded = []
            for triplet in structured:
                # Since there are some nodes in data currently just one label ['biolink:NamedThing']
                # This filter is to avoid that scenario.
                # @TODO need to remove this filter when data build
                # avoids adding nodes with single ['biolink:NamedThing'] labels.
                filter_named_thing = lambda x: list(
                    filter(lambda y: y != 'biolink:NamedThing', x))
                # Reduce each label array to its Biolink leaf concepts before
                # expanding the cartesian product of source x target labels.
                source_labels, predicate, target_labels = \
                    self.find_biolink_leaves(filter_named_thing(triplet['source_labels'])), triplet['predicate'], \
                    self.find_biolink_leaves(filter_named_thing(triplet['target_labels']))
                for source_label in source_labels:
                    for target_label in target_labels:
                        structured_expanded.append({
                            'source_label': source_label,
                            'target_label': target_label,
                            'predicate': predicate
                        })
            structured = structured_expanded
            for triplet in structured:
                subject = triplet['source_label']
                predicate = triplet['predicate']
                objct = triplet['target_label']
                if subject not in schema_bag:
                    schema_bag[subject] = {}
                if objct not in schema_bag[subject]:
                    schema_bag[subject][objct] = []
                if predicate not in schema_bag[subject][objct]:
                    schema_bag[subject][objct].append(predicate)
                # If we invert the order of the nodes we also have to invert the predicate
                inverse_predicate = self.invert_predicate(predicate)
                if inverse_predicate is not None and \
                        inverse_predicate not in schema_bag.get(objct, {}).get(subject, []):
                    # create the list if empty
                    if objct not in schema_bag:
                        schema_bag[objct] = {}
                    if subject not in schema_bag[objct]:
                        schema_bag[objct][subject] = []
                    schema_bag[objct][subject].append(inverse_predicate)
            self.schema = schema_bag
            logger.info("schema done.")
            if not self.summary:
                query = """
                MATCH (c) RETURN DISTINCT labels(c) as types, count(c) as count
                """
                logger.info(f'generating graph summary: {query}')
                raw = self.convert_to_dict(self.driver.run_sync(query))
                summary = {}
                for node in raw:
                    labels = node['types']
                    count = node['count']
                    # One follow-up query per distinct label combination to
                    # count outgoing edges by type and target labels.
                    query = f"""
                    MATCH (:{':'.join(labels)})-[e]->(b)
                    WITH DISTINCT e , b
                    RETURN type(e) as edge_types, count(e) as edge_counts, labels(b) as target_labels
                    """
                    raw = self.convert_to_dict(self.driver.run_sync(query))
                    summary_key = ':'.join(labels)
                    summary[summary_key] = {'nodes_count': count}
                    for row in raw:
                        target_key = ':'.join(row['target_labels'])
                        edge_name = row['edge_types']
                        edge_count = row['edge_counts']
                        summary[summary_key][target_key] = summary[
                            summary_key].get(target_key, {})
                        summary[summary_key][target_key][
                            edge_name] = edge_count
                self.summary = summary
                logger.info(
                    f'generated summary for {len(summary)} node types.')
        return self.schema

    async def get_mini_schema(self, source_id, target_id):
        """
        Given either id of source and/or target returns predicates that relate them. And their possible labels.
        :param source_id:
        :param target_id:
        :return:
        """
        # NOTE(review): ids are interpolated directly into Cypher -- assumes
        # trusted callers; confirm inputs cannot carry quotes/injection.
        source_id_syntaxed = f"{{id: \"{source_id}\"}}" if source_id else ''
        target_id_syntaxed = f"{{id: \"{target_id}\"}}" if target_id else ''
        query = f"""
        MATCH (a{source_id_syntaxed})-[x]->(b{target_id_syntaxed})
        WITH [la in labels(a) where la <> 'Concept'] as source_label,
             [lb in labels(b) where lb <> 'Concept'] as target_label,
             type(x) as predicate
        RETURN DISTINCT source_label, predicate, target_label
        """
        response = await self.driver.run(query)
        response = self.convert_to_dict(response)
        return response

    async def get_node(self, node_type: str, curie: str) -> list:
        """
        Returns a node that matches curie as its ID.
        :param node_type: Type of the node.
        :type node_type:str
        :param curie: Curie.
        :type curie: str
        :return: value of the node in neo4j.
        :rtype: list
        """
        # NOTE(review): curie/node_type are string-interpolated into Cypher;
        # confirm they come from trusted sources.
        query = f"MATCH (c:`{node_type}`{{id: '{curie}'}}) return c"
        response = await self.driver.run(query)
        data = response.get('results', [{}])[0].get('data', [])
        '''
        data looks like
        [
        {'row': [{...node data..}], 'meta': [{...}]},
        {'row': [{...node data..}], 'meta': [{...}]},
        {'row': [{...node data..}], 'meta': [{...}]}
        ]
        '''
        rows = []
        if len(data):
            # Flatten all 'row' lists into a single list of node values.
            from functools import reduce
            rows = reduce(lambda x, y: x + y.get('row', []), data, [])
        return rows

    async def get_single_hops(self, source_type: str, target_type: str,
                              curie: str) -> list:
        """
        Returns a triplets of source to target where source id is curie.
        :param source_type: Type of the source node.
        :type source_type: str
        :param target_type: Type of target node.
        :type target_type: str
        :param curie: Curie of source node.
        :type curie: str
        :return: list of triplets where each item contains source node, edge, target.
        :rtype: list
        """
        # Outgoing hops from the curie node...
        query = f'MATCH (c:`{source_type}`{{id: \'{curie}\'}})-[e]->(b:`{target_type}`) return distinct c , e, b'
        response = await self.driver.run(query)
        rows = list(
            map(lambda data: data['row'], response['results'][0]['data']))
        # ...plus incoming hops, appended to the same result list.
        query = f'MATCH (c:`{source_type}`{{id: \'{curie}\'}})<-[e]-(b:`{target_type}`) return distinct b , e, c'
        response = await self.driver.run(query)
        rows += list(
            map(lambda data: data['row'], response['results'][0]['data']))
        return rows

    async def run_cypher(self, cypher: str, **kwargs) -> list:
        """
        Runs cypher directly.
        :param cypher: cypher query.
        :type cypher: str
        :return: unprocessed neo4j response.
        :rtype: list
        """
        return await self.driver.run(cypher, **kwargs)

    async def get_sample(self, node_type):
        """
        Returns a few nodes.
        :param node_type: Type of nodes.
        :type node_type: str
        :return: Node dict values.
        :rtype: dict
        """
        query = f"MATCH (c:{node_type}) return c limit 5"
        response = await self.driver.run(query)
        rows = response['results'][0]['data'][0]['row']
        return rows

    async def get_examples(self, source, target=None):
        """
        Returns an example for source node only if target is not specified, if target is specified a sample one hop is returned.
        :param source: Node type of the source node.
        :type source: str
        :param target: Node type of the target node.
        :type target: str
        :return: A single source node value if target is not provided. If target is provided too, a triplet.
        :rtype:
        """
        if target:
            query = f"MATCH (source:{source})-[edge]->(target:{target}) return source, edge, target limit 1"
            response = await self.run_cypher(query)
            final = list(
                map(lambda data: data['row'],
                    response['results'][0]['data']))
            return final
        else:
            # The label is reused as the variable name in the query, so the
            # result dict is keyed by `source`.
            query = f"MATCH ({source}:{source}) return {source} limit 1"
            response = await self.run_cypher(query)
            final = list(
                map(lambda node: node[source],
                    self.driver.convert_to_dict(response)))
            return final

    def get_curie_prefix_by_node_type(self, node_type):
        """Collect the curie prefixes present in the graph for a node type,
        ordered by the Biolink model's preferred id_prefixes first."""
        query = f"""
        MATCH (n:`{node_type}`) return collect(n.id) as ids
        """
        logger.info(
            f"starting query {query} on graph... this might take a few")
        result = self.driver.run_sync(query)
        logger.info(f"completed query, collecting node curie prefixes")
        result = self.convert_to_dict(result)
        curie_prefixes = set()
        for i in result[0]['ids']:
            curie_prefixes.add(i.split(':')[0])
        # sort according to bl model
        node_bl_def = self.toolkit.get_element(node_type)
        id_prefixes = node_bl_def.id_prefixes
        sorted_curie_prefixes = [
            i for i in id_prefixes if i in curie_prefixes
        ]  # gives presidence to what's in BL
        # add other ids even if not in BL next
        sorted_curie_prefixes += [
            i for i in curie_prefixes if i not in sorted_curie_prefixes
        ]
        return sorted_curie_prefixes

    async def get_meta_kg(self):
        """Build (and cache) a TRAPI-style meta knowledge graph from the
        schema: node types with their id prefixes, and subject/predicate/
        object triples for edges."""
        if self.meta_kg:
            return self.meta_kg
        schema = self.get_schema()
        nodes = {}
        predicates = []
        for subject in schema:
            for object in schema[subject]:
                for edge_type in schema[subject][object]:
                    predicates.append({
                        'subject': subject,
                        'object': object,
                        'predicate': edge_type
                    })
                    if object not in nodes:
                        nodes[object] = {
                            'id_prefixes':
                            list(self.get_curie_prefix_by_node_type(object))
                        }
                    if subject not in nodes:
                        nodes[subject] = {
                            'id_prefixes':
                            list(self.get_curie_prefix_by_node_type(subject))
                        }
        self.meta_kg = {'nodes': nodes, 'edges': predicates}
        return self.meta_kg

    def supports_apoc(self):
        """
        Returns true if apoc is supported by backend database.
        :return: bool true if neo4j supports apoc.
        """
        return self.driver.check_apoc_support()

    async def run_apoc_cover(self, ids: list):
        """
        Runs apoc.algo.cover on list of ids
        :param ids:
        :return: dictionary of edges and source and target nodes ids
        """
        query = f"""
        MATCH (node:`biolink:NamedThing`)
        USING INDEX node:`biolink:NamedThing`(id)
        WHERE node.id in {ids}
        WITH collect(node) as nodes
        CALL apoc.algo.cover(nodes) yield rel
        WITH {{subject: startNode(rel).id , object: endNode(rel).id, predicate: type(rel), edge: rel }} as row
        return collect(row) as result
        """
        result = self.convert_to_dict(self.driver.run_sync(query))
        return result

    def convert_to_dict(self, result):
        # Thin delegation to the driver's response-to-dict conversion.
        return self.driver.convert_to_dict(result)
class WrappedBMT:
    """
    Wrapping around some of the BMT Toolkit functions
    to provide case conversions to the new format
    """

    def __init__(self):
        self.bmt = BMToolkit()
        self.all_slots = self.bmt.get_all_slots()
        self.all_slots_formatted = [
            "biolink:" + s.replace(" ", "_") for s in self.all_slots
        ]
        self.prefix = "biolink:"
        # PascalCase class name -> its id_prefixes, skipping classes with no
        # element or no declared prefixes.
        self.entity_prefix_mapping = {
            bmt.util.format(el_name, case="pascal"): id_prefixes
            for el_name in self.bmt.get_all_classes()
            if (el := self.bmt.get_element(el_name)) is not None
            if (id_prefixes := getattr(el, "id_prefixes", []))
        }

    def new_case_to_old_case(self, s):
        """
        Convert new biolink case format (biolink:GeneOrGeneProduct)
        to old case format (gene or gene product).

        Also works with slots (biolink:related_to -> related to).
        """
        # Bug fix: the slot check must happen BEFORE the prefix is stripped.
        # Entries in all_slots_formatted carry the "biolink:" prefix, so the
        # original order (strip, then test membership) could never match and
        # every slot fell through to camel_to_snake().
        if s in self.all_slots_formatted:
            return s.replace(self.prefix, "").replace("_", " ")
        return camel_to_snake(s.replace(self.prefix, ""))

    def old_case_to_new_case(self, s):
        """
        Convert old case format (gene or gene product)
        to new biolink case format (biolink:GeneOrGeneProduct).

        Also works with slots (related to -> biolink:related_to).
        """
        if s in self.all_slots:
            return self.prefix + s.replace(" ", "_")
        else:
            return self.prefix + snake_to_camel(s)

    def get_descendants(self, concept):
        """Wrapped BMT descendants function; returns [concept] itself when
        no descendants are found."""
        descendants = self.bmt.get_descendants(concept, formatted=True)
        if len(descendants) == 0:
            descendants.append(concept)
        return descendants

    def get_ancestors(self, concept, reflexive=True):
        """Wrapped BMT ancestors function that does case conversions."""
        concept_old_format = self.new_case_to_old_case(concept)
        ancestors_old_format = self.bmt.get_ancestors(concept_old_format,
                                                      reflexive=reflexive)
        ancestors = [
            self.old_case_to_new_case(a) for a in ancestors_old_format
        ]
        return ancestors

    def predicate_is_symmetric(self, predicate):
        """Return True if the given predicate is symmetric, False otherwise
        (including when the predicate is not in the Biolink model)."""
        predicate_old_format = self.new_case_to_old_case(predicate)
        predicate_element = self.bmt.get_element(predicate_old_format)
        if not predicate_element:
            # Not in the biolink model
            return False
        # bool() so callers always get a boolean, not None, when the model
        # leaves `symmetric` unset.
        return bool(predicate_element.symmetric)

    def predicate_inverse(self, predicate):
        """Get the inverse of a predicate if it has one."""
        predicate_old_format = self.new_case_to_old_case(predicate)
        predicate_element = self.bmt.get_element(predicate_old_format)
        if not predicate_element:
            # Not in the biolink model
            return None
        if predicate_element.symmetric:
            # A symmetric predicate is its own inverse.
            return predicate
        predicate_inverse_old_format = predicate_element.inverse
        if not predicate_inverse_old_format:
            # No inverse
            return None
        return self.old_case_to_new_case(predicate_inverse_old_format)