def test_check_all_categories2():
    """
    Test check_all_categories method.

    Every case pairs a list of categories with the expected lengths of the
    three lists returned by ``check_all_categories``, in return order.
    """
    # explicitly pin the release of Biolink to a
    # specific one with known category ancestry
    get_toolkit(biolink_release="2.2.11")

    cases = [
        # mixins are not valid biolink categories, but they are ancestors.
        (get_biolink_ancestors("biolink:Gene"), (4, 0, 8)),
        (["biolink:NamedThing", "biolink:GeneOrGeneProduct", "biolink:Gene"], (2, 0, 1)),
        (["biolink:NamedThing", "biolink:GeneOrGeneProduct", "Node"], (1, 0, 2)),
    ]

    for categories, (expected_vbc, expected_ibc, expected_ic) in cases:
        vbc, ibc, ic = check_all_categories(categories)
        assert len(vbc) == expected_vbc
        assert len(ibc) == expected_ibc
        assert len(ic) == expected_ic
def clean_categories(self, threashold=100):
    """
    Removes categories and edges labels that are not from the biolink model.

    Adds alt_edge_label and alt_category property to hold these invalid edge
    labels and categories, so that the information is not lost.

    Parameters
    ----------
    threashold: int
        Not used by this method. The (misspelled) name is kept so that
        existing keyword callers continue to work.
    """
    # Fetch the toolkit once instead of once per node/edge.
    toolkit = get_toolkit()

    with click.progressbar(self.graph.nodes(data='category'), label='cleaning up category for nodes') as bar:
        for n, category in bar:
            if isinstance(category, list):
                # category is a list
                invalid = [c for c in category if not toolkit.is_category(c)]
                if invalid:
                    node_data = self.graph.nodes[n]
                    # Keep the rejected values; previously they were written to
                    # 'category' and immediately overwritten, so they were lost.
                    node_data['alt_category'] = invalid
                    node_data['category'] = 'named thing'
            else:
                # category is string
                # TODO: This behavior needs to be consolidated, post merge
                if not toolkit.is_category(category):
                    node_data = self.graph.nodes[n]
                    node_data['category'] = 'named thing'
                    node_data['alt_category'] = category

    # Iterate with data=True so the edge's own attribute dict is updated.
    # The earlier code wrote to the last node visited by the loop above.
    with click.progressbar(self.graph.edges(data=True), label='cleaning up edge_label for edges') as bar:
        for _s, _o, edge_data in bar:
            edgelabel = edge_data.get('edge_label')
            if not toolkit.is_edgelabel(edgelabel):
                edge_data['edge_label'] = 'related_to'
                edge_data['alt_edge_label'] = edgelabel
def __init__(
    self,
    owner,
    filename: str,
    format: str = "nt",
    compression: Optional[str] = None,
    reify_all_edges: bool = False,
    **kwargs: Any,
):
    """
    Set up namespaces and lookup tables, then open ``filename`` for writing.

    Parameters
    ----------
    owner
        Passed through to the parent class initializer
    filename: str
        Path of the output file; opened in binary write mode
    format: str
        Serialization format; only ``"nt"`` is accepted
    compression: Optional[str]
        When equal to ``"gz"`` the file is opened through ``gzip``;
        any other value opens a plain file
    reify_all_edges: bool
        Stored on the instance as ``reify_all_edges``
    kwargs: Any
        Accepted but not used by this initializer

    Raises
    ------
    ValueError
        If ``format`` is anything other than ``"nt"``
    """
    super().__init__(owner)
    if format not in {"nt"}:
        raise ValueError("Only RDF N-Triples ('nt') serialization supported.")

    prefix_map = self.prefix_manager.prefix_map
    self.DEFAULT = Namespace(prefix_map[""])
    # self.OBO = Namespace('http://purl.obolibrary.org/obo/')
    self.OBAN = Namespace(prefix_map["OBAN"])
    self.PMID = Namespace(prefix_map["PMID"])
    self.BIOLINK = Namespace(prefix_map["biolink"])

    self.toolkit = get_toolkit()
    self.reverse_predicate_mapping = {}
    self.property_types = get_biolink_property_types()
    self.cache = {}
    self.reify_all_edges = reify_all_edges
    self.reification_types = {
        RDF.Statement,
        self.BIOLINK.Association,
        self.OBAN.association,
    }

    # The handle is kept open on the instance; it is not closed here.
    if compression == "gz":
        f = gzip.open(filename, "wb")
    else:
        f = open(filename, "wb")
    self.FH = f
    self.encoding = "ascii"
def get_toolkit(cls) -> Toolkit:
    """
    Return the toolkit currently active for this class.

    When ``cls._currently_active_toolkit`` is falsy, it is first populated
    from the module-level ``get_toolkit()`` and then returned.
    """
    active = cls._currently_active_toolkit
    if not active:
        active = get_toolkit()
        cls._currently_active_toolkit = active
    return active
def __init__(self, prefix_prioritization_map: dict = None):
    """
    Initialise the toolkit handle and an empty clique graph.

    Parameters
    ----------
    prefix_prioritization_map: dict
        Optional entries that are copied, key by key, into the module-level
        ``PREFIX_PRIORITIZATION_MAP`` (existing keys are overwritten)
    """
    self.toolkit = get_toolkit()
    self.clique_graph = nx.Graph()
    self.target_graph = None

    if not prefix_prioritization_map:
        return
    for prefix_key in prefix_prioritization_map:
        PREFIX_PRIORITIZATION_MAP[prefix_key] = prefix_prioritization_map[prefix_key]
def get_biolink_element(self, predicate: Any) -> Optional[Element]:
    """
    Look up the Biolink Model element for a predicate.

    The predicate is contracted to a CURIE when it is an IRI, reduced to its
    reference when it is a CURIE, and looked up in the toolkit. If that finds
    nothing, a lookup by mapping on the original ``predicate`` is attempted.

    Parameters
    ----------
    predicate: Any
        The CURIE of a predicate

    Returns
    -------
    Optional[Element]
        The corresponding Biolink Model element

    """
    toolkit = get_toolkit()
    pm = self.prefix_manager

    curie = pm.contract(predicate) if pm.is_iri(predicate) else predicate
    reference = pm.get_reference(curie) if pm.is_curie(curie) else curie

    found = toolkit.get_element(reference)
    if found:
        return found

    try:
        mapping = toolkit.get_element_by_mapping(predicate)
        if mapping:
            found = toolkit.get_element(mapping)
    except ValueError as e:
        # A failed mapping lookup is logged; the falsy result is returned.
        log.error(e)
    return found
def __init__(self, source_graph: nx.MultiDiGraph = None, node_properties: Set = None, edge_properties: Set = None):
    """
    Initialise property sets, counters and caches.

    Parameters
    ----------
    source_graph: nx.MultiDiGraph
        Passed through to the parent class initializer
    node_properties: Set
        Extra properties to treat as node properties; a fixed set of
        defaults is always added
    edge_properties: Set
        Extra properties to treat as edge properties; a fixed set of
        defaults is always added
    """
    super().__init__(source_graph)
    self.toolkit = get_toolkit()

    # Copy the inputs: the defaults added below must not leak into the
    # caller's own set objects.
    self.node_properties = set(node_properties) if node_properties else set()
    self.edge_properties = set(edge_properties) if edge_properties else set()

    self.node_properties.update([
        'biolink:same_as',
        'OBAN:association_has_object',
        'OBAN:association_has_subject',
        'OBAN:association_has_predicate',
    ])
    self.edge_properties.update([
        'biolink:has_modifier',
        'biolink:has_gene_product',
        'biolink:has_db_xref',
        'biolink:in_taxon',
    ])
    self.edge_properties.update([
        'biolink:subclass_of',
        'biolink:same_as',
        'biolink:part_of',
        'biolink:has_part',
    ])

    self.assocs = set()
    self.count = 0
    self.start = 0
    self.cache = {}
def test_get_toolkit():
    """
    Check that get_toolkit returns a Toolkit instance whose model version
    equals the version reported by a default-constructed Toolkit.
    """
    tk = get_toolkit()
    assert isinstance(tk, Toolkit)

    reported_version = tk.get_model_version()
    default_version = Toolkit().get_model_version()
    assert reported_version == default_version
def get_default_model_version(cls):
    """
    Return the default Biolink Model version.

    The value is read from ``cls._default_model_version``; when that is
    falsy it is first filled in from ``get_toolkit().get_model_version()``.
    """
    version = cls._default_model_version
    if not version:
        # get default Biolink version from BMT
        version = get_toolkit().get_model_version()
        cls._default_model_version = version
    return version
def validate_edge_predicate(subject: str, object: str, data: dict) -> list:
    """
    Validate the ``predicate`` entry of an edge's properties.

    Parameters
    ----------
    subject: str
        Subject identifier
    object: str
        Object identifier
    data: dict
        Edge properties

    Returns
    -------
    list
        A list of errors for a given edge

    """
    toolkit = get_toolkit()
    error_type = ErrorType.INVALID_EDGE_PREDICATE
    errors = []

    def report(message):
        # Every error for this edge shares the same id, type and level.
        errors.append(
            ValidationError(f"{subject}-{object}", error_type, message, MessageLevel.ERROR)
        )

    edge_predicate = data.get('predicate')
    if edge_predicate is None:
        report("Edge does not have an 'predicate' property")
        return errors
    if not isinstance(edge_predicate, str):
        report("Edge property 'edge_predicate' expected to be of type 'string'")
        return errors

    # Work on the reference part when the predicate is given as a CURIE.
    if PrefixManager.is_curie(edge_predicate):
        edge_predicate = PrefixManager.get_reference(edge_predicate)

    if not re.match(r"^([a-z_][^A-Z\s]+_?[a-z_][^A-Z\s]+)+$", edge_predicate):
        report(f"Edge label '{edge_predicate}' is not in snake_case form")
        return errors

    p = toolkit.get_element(snakecase_to_sentencecase(edge_predicate))
    if p is None:
        report(f"Edge label '{edge_predicate}' not in Biolink Model")
    elif edge_predicate != p.name and edge_predicate in p.aliases:
        report(
            f"Edge label '{edge_predicate}' is actually an alias for {p.name}; Should replace {edge_predicate} with {p.name}"
        )
    return errors
def __init__(self):
    """
    Initialise the toolkit, prefix manager and error list, then fetch the
    JSON-LD context from ``CONTEXT_JSONLD``.

    Raises
    ------
    Exception
        If the context cannot be downloaded or parsed as JSON; the
        underlying error is attached as the cause
    """
    self.toolkit = get_toolkit()
    self.prefix_manager = PrefixManager()
    self.errors = []
    try:
        self.jsonld = requests.get(CONTEXT_JSONLD).json()
    except Exception as e:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate; `from e` preserves the original failure.
        raise Exception('Unable to download jsonld file from {}'.format(CONTEXT_JSONLD)) from e
def __init__(self, verbose: bool = False):
    """
    Initialise the validator.

    Parameters
    ----------
    verbose: bool
        Stored on the instance as ``verbose``
    """
    self.toolkit = get_toolkit()
    self.prefix_manager = PrefixManager()

    context = get_jsonld_context()
    self.jsonld = context
    self.prefixes = Validator.get_all_prefixes(context)

    self.required_node_properties = Validator.get_required_node_properties()
    self.required_edge_properties = Validator.get_required_edge_properties()
    self.verbose = verbose
def validate_categories(node: str, data: dict) -> list:
    """
    Validate the ``category`` entry of a node's properties.

    Parameters
    ----------
    node: str
        Node identifier
    data: dict
        Node properties

    Returns
    -------
    list
        A list of errors for a given node

    """
    toolkit = get_toolkit()
    error_type = ErrorType.INVALID_CATEGORY
    errors = []

    def report(message):
        # Every error for this node shares the same id, type and level.
        errors.append(ValidationError(node, error_type, message, MessageLevel.ERROR))

    categories = data.get('category')
    if categories is None:
        report("Node does not have a 'category' property")
        return errors
    if not isinstance(categories, list):
        report(f"Node property 'category' expected to be of type {list}")
        return errors

    for category in categories:
        if PrefixManager.is_curie(category):
            category = PrefixManager.get_reference(category)

        if not re.match(r"^([A-Z][a-z\d]+)+$", category):
            # category is not CamelCase; the model lookup below still runs
            report(f"Category '{category}' is not in CamelCase form")

        formatted_category = camelcase_to_sentencecase(category)
        if not toolkit.is_category(formatted_category):
            report(f"Category '{category}' not in Biolink Model")
            continue

        c = toolkit.get_element(formatted_category.lower())
        if category != c.name and category in c.aliases:
            report(
                f"Category {category} is actually an alias for {c.name}; Should replace '{category}' with '{c.name}'"
            )
    return errors
def check_categories(
    categories: List, closure: List, category_mapping: Optional[Dict[str, str]] = None
) -> Tuple[List, List, List]:
    """
    Sort each entry of ``categories`` into one of three result lists.

    Valid biolink categories are classes that descend from 'NamedThing'.
    Mixins, while valid ancestors, are not valid categories.

    Parameters
    ----------
    categories: List
        A list of categories to check
    closure: List
        A list of nodes in a clique
    category_mapping: Optional[Dict[str, str]]
        A map that provides mapping from a non-biolink category to a biolink category

    Returns
    -------
    Tuple[List, List, List]
        A tuple consisting of valid biolink categories, invalid biolink categories, and invalid categories

    """
    valid = []
    invalid_biolink = []
    invalid = []
    tk = get_toolkit()

    for x in categories:
        # use the toolkit to check if the declared category is actually a mixin.
        if tk.is_mixin(x):
            invalid.append(x)
            continue

        # get biolink element corresponding to category
        element = get_biolink_element(x)
        if not element:
            log.warning(f"category '{x}' is not in Biolink Model")
            invalid.append(x)
            continue

        mapped_category = format_biolink_category(element["name"])
        if mapped_category in closure:
            valid.append(x)
            continue

        log.warning(f"category '{mapped_category}' not in closure: {closure}")
        if not category_mapping:
            invalid_biolink.append(x)
            continue

        # With a mapping supplied, the category is only recorded as invalid
        # when its mapped value is also missing from the closure.
        mapped = category_mapping[x] if x in category_mapping else x
        if mapped not in closure:
            log.warning(
                f"category '{mapped_category}' is not in category_mapping."
            )
            invalid_biolink.append(x)

    return valid, invalid_biolink, invalid
def __init__(self, verbose: bool = False):
    """
    Initialise the validator and fetch the JSON-LD context from
    ``CONTEXT_JSONLD``.

    Parameters
    ----------
    verbose: bool
        Stored on the instance as ``verbose``

    Raises
    ------
    Exception
        If the context cannot be downloaded or parsed as JSON; the
        underlying error is attached as the cause
    """
    self.toolkit = get_toolkit()
    self.prefix_manager = PrefixManager()
    self.prefixes = None
    self.required_node_properties = None
    self.required_edge_properties = None
    self.verbose = verbose
    try:
        self.jsonld = requests.get(CONTEXT_JSONLD).json()
    except Exception as e:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate; `from e` preserves the original failure.
        raise Exception('Unable to download JSON-LD context from {}'.format(CONTEXT_JSONLD)) from e
def get_category_via_superclass(graph: BaseGraph, curie: str, load_ontology: bool = True) -> Set[str]:
    """
    Get category for a given CURIE by tracing its superclass, via ``subclass_of`` hierarchy,
    and getting the most appropriate category based on the superclass.

    Ancestors are visited in order. At the first ancestor that the toolkit
    maps to the Biolink Model, the names of the ancestors visited so far and
    the toolkit ancestors of the mapping are collected, and the walk stops.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        Graph to traverse
    curie: str
        Input CURIE
    load_ontology: bool
        Determines whether to load ontology, based on CURIE prefix, or to simply
        rely on ``subclass_of`` hierarchy from graph

    Returns
    -------
    Set[str]
        A set containing one (or more) category for the given CURIE

    """
    log.debug("curie: {}".format(curie))
    collected = []
    toolkit = get_toolkit()

    if not PrefixManager.is_curie(curie):
        return set(collected)

    ancestors = get_ancestors(graph, curie, relations=['subclass_of'])
    if len(ancestors) == 0 and load_ontology:
        # Nothing in this graph: retry once against the lookup service's
        # ontology graph, with further ontology loading disabled.
        lookup_service = get_curie_lookup_service()
        collected.extend(
            get_category_via_superclass(lookup_service.ontology_graph, curie, False)
        )
    log.debug("Ancestors for CURIE {} via subClassOf: {}".format(curie, ancestors))

    visited = []
    for anc in ancestors:
        mapping = toolkit.get_by_mapping(anc)
        visited.append(anc)
        if not mapping:
            continue
        # there is direct mapping to BioLink Model
        log.debug("Ancestor {} mapped to {}".format(anc, mapping))
        node_lookup = graph.nodes()
        collected.extend(
            node_lookup[x]['name'] for x in visited if 'name' in node_lookup[x]
        )
        collected.extend(toolkit.ancestors(mapping))
        break

    return set(collected)
def __init__(self, owner):
    """
    Set up namespaces, predicate sets, counters and caches.

    Parameters
    ----------
    owner
        Passed through to the parent class initializer
    """
    super().__init__(owner)

    prefix_map = self.prefix_manager.prefix_map
    self.DEFAULT = Namespace(prefix_map[""])
    # TODO: use OBO IRI from biolink model context once
    # https://github.com/biolink/biolink-model/issues/211 is resolved
    # self.OBO = Namespace('http://purl.obolibrary.org/obo/')
    self.OBAN = Namespace(prefix_map["OBAN"])
    self.PMID = Namespace(prefix_map["PMID"])
    self.BIOLINK = Namespace(prefix_map["biolink"])

    self.predicate_mapping = {}
    self.cache: Dict = {}
    self.toolkit = get_toolkit()

    # Node property predicates are held as expanded URIRefs and as the
    # toolkit's formatted names; formatted edge property names are added too.
    expand = self.prefix_manager.expand
    self.node_property_predicates = {
        URIRef(expand(x))
        for x in self.toolkit.get_all_node_properties(formatted=True)
    }
    self.node_property_predicates.update(
        self.toolkit.get_all_node_properties(formatted=True)
    )
    self.node_property_predicates.update(
        self.toolkit.get_all_edge_properties(formatted=True)
    )
    # TODO: validate expansion of the scope of this statement to include 'knowledge_source' and its descendants?
    self.node_property_predicates.update(
        URIRef(expand("biolink:" + ksf)) for ksf in knowledge_provenance_properties
    )

    self.reification_types = {
        RDF.Statement,
        self.BIOLINK.Association,
        self.OBAN.association,
    }
    self.reification_predicates = {
        self.BIOLINK.subject,
        self.BIOLINK.predicate,
        self.BIOLINK.object,
        RDF.subject,
        RDF.object,
        RDF.predicate,
        self.OBAN.association_has_subject,
        self.OBAN.association_has_predicate,
        self.OBAN.association_has_object,
    }

    self.reified_nodes: Set = set()
    self.start: int = 0
    self.count: int = 0
    self.CACHE_SIZE = 10000
    self.node_record = {}
    self.edge_record = {}
    self.node_cache = {}
    self.edge_cache = {}
    self._incomplete_nodes = {}
def __init__(self):
    """
    Set up namespaces, predicate sets, counters and caches.
    """
    super().__init__()

    prefix_map = self.prefix_manager.prefix_map
    self.DEFAULT = Namespace(prefix_map[''])
    # TODO: use OBO IRI from biolink model context once
    # https://github.com/biolink/biolink-model/issues/211 is resolved
    # self.OBO = Namespace('http://purl.obolibrary.org/obo/')
    self.OBAN = Namespace(prefix_map['OBAN'])
    self.PMID = Namespace(prefix_map['PMID'])
    self.BIOLINK = Namespace(prefix_map['biolink'])

    self.predicate_mapping = {}
    self.cache: Dict = {}
    self.toolkit = get_toolkit()

    # Node property predicates are held as expanded URIRefs and as the
    # toolkit's formatted names; formatted edge property names are added too.
    expand = self.prefix_manager.expand
    self.node_property_predicates = {
        URIRef(expand(x))
        for x in self.toolkit.get_all_node_properties(formatted=True)
    }
    self.node_property_predicates.update(
        self.toolkit.get_all_node_properties(formatted=True)
    )
    self.node_property_predicates.update(
        self.toolkit.get_all_edge_properties(formatted=True)
    )
    self.node_property_predicates.add(URIRef(expand('biolink:provided_by')))

    self.reification_types = {RDF.Statement, self.BIOLINK.Association, self.OBAN.association}
    self.reification_predicates = {
        self.BIOLINK.subject,
        self.BIOLINK.predicate,
        self.BIOLINK.object,
        RDF.subject,
        RDF.object,
        RDF.predicate,
        self.OBAN.association_has_subject,
        self.OBAN.association_has_predicate,
        self.OBAN.association_has_object,
    }

    self.reified_nodes: Set = set()
    self.start: int = 0
    self.count: int = 0
    self.CACHE_SIZE = 10000
    self.node_record = {}
    self.edge_record = {}
    self.node_cache = {}
    self.edge_cache = {}
    self._incomplete_nodes = {}
def __init__(
    self,
    verbose: bool = False,
    progress_monitor: Optional[Callable[[GraphEntityType, List], None]] = None,
):
    """
    Initialise the validator.

    Parameters
    ----------
    verbose: bool
        Stored on the instance as ``verbose``
    progress_monitor: Optional[Callable[[GraphEntityType, List], None]]
        Stored on the instance as ``progress_monitor``; not called here
    """
    # formal arguments
    self.verbose: bool = verbose
    self.progress_monitor: Optional[Callable[[GraphEntityType, List], None]] = progress_monitor

    # internal attributes
    self.toolkit = get_toolkit()
    self.prefix_manager = PrefixManager()

    context = get_jsonld_context()
    self.jsonld = context
    self.prefixes = Validator.get_all_prefixes(context)

    self.required_node_properties = Validator.get_required_node_properties()
    self.required_edge_properties = Validator.get_required_edge_properties()
    self.errors: List[ValidationError] = []
def get_required_edge_properties() -> list:
    """
    Collect the snake_case names of the children of ``'association slot'``
    whose Biolink Model element is marked as required.

    Returns
    -------
    list
        A list of required edge properties

    """
    toolkit = get_toolkit()
    required_properties = []
    for slot_name in toolkit.children('association slot'):
        element = toolkit.get_element(slot_name)
        # Skip elements with no `required` attribute or a falsy one.
        if not getattr(element, 'required', False):
            continue
        # TODO: this should be handled by bmt
        required_properties.append(sentencecase_to_snakecase(element.name))
    return required_properties
def get_required_edge_properties() -> list:
    """
    Collect the snake_case names of all edge properties whose Biolink Model
    element is not deprecated and is marked as required.

    Returns
    -------
    list
        A list of required edge properties

    """
    toolkit = get_toolkit()
    required_properties = []
    for slot_name in toolkit.get_all_edge_properties():
        element = toolkit.get_element(slot_name)
        # Ignore unknown elements and anything carrying a deprecation value.
        if not element or element.deprecated is not None:
            continue
        if getattr(element, 'required', False):
            required_properties.append(sentencecase_to_snakecase(element.name))
    return required_properties
def get_biolink_element(self, predicate: Any) -> Optional[Element]:
    """
    Look up the Biolink Model element for a predicate.

    The predicate is contracted to a CURIE when it is an IRI, reduced to its
    reference when it is a CURIE, and looked up in the toolkit. If that finds
    nothing, a lookup by mapping on the original ``predicate`` is attempted.
    A ``ValueError`` from that fallback is reported through
    ``self.owner.log_error`` and ``None`` is returned.

    Parameters
    ----------
    predicate: Any
        The CURIE of a predicate

    Returns
    -------
    Optional[Element]
        The corresponding Biolink Model element

    """
    toolkit = get_toolkit()
    pm = self.prefix_manager

    curie = pm.contract(predicate) if pm.is_iri(predicate) else predicate
    reference = pm.get_reference(curie) if pm.is_curie(curie) else curie

    found = toolkit.get_element(reference)
    if found:
        return found

    try:
        mapping = toolkit.get_element_by_mapping(predicate)
        if mapping:
            found = toolkit.get_element(mapping)
    except ValueError as e:
        self.owner.log_error(
            entity=str(predicate),
            error_type=ErrorType.INVALID_EDGE_PREDICATE,
            message=str(e),
        )
        found = None
    return found
def test_distinct_validator_class_versus_default_toolkit_biolink_version():
    """
    After setting the Validator's Biolink Model to version 1.8.2, its toolkit
    must report a different model version than the default toolkit.
    """
    Validator.set_biolink_model(version="1.8.2")

    default_tk, validator_tk = get_toolkit(), Validator.get_toolkit()
    versions = (default_tk.get_model_version(), validator_tk.get_model_version())
    assert versions[0] != versions[1]
from ordered_set import OrderedSet from kgx.config import get_logger from kgx.graph.base_graph import BaseGraph from kgx.utils.kgx_utils import ( get_prefix_prioritization_map, get_biolink_element, get_biolink_ancestors, current_time_in_millis, format_biolink_category, generate_edge_key, get_toolkit, ) log = get_logger() toolkit = get_toolkit() SAME_AS = "biolink:same_as" SUBCLASS_OF = "biolink:subclass_of" LEADER_ANNOTATION = "clique_leader" ORIGINAL_SUBJECT_PROPERTY = "_original_subject" ORIGINAL_OBJECT_PROPERTY = "_original_object" def clique_merge( target_graph: BaseGraph, leader_annotation: str = None, prefix_prioritization_map: Optional[Dict[str, List[str]]] = None, category_mapping: Optional[Dict[str, str]] = None, strict: bool = True, ) -> Tuple[BaseGraph, nx.MultiDiGraph]: """
def __init__(self, source_graph: nx.MultiDiGraph = None):
    """
    Initialise with an empty ontology list, a prefix manager and a toolkit.

    Parameters
    ----------
    source_graph: nx.MultiDiGraph
        Passed through to the parent class initializer
    """
    super().__init__(source_graph)
    self.ontologies = list()
    self.prefix_manager = PrefixManager()
    self.toolkit = get_toolkit()
def validate_node_property_types(node: str, data: dict) -> list:
    """
    Check node properties against the value type and cardinality declared on
    their Biolink Model element. Properties with no element are skipped.

    Parameters
    ----------
    node: str
        Node identifier
    data: dict
        Node properties

    Returns
    -------
    list
        A list of errors for a given node

    """
    toolkit = get_toolkit()
    errors = []
    error_type = ErrorType.INVALID_NODE_PROPERTY_VALUE_TYPE

    def report(message):
        # Every error for this node shares the same id, type and level.
        errors.append(ValidationError(node, error_type, message, MessageLevel.ERROR))

    if not isinstance(node, str):
        report("Node property 'id' expected to be of type 'string'")

    for key, value in data.items():
        element = toolkit.get_element(key)
        if not element:
            continue

        if hasattr(element, 'typeof'):
            expected = element.typeof
            if expected == 'string' and not isinstance(value, str):
                report(f"Node property '{key}' expected to be of type '{expected}'")
            elif (
                expected == 'uriorcurie'
                and not isinstance(value, str)
                and not validators.url(value)
            ):
                report(f"Node property '{key}' expected to be of type 'uri' or 'CURIE'")
            elif expected == 'double' and not isinstance(value, (int, float)):
                report(f"Node property '{key}' expected to be of type '{expected}'")
            else:
                # Reached whenever none of the three checks above reports an error.
                logger.warning(
                    "Skipping validation for Node property '{}'. Expected type '{}' vs Actual type '{}'"
                    .format(key, expected, type(value))
                )

        if hasattr(element, 'multivalued'):
            if element.multivalued:
                if not isinstance(value, list):
                    report(f"Multi-valued node property '{key}' expected to be of type '{list}'")
            elif isinstance(value, (list, set, tuple)):
                report(f"Single-valued node property '{key}' expected to be of type '{str}'")

    return errors
def test_get_toolkit():
    """
    Check that get_toolkit returns a Toolkit instance.
    """
    toolkit_instance = get_toolkit()
    assert isinstance(toolkit_instance, Toolkit)
def validate_edge_property_types(subject: str, object: str, data: dict) -> list:
    """
    Check edge properties against the value type and cardinality declared on
    their Biolink Model element. Properties with no element are skipped.

    Parameters
    ----------
    subject: str
        Subject identifier
    object: str
        Object identifier
    data: dict
        Edge properties

    Returns
    -------
    list
        A list of errors for a given edge

    """
    toolkit = get_toolkit()
    errors = []
    error_type = ErrorType.INVALID_EDGE_PROPERTY_VALUE_TYPE

    def report(message):
        # Every error for this edge shares the same id, type and level.
        errors.append(
            ValidationError(f"{subject}-{object}", error_type, message, MessageLevel.ERROR)
        )

    if not isinstance(subject, str):
        report("'subject' of an edge expected to be of type 'string'")
    if not isinstance(object, str):
        report("'object' of an edge expected to be of type 'string'")

    for key, value in data.items():
        element = toolkit.get_element(key)
        if not element:
            continue

        if hasattr(element, 'typeof'):
            expected = element.typeof
            if expected == 'string' and not isinstance(value, str):
                report(f"Edge property '{key}' expected to be of type 'string'")
            elif (
                expected == 'uriorcurie'
                and not isinstance(value, str)
                and not validators.url(value)
            ):
                report(f"Edge property '{key}' expected to be of type 'uri' or 'CURIE'")
            elif expected == 'double' and not isinstance(value, (int, float)):
                report(f"Edge property '{key}' expected to be of type 'double'")
            else:
                # Reached whenever none of the three checks above reports an error.
                logger.warning(
                    "Skipping validation for Edge property '{}'. Expected type '{}' vs Actual type '{}'"
                    .format(key, expected, type(value))
                )

        if hasattr(element, 'multivalued'):
            if element.multivalued:
                if not isinstance(value, list):
                    report(f"Multi-valued edge property '{key}' expected to be of type 'list'")
            elif isinstance(value, (list, set, tuple)):
                report(f"Single-valued edge property '{key}' expected to be of type 'str'")

    return errors
def _categories_conflict(toolkit, u_categories, v_categories) -> bool:
    """
    Return True at the first pair of known categories (one from each side)
    where neither is in the other's ancestor list. Categories unknown to the
    toolkit are skipped, as are pairs whose ancestor lists are both empty.
    """
    for a in u_categories:
        if toolkit.get_element(a) is None:
            continue
        # Invariant for the inner loop, so looked up once per `a`.
        a_ancestors = toolkit.ancestors(a)
        for b in v_categories:
            if toolkit.get_element(b) is None:
                continue
            b_ancestors = toolkit.ancestors(b)
            if a_ancestors == b_ancestors == []:
                continue
            if a not in b_ancestors and b not in a_ancestors:
                return True
    return False


def clique_merge(graph: nx.Graph, report=False) -> nx.Graph:
    """
    Builds up cliques using the `same_as` attribute of each node. Uses those
    cliques to build up a mapping for relabelling nodes. Chooses labels so as
    to preserve the original nodes, rather than taking xrefs that don't appear
    as nodes in the graph.

    This method will also expand the `same_as` attribute of the nodes to
    include the discovered clique.

    Parameters
    ----------
    graph: nx.Graph
        Graph whose nodes are merged; node attributes are updated in place
    report: bool
        Not used by this function; kept for interface compatibility

    Returns
    -------
    nx.Graph
        The graph returned by ``relabel_nodes``, with ``same_as`` edges removed
    """
    toolkit = get_toolkit()

    original_size = len(graph)
    print('original graph has {} nodes'.format(original_size))

    cliqueGraph = nx.Graph()

    with click.progressbar(graph.nodes(data=True), label='building cliques from same_as node property') as bar:
        for n, attr_dict in bar:
            if 'same_as' in attr_dict:
                for m in attr_dict['same_as']:
                    cliqueGraph.add_edge(n, m)

    with click.progressbar(graph.edges(data=True), label='building cliques from same_as edges') as bar:
        for u, v, attr_dict in bar:
            if 'edge_label' in attr_dict and attr_dict['edge_label'] == 'same_as':
                cliqueGraph.add_edge(u, v)

    edges = []
    with click.progressbar(cliqueGraph.edges(), label='Breaking invalid cliques') as bar:
        for u, v in bar:
            try:
                u_categories = graph.nodes[u].get('category', [])
                v_categories = graph.nodes[v].get('category', [])
            except KeyError:
                # An endpoint named only in a `same_as` list is not a node of
                # `graph`, so there are no categories to compare.
                continue
            if _categories_conflict(toolkit, u_categories, v_categories):
                edges.append((u, v))

    print('breaking {} many edges'.format(len(edges)))
    cliqueGraph.remove_edges_from(edges)

    mapping = {}

    connected_components = list(nx.connected_components(cliqueGraph))

    print('Discovered {} cliques'.format(len(connected_components)))

    with click.progressbar(connected_components, label='building mapping') as bar:
        for nodes in bar:
            nodes = list(nodes)
            categories = set()
            for n in nodes:
                if not graph.has_node(n):
                    continue

                attr_dict = graph.nodes[n]

                # NOTE: every node in the clique receives this same list
                # object, which is sorted in place further down.
                attr_dict['same_as'] = nodes

                if 'category' in attr_dict:
                    categories.update(listify(attr_dict['category']))

                if 'categories' in attr_dict:
                    categories.update(listify(attr_dict['categories']))

            list_of_prefixes = []
            for category in categories:
                try:
                    list_of_prefixes.append(toolkit.get_element(category).id_prefixes)
                except Exception:
                    # Best effort: a category with no usable element adds no
                    # prefixes. Narrowed from a bare except so that
                    # KeyboardInterrupt/SystemExit are no longer swallowed.
                    pass

            nodes.sort()
            nodes.sort(key=build_sort_key(list_of_prefixes))

            # The first node after sorting becomes the clique's label.
            for n in nodes:
                if n != nodes[0]:
                    mapping[n] = nodes[0]

    g = relabel_nodes(graph, mapping)

    edges = [
        (u, v, key)
        for u, v, key, data in g.edges(keys=True, data=True)
        if data.get('edge_label') == 'same_as'
    ]
    g.remove_edges_from(edges)

    for n, data in g.nodes(data=True):
        data['iri'] = expand_uri(n)
        if 'id' in data and data['id'] != n:
            data['id'] = n
        if 'same_as' in data and n in data['same_as']:
            data['same_as'].remove(n)
            if data['same_as'] == []:
                del data['same_as']

    final_size = len(g)
    print('Resulting graph has {} nodes'.format(final_size))
    print('Eliminated {} nodes'.format(original_size - final_size))

    return g
def set_biolink_model(cls, version: Optional[str]):
    """
    Replace the class's currently active toolkit with the one returned by
    ``get_toolkit`` for the given Biolink Model release.

    Parameters
    ----------
    version: Optional[str]
        Passed to ``get_toolkit`` as ``biolink_release``
    """
    selected_toolkit = get_toolkit(biolink_release=version)
    cls._currently_active_toolkit = selected_toolkit