def _bfs(
    self, kg: KG, entity: Vertex, is_reverse: bool = False
) -> List[Walk]:
    """Collects the walks of an entity with a Breadth-first search (BFS).

    The search starts from the single-vertex walk ``(entity,)`` and performs
    ``self.max_depth`` expansion rounds. In each round, every walk that has
    at least one hop is replaced by its one-hop extensions.

    Args:
        kg: The Knowledge Graph queried for hops.
        entity: The root node to extract walks.
        is_reverse: True to follow the parent hops (walks grow at the
            front), False to follow the child hops (walks grow at the back).
            Defaults to False.

    Returns:
        The list of unique walks for the provided entity.

    """
    walks: Set[Walk] = {(entity,)}
    for _ in range(self.max_depth):
        # Iterate over a snapshot: the set itself is mutated below.
        for walk in walks.copy():
            if is_reverse:
                hops = kg.get_hops(walk[0], True)
                walks.update((obj, pred) + walk for pred, obj in hops)
            else:
                hops = kg.get_hops(walk[-1])
                walks.update(walk + (pred, obj) for pred, obj in hops)
            # A walk that could be extended is superseded by its extensions.
            if len(hops) > 0:
                walks.remove(walk)
    return list(walks)
def _bfs(
    self, kg: KG, root: Vertex, is_reverse: bool = False
) -> List[Walk]:
    """Extracts walks from a root node with a Breadth-first search.

    Args:
        kg: The Knowledge Graph.
        root: The root node to extract walks.
        is_reverse: True to get the parent neighbors instead of the child
            neighbors, False otherwise.
            Defaults to False.

    Returns:
        The list of walks for the root node.

    """

    def extend(walk, pred, obj):
        # Reverse walks are prepended to, forward walks are appended to.
        if is_reverse:
            return (obj, pred) + walk
        return walk + (pred, obj)

    walks: Set[Walk] = {(root,)}
    for _ in range(self.max_depth):
        # Work on a copy because ``walks`` changes during the expansion.
        for walk in walks.copy():
            if is_reverse:
                hops = kg.get_hops(walk[0], True)
            else:
                hops = kg.get_hops(walk[-1])
            for pred, obj in hops:
                walks.add(extend(walk, pred, obj))
            # Only walks without any further hop are kept as they are.
            if len(hops) > 0:
                walks.remove(walk)
    return list(walks)
def _extract(
    self, kg: KG, instance: rdflib.URIRef
) -> Dict[Any, Tuple[Tuple[str, ...], ...]]:
    """Extracts the walks rooted at an instance and rewrites them with the
    Weisfeiler-Lehman labels stored in ``self._label_map``.

    For every iteration index ``n`` in ``0..self.wl_iterations`` each walk is
    converted into a canonical walk: the root (position 0) and every odd
    position are kept as their string form, while the other positions are
    replaced by ``self._label_map[hop][n]``.

    Args:
        kg: The Knowledge Graph.
        instance: The instance to be extracted from the Knowledge Graph.

    Returns:
        A dictionary with a single entry that maps the instance to the tuple
        of its unique canonical walks.

    """
    walks = self.extract_random_walks(kg, str(instance))

    # The returned hops are discarded here; presumably the call is made for
    # its side effects on the Knowledge Graph -- TODO confirm.
    for walk in walks:
        kg.get_hops(walk[-1])  # type: ignore

    self._weisfeiler_lehman(kg)

    canonical_walks = set()
    for n in range(self.wl_iterations + 1):
        for walk in walks:
            canonical_walks.add(
                tuple(
                    str(hop)
                    if i == 0 or i % 2 == 1
                    else self._label_map[hop][n]
                    for i, hop in enumerate(walk)  # type: ignore
                )
            )
    return {instance: tuple(canonical_walks)}
def test_invalid_file(self):
    """Checks that a KG built from a bad location raises FileNotFoundError."""
    for location in ("foo", "samples/mutag/"):
        with pytest.raises(FileNotFoundError):
            KG(location, label_predicates=LABEL_PREDICATES)
def extract(
    self, kg: KG, entities: Entities, verbose: int = 0
) -> List[List[SWalk]]:
    """Fits the provided sampling strategy and then extracts the walks of
    every entity in a pool of worker processes.

    Args:
        kg: The Knowledge Graph.
        entities: The entities to be extracted from the Knowledge Graph.
        verbose: The verbosity level.
            0: does not display anything;
            1: display of the progress of extraction and training of walks;
            2: debugging.
            Defaults to 0.

    Returns:
        The value returned by ``self._post_extract`` for the per-entity
        results collected from the pool, in the order of ``entities``.

    Raises:
        WalkerNotSupported: If there is an attempt to use an invalid walking
            strategy to a remote Knowledge Graph.

    """
    if kg._is_remote and not self._is_support_remote:
        raise WalkerNotSupported(
            "Invalid walking strategy. Please, choose a walking strategy "
            "that can fetch walks via a SPARQL endpoint server."
        )

    self.sampler.fit(kg)

    # Without an explicit number of jobs, a single worker is used.
    process = 1 if self.n_jobs is None else self.n_jobs
    uses_multi_requests = kg._is_remote and kg.mul_req

    if uses_multi_requests and process >= 2:
        warnings.warn(
            "Using 'mul_req=True' and/or 'n_jobs>=2' speed up the "
            "extraction of entity's walks, but may violate the policy "
            "of some SPARQL endpoint servers.",
            category=RuntimeWarning,
            stacklevel=2,
        )

    if uses_multi_requests:
        kg._fill_hops(entities)

    with multiprocessing.Pool(process, self._init_worker, [kg]) as pool:
        progress = tqdm(
            pool.imap(self._proc, entities),
            total=len(entities),
            disable=verbose == 0,
        )
        res = list(progress)
    return self._post_extract(res)
def test_add_walk(self):
    """Checks that walks are added unless their predicate is skipped."""
    kg = KG(skip_predicates={f"{URL}#predicate"})

    # Every triple of the sample graph must be accepted.
    for row in GRAPH:
        subj_name, pred_name, obj_name = row[0], row[1], row[2]
        subj = Vertex(f"{URL}#{subj_name}")
        obj = Vertex(f"{URL}#{obj_name}")
        pred = Vertex(
            f"{URL}#{pred_name}", predicate=True, vprev=subj, vnext=obj
        )
        assert kg.add_walk(subj, pred, obj) is True

    # A walk that goes through the skipped predicate must be rejected.
    first_row = GRAPH[0]
    subj = Vertex(f"{URL}#{first_row[0]}")
    obj = Vertex(f"{URL}#{first_row[2]}")
    pred = Vertex(f"{URL}#predicate")
    assert kg.add_walk(subj, pred, obj) is False
def test_invalid_url(self):
    """Checks that a remote KG with an invalid URL raises ValueError."""
    options = {"label_predicates": LABEL_PREDICATES, "is_remote": True}
    with pytest.raises(ValueError):
        KG("foo", **options)
def transform(
    self, kg: KG, entities: Entities
) -> Tuple[Embeddings, Literals]:
    """Transforms the provided entities into embeddings and literals.

    Args:
        kg: The Knowledge Graph.
        entities: The entities including test entities to create the
            embeddings. Since RDF2Vec is unsupervised, there is no label
            leakage.

    Returns:
        The embeddings and the literals of the provided entities.

    """
    assert self.embedder is not None
    embeddings = self.embedder.transform(entities)

    # Only the literal extraction is timed.
    tic = time.perf_counter()
    literals = kg.get_literals(entities, self.verbose)
    elapsed = time.perf_counter() - tic

    self._update(self._embeddings, embeddings)
    if len(literals) > 0:
        self._update(self._literals, literals)

    if kg._is_remote and kg.mul_req:
        self._is_extract_walks_literals = False
        asyncio.run(kg.connector.close())

    if self.verbose >= 1 and len(literals) > 0:
        print(
            f"Extracted {len(literals)} literals for {len(entities)} "
            f"entities ({elapsed:0.4f}s)"
        )
    return embeddings, literals
def _create_label(self, kg: KG, vertex: Vertex, n: int) -> str:
    """Creates a label according to a vertex and its neighbors.

    The label is the ``n - 1`` label of the vertex, followed by the sorted,
    de-duplicated ``n - 1`` labels of its reverse neighbors, all joined with
    ``-``. The label map is computed first when it is still empty.

    Args:
        kg: The Knowledge Graph.
        vertex: The vertex to get its neighbors to create the suffix.
        n: The index of the label to create; labels at ``n - 1`` are read.

    Returns:
        The label created for the vertex.

    """
    if len(self._label_map) == 0:
        self._weisfeiler_lehman(kg)

    previous = n - 1
    neighbor_labels = {
        self._label_map[neighbor][previous]
        for neighbor in kg.get_neighbors(vertex, is_reverse=True)
    }
    suffix = "-".join(sorted(neighbor_labels))
    return f"{self._label_map[vertex][previous]}-{suffix}"
def sample_neighbor(self, kg: KG, walk, last):
    """Samples an untagged hop for the last vertex of a walk.

    A hop is untagged when ``(hop, len(walk))`` is not in ``self.visited``.
    The hop is drawn according to the weights returned by
    ``self.get_weight``, optionally inverted (``self.inverse``) and divided
    by ``self.degrees`` (``self.split``).

    Args:
        kg: The Knowledge Graph queried for hops.
        walk: The walk whose last vertex is expanded.
        last: True if the sampled hop is the last one of the walk, in which
            case the hop is tagged as visited.

    Returns:
        The sampled hop, or None if every hop is already tagged.

    """
    depth = len(walk)
    not_tag_neighbors = [
        x for x in kg.get_hops(walk[-1]) if (x, depth) not in self.visited
    ]

    # If there are no untagged neighbors, then tag
    # this vertex and return None
    if not not_tag_neighbors:
        if depth > 2:
            self.visited.add(((walk[-2], walk[-1]), depth - 2))
        return None

    weights = [self.get_weight(hop) for hop in not_tag_neighbors]
    if self.inverse:
        # The extrema are loop invariants: compute them only once.
        highest, lowest = max(weights), min(weights)
        weights = [highest - (x - lowest) for x in weights]
    if self.split:
        weights = [
            w / self.degrees[v[1]] for w, v in zip(weights, not_tag_neighbors)
        ]

    # Normalise with a single pass over the weights. When they sum to zero
    # no probability vector can be built, so fall back to a uniform draw
    # (p=None) instead of failing on a division by zero.
    total = sum(weights)
    probabilities = [x / total for x in weights] if total else None

    # Sample a random neighbor and add them to visited if needed.
    rand_ix = np.random.choice(len(not_tag_neighbors), p=probabilities)
    chosen = not_tag_neighbors[rand_ix]
    if last:
        self.visited.add((chosen, depth))
    return chosen
def _bfs(self, kg: KG, root: Vertex, is_reverse: bool = False) -> List[Walk]:
    """Extracts walks rooted in root with a Breadth-first search, with
    optional jumps to a vertex of the same community.

    Each of the ``self.max_depth`` rounds replaces every walk that has at
    least one hop by its one-hop extensions. In addition, when the object of
    a hop belongs to a community, a draw below ``self.hop_prob`` adds a walk
    extended by a vertex chosen among the members of that community.

    Args:
        kg: The Knowledge Graph.
        root: The root node to extract walks.
        is_reverse: True to get the parent neighbors instead of the child
            neighbors, False otherwise.
            Defaults to False.

    Returns:
        The list of unique walks for the root node.

    """
    # A single generator is used for the whole traversal. Building a new
    # RandomState from the same seed for every draw would return the very
    # same number each time, making every community jump decision identical.
    rng = np.random.RandomState(self.random_state)

    walks: Set[Walk] = {(root,)}
    for _ in range(self.max_depth):
        # Iterate over a snapshot: the set itself is mutated below.
        for walk in walks.copy():
            if is_reverse:
                hops = kg.get_hops(walk[0], True)
                for pred, obj in hops:
                    walks.add((obj, pred) + walk)
                    if (
                        obj in self.communities
                        and rng.random() < self.hop_prob
                    ):
                        members = self.labels_per_community[
                            self.communities[obj]
                        ]
                        walks.add((rng.choice(members),) + walk)
            else:
                hops = kg.get_hops(walk[-1])
                for pred, obj in hops:
                    walks.add(walk + (pred, obj))
                    if (
                        obj in self.communities
                        and rng.random() < self.hop_prob
                    ):
                        members = self.labels_per_community[
                            self.communities[obj]
                        ]
                        walks.add(walk + (rng.choice(members),))
            # A walk that could be extended is superseded by its extensions.
            if len(hops) > 0:
                walks.remove(walk)
    return list(walks)
def _community_detection(self, kg: KG) -> None:
    """Detects the communities of the non-predicate vertices.

    The Knowledge Graph is converted to an undirected networkX graph whose
    nodes are the string forms of the non-predicate vertices and whose edges
    link a subject to the objects reached through its predicates. The graph
    is then partitioned with ``community.best_partition``.

    Sets ``self.communities`` (vertex -> community) and
    ``self.labels_per_community`` (community -> list of vertices).

    Args:
        kg: The Knowledge Graph.

    """
    entities = [v for v in kg._vertices if not v.predicate]

    nx_graph = nx.Graph()
    for entity in entities:
        nx_graph.add_node(str(entity), vertex=entity)

    for entity in entities:
        subj_name = str(entity)
        # Neighbors are predicates
        for pred in kg.get_neighbors(entity):
            pred_name = str(pred)
            for obj in kg.get_neighbors(pred):
                nx_graph.add_edge(subj_name, str(obj), name=pred_name)

    # This will create a dictionary that maps the URI on a community
    partition = community.best_partition(
        nx_graph, resolution=self.resolution
    )
    vertices = nx.get_node_attributes(nx_graph, "vertex")

    self.communities = {
        vertices[name]: group
        for name, group in partition.items()
        if name in vertices
    }
    self.labels_per_community = defaultdict(list)
    for vertex, group in self.communities.items():
        self.labels_per_community[group].append(vertex)
def fit(self, kg: KG) -> None:
    """Fits the sampling strategy by running PageRank on a directed graph
    built from the Knowledge Graph.

    Each non-predicate vertex becomes a node and is linked to the objects
    reached through its predicates. The scores are stored in
    ``self.pageranks``.

    Args:
        kg: The Knowledge Graph.

    """
    super().fit(kg)
    nx_graph = nx.DiGraph()

    for vertex in kg._vertices:
        if vertex.predicate:
            continue
        subj_name = vertex.name
        nx_graph.add_node(subj_name, vertex=vertex)
        for predicate in kg.get_neighbors(vertex):
            for obj in kg.get_neighbors(predicate):
                nx_graph.add_edge(subj_name, obj.name, name=predicate.name)

    self.pageranks = nx.pagerank(nx_graph, alpha=self.alpha)
def fit(self, kg: KG) -> None:
    """Fits the sampling strategy by running PageRank on the provided KG.

    A directed graph is built with one node per non-predicate vertex and one
    edge per (subject, object) pair linked by a predicate. ``self.alpha`` is
    passed to PageRank and the scores are stored in ``self._pageranks``.

    Args:
        kg: The Knowledge Graph.

    """
    super().fit(kg)

    def add_outgoing_edges(graph, subj):
        # Links the subject to each object reached through its predicates.
        for pred in kg.get_neighbors(subj):
            for obj in kg.get_neighbors(pred):
                graph.add_edge(subj.name, obj.name, name=pred.name)

    nx_graph = nx.DiGraph()
    for vertex in kg._vertices:
        if not vertex.predicate:
            nx_graph.add_node(vertex.name, vertex=vertex)
            add_outgoing_edges(nx_graph, vertex)

    self._pageranks = nx.pagerank(nx_graph, alpha=self.alpha)
def test_get_neighbors(self):
    """Checks the hops of Alice for both the local and the remote KG."""
    remote_kg = KG(SPARQL_ENDPOINT, is_remote=True)
    for graph in (LOCAL_KG, remote_kg):
        hops = graph.get_hops(f"{URL}#Alice")

        predicate_names = {str(hop[0]) for hop in hops}
        assert predicate_names == {f"{URL}#knows"}

        objects = [hop[1] for hop in hops]
        assert Vertex(f"{URL}#Bob") in objects
        assert Vertex(f"{URL}#Dean") in objects
def fit(self, kg: KG) -> None:
    """Fits the sampling strategy by counting the inverse neighbors of
    each non-predicate vertex.

    The counts are stored in ``self.counts``, keyed by vertex name.

    Args:
        kg: The Knowledge Graph.

    """
    super().fit(kg)
    self.counts = {
        vertex.name: len(kg.get_inv_neighbors(vertex))
        for vertex in kg._vertices
        if not vertex.predicate
    }
def _community_detection(self, kg: KG) -> None:
    """Partitions the non-predicate vertices of the KG into communities.

    An undirected networkX graph is built from the Knowledge Graph (nodes
    are named after the string form of the vertices) and partitioned with
    ``community.best_partition``. The result is stored in
    ``self.communities`` and ``self.labels_per_community``.

    Args:
        kg: The Knowledge Graph.

    """
    nx_graph = nx.Graph()

    # First pass: register each subject/object together with its vertex.
    for vertex in kg._vertices:
        if vertex.predicate:
            continue
        nx_graph.add_node(str(vertex), vertex=vertex)

    # Second pass: connect each subject to the objects of its predicates.
    for vertex in kg._vertices:
        if vertex.predicate:
            continue
        # Neighbors are predicates
        for pred in kg.get_neighbors(vertex):
            for obj in kg.get_neighbors(pred):
                nx_graph.add_edge(str(vertex), str(obj), name=str(pred))

    # Create a dictionary that maps the URI on a community
    partition = community.best_partition(
        nx_graph, resolution=self.resolution
    )
    self.labels_per_community = defaultdict(list)
    self.communities = {}

    vertices = nx.get_node_attributes(nx_graph, "vertex")
    for node_name, group in partition.items():
        if node_name in vertices:
            self.communities[vertices[node_name]] = group

    for member, group in self.communities.items():
        self.labels_per_community[group].append(member)
def fit(self, kg: KG) -> None:
    """Fits the sampling strategy by counting the reverse neighbors of
    each non-predicate vertex.

    The counts are written to ``self._counts``, keyed by vertex name.

    Args:
        kg: The Knowledge Graph.

    """
    super().fit(kg)
    for vertex in kg._vertices:
        if vertex.predicate:
            continue
        parents = kg.get_neighbors(vertex, is_reverse=True)
        self._counts[vertex.name] = len(parents)
def extract(
    self, kg: KG, instances: List[rdflib.URIRef], verbose=False
) -> Set[Tuple[Any, ...]]:
    """Fits the provided sampling strategy and then extracts the canonical
    walks of every instance in a pool of worker processes.

    Args:
        kg: The Knowledge Graph.
        instances: The instances to be extracted from the Knowledge Graph.
        verbose: If true, display a progress bar for the extraction of the
            walks.

    Returns:
        The set of canonical walks gathered over all the instances.

    Raises:
        RemoteNotSupported: If the Knowledge Graph is remote and the walking
            strategy does not support it.

    """
    if kg.is_remote and not self.is_support_remote:
        raise RemoteNotSupported(
            "Invalid walking strategy. Please, choose a walking strategy "
            "that can retrieve walks via a SPARQL endpoint server."
        )
    self.sampler.fit(kg)

    # To avoid circular imports
    if "CommunityWalker" in str(self):
        self._community_detection(kg)  # type: ignore

    if kg.is_remote:
        asyncio.run(kg._fill_entity_hops(instances))  # type: ignore

    with multiprocessing.Pool(self.n_jobs, self._init_worker, [kg]) as pool:
        partial_results = list(
            tqdm(
                pool.imap_unordered(self._proc, instances),
                total=len(instances),
                disable=not verbose,
            )
        )

    # Each worker result is a mapping; merge them into a single one.
    walks_by_instance = {}
    for partial in partial_results:
        walks_by_instance.update(partial)  # type: ignore

    canonical_walks = set()
    for instance in instances:
        canonical_walks.update(walks_by_instance[instance])
    return canonical_walks
def fit(self, kg: KG) -> None:
    """Fits the sampling strategy by counting the occurrences of each
    (predicate name, object name) pair.

    The counts are stored in ``self.counts``. A predicate vertex without any
    neighbor is skipped instead of raising an IndexError.

    Args:
        kg: The Knowledge Graph.

    """
    super().fit(kg)
    self.counts: DefaultDict[Any, Any] = defaultdict(int)
    for vertex in kg._vertices:
        if not vertex.predicate:
            continue
        # A predicate is expected to have one object; take the first
        # neighbor without materialising the whole collection.
        obj = next(iter(kg.get_neighbors(vertex)), None)
        if obj is None:
            continue
        self.counts[(vertex.name, obj.name)] += 1
def fit(self, kg: KG) -> None:
    """Fits the sampling strategy.

    When ``self.split`` is set, the number of inverse neighbors of each
    non-predicate vertex is stored in ``self.degrees``, keyed by vertex name.

    Args:
        kg: The Knowledge Graph.

    Raises:
        ValueError: If the Knowledge Graph is remote and this sampler does
            not support remote Knowledge Graphs.

    """
    if kg.is_remote and not self.remote_supported:
        raise ValueError("This sampler is not supported for remote KGs.")

    if not self.split:
        return

    self.degrees = {
        vertex.name: len(kg.get_inv_neighbors(vertex))
        for vertex in kg._vertices
        if not vertex.predicate
    }
def _create_label(self, kg: KG, vertex: Vertex, n: int):
    """Creates a label for a vertex from its inverse neighbors.

    The label is the ``n - 1`` label of the vertex, followed by ``-`` and the
    sorted, de-duplicated string forms of the ``n - 1`` labels of its inverse
    neighbors, themselves joined with ``-``.

    Args:
        kg: The Knowledge Graph.
        vertex: The vertex.
        n: The position; labels at ``n - 1`` are read.

    Returns:
        The label created for the vertex.

    """
    previous = n - 1
    unique_names = {
        str(self._label_map[neighbor][previous])
        for neighbor in kg.get_inv_neighbors(vertex)
    }
    suffix = "-".join(sorted(unique_names))
    return self._label_map[vertex][previous] + "-" + suffix
def fit(self, kg: KG) -> None:
    """Fits the sampling strategy by running PageRank on a provided KG
    according to the specified damping.

    The directed graph has one node per non-predicate vertex and one edge
    per hop of that vertex. The scores are stored in ``self._pageranks``.

    Args:
        kg: The Knowledge Graph.

    """
    super().fit(kg)

    # Snapshot of the subjects and objects, taken before querying any hop.
    entities = [v for v in kg._vertices if not v.predicate]

    nx_graph = nx.DiGraph()
    for entity in entities:
        source = entity.name
        nx_graph.add_node(source, vertex=entity)
        for hop in kg.get_hops(entity):
            pred, obj = hop[0], hop[1]
            nx_graph.add_edge(source, obj.name, name=pred.name)

    self._pageranks = nx.pagerank(nx_graph, alpha=self.alpha)
def get_walks(self, kg: KG, entities: Entities) -> List[List[SWalk]]:
    """Gets the walks of the entities based on a Knowledge Graph and the
    list of walkers.

    Args:
        kg: The Knowledge Graph.
        entities: The entities including test entities to create the
            embeddings. Since RDF2Vec is unsupervised, there is no label
            leakage.

    Returns:
        The walks for the given entities.

    Raises:
        ValueError: If the provided entities aren't in the Knowledge Graph.

    """
    # The existence check is skipped unless skip_verify is exactly False.
    if kg.skip_verify is False and not kg.is_exist(entities):
        if kg.mul_req:
            asyncio.run(kg.connector.close())
        raise ValueError(
            "At least one provided entity does not exist in the "
            "Knowledge Graph."
        )

    if self.verbose == 2:
        print(kg)
        print(self.walkers[0])

    walks: List[List[SWalk]] = []
    tic = time.perf_counter()
    for walker in self.walkers:
        walks.extend(walker.extract(kg, entities, self.verbose))
    elapsed = time.perf_counter() - tic

    self._update(self._entities, entities)
    self._update(self._walks, walks)

    if self.verbose >= 1:
        n_walks = sum(len(entity_walks) for entity_walks in walks)
        print(
            f"Extracted {n_walks} walks "
            f"for {len(entities)} entities ({elapsed:0.4f}s)"
        )

    if (
        kg._is_remote
        and kg.mul_req
        and not self._is_extract_walks_literals
    ):
        asyncio.run(kg.connector.close())
    return walks
def fit(self, kg: KG) -> None:
    """Fits the sampling strategy by counting the number of occurrences of
    each (predicate name, object name) pair.

    Only the first neighbor of a predicate vertex is used; predicates
    without any neighbor are ignored. Counts go to ``self._counts``.

    Args:
        kg: The Knowledge Graph.

    """
    super().fit(kg)
    for vertex in kg._vertices:
        if not vertex.predicate:
            continue
        neighbors = list(kg.get_neighbors(vertex))
        if len(neighbors) == 0:
            continue
        pair = (vertex.name, neighbors[0].name)
        self._counts[pair] = self._counts.get(pair, 0) + 1
def fit(self, kg: KG) -> None:
    """Fits the sampling strategy by counting how often each
    (predicate name, object name) pair is seen.

    For every predicate vertex that has at least one neighbor, the counter
    of the pair made of its name and the name of its first neighbor is
    incremented in ``self._counts``.

    Args:
        kg: The Knowledge Graph.

    """
    super().fit(kg)
    predicates = (v for v in kg._vertices if v.predicate)
    for predicate in predicates:
        objs = list(kg.get_neighbors(predicate))
        if not objs:
            continue
        key = (predicate.name, objs[0].name)
        # Start from 0 for an unseen pair, then count this occurrence.
        self._counts[key] = self._counts.setdefault(key, 0) + 1
def sample_hop(
    self, kg: KG, walk: Walk, is_last_hop: bool, is_reverse: bool = False
) -> Optional[Hop]:
    """Samples an unvisited random hop in the (predicate, object) form,
    according to the weight of hops for a given walk.

    Args:
        kg: The Knowledge Graph.
        walk: The walk with one or several vertices.
        is_last_hop: True if the next hop to be visited is the last one for
            the desired depth, False otherwise.
        is_reverse: True to get the parent neighbors instead of the child
            neighbors, False otherwise.
            Defaults to False.

    Returns:
        An unvisited hop in the (predicate, object) form, or None if every
        hop of the current vertex is already tagged as visited.

    """
    depth = len(walk)
    subj = walk[0] if is_reverse else walk[-1]

    untagged_neighbors = []
    for pred_obj in kg.get_hops(subj, is_reverse):
        if (pred_obj, depth) not in self.visited:
            untagged_neighbors.append(pred_obj)

    if not untagged_neighbors:
        # Dead end: tag the hop that led here so that it is not taken again.
        if depth > 2:
            if is_reverse:
                last_hop = (walk[1], walk[0])
            else:
                last_hop = (walk[-2], walk[-1])
            self.visited.add((last_hop, depth - 2))
        return None

    # NOTE: a new RandomState is built from self._random_state on each call.
    rng = np.random.RandomState(self._random_state)
    rnd_id = rng.choice(
        range(len(untagged_neighbors)),
        p=self.get_weights(untagged_neighbors),
    )
    sampled = untagged_neighbors[rnd_id]

    if is_last_hop:
        self.visited.add((sampled, depth))
    return sampled
def fit(self, kg: KG) -> None:
    """Fits the sampling strategy.

    When ``self.split`` is set, the number of reverse neighbors of each
    non-predicate vertex is stored in ``self._vertices_deg``, keyed by
    vertex name.

    Args:
        kg: The Knowledge Graph.

    Raises:
        SamplerNotSupported: If there is an attempt to use an invalid
            sampling strategy to a remote Knowledge Graph.

    """
    if kg._is_remote and not self._is_support_remote:
        raise SamplerNotSupported(
            "Invalid sampling strategy. Please, choose a sampling strategy"
            " that can fetch walks via a SPARQL endpoint server."
        )

    if not self.split:
        return

    for vertex in kg._vertices:
        if vertex.predicate:
            continue
        parents = kg.get_neighbors(vertex, is_reverse=True)
        self._vertices_deg[vertex.name] = len(parents)
def fit(self, kg: KG) -> None:
    """Fits the sampling strategy by counting the number of available
    neighbors for each vertex, but also by counting the number of
    occurrences of each predicate name and each object name.

    Predicate vertices use their reverse neighbors and are tallied in
    ``self._pred_degs``; the other vertices use their forward neighbors and
    are tallied in ``self._obj_degs``. Neighbor counts are stored in
    ``self._neighbor_counts``.

    Args:
        kg: The Knowledge Graph.

    """
    super().fit(kg)
    for vertex in kg._vertices:
        if vertex.predicate:
            is_reverse, counter = True, self._pred_degs
        else:
            is_reverse, counter = False, self._obj_degs

        neighbors = kg.get_neighbors(vertex, is_reverse=is_reverse)
        self._neighbor_counts[vertex.name] = len(neighbors)

        counter[vertex.name] = counter.get(vertex.name, 0) + 1
def extract_random_walks_bfs(self, kg: KG, root: str):
    """Breadth-first search to extract all possible walks.

    Starting from the walk ``(root,)``, ``self.depth`` expansion rounds are
    performed. In each round, every walk whose last vertex has hops is
    replaced by one extended walk per hop.

    Args:
        kg: The Knowledge Graph.
        root: The root node.

    Returns:
        The list of the walks.

    """
    walks = {(root,)}
    for _ in range(self.depth):
        # Iterate over a snapshot: the set itself is mutated below.
        for walk in walks.copy():
            hops = kg.get_hops(walk[-1])
            if len(hops) == 0:
                # Nothing to extend: the walk is kept as it is.
                continue
            walks.remove(walk)
            walks.update(
                walk + (pred, obj) for (pred, obj) in hops  # type: ignore
            )
    return list(walks)