def test_determinism(self, corpus):
    """Check that two single-worker trainings yield identical embeddings.

    A fresh ``Word2Vec(workers=1)`` is fitted on ``corpus`` twice and the
    vectors returned for the root entities are compared element-wise.
    """

    def train_and_embed():
        # Build, fit and query a brand-new model on every call so that the
        # two runs share no state.
        model = Word2Vec(workers=1)
        fitted = model.fit(corpus)
        return fitted.transform(
            [f"{URL}#{name}" for name in ROOTS_WITHOUT_URL]
        )

    first_run = train_and_embed()
    second_run = train_and_embed()
    assert np.array_equal(first_run, second_run)
def test_transform(self, corpus):
    """Check that a fitted model returns embeddings for the root entities."""
    model = Word2Vec()
    model.fit(corpus)

    targets = [f"{URL}#{name}" for name in ROOTS_WITHOUT_URL]
    vectors = model.transform(targets)

    assert len(vectors) > 0
def create_embeddings(
    kg: KG,
    entities: List[rdflib.URIRef],
    split: int,
    walkers: Sequence[Walker],
    sg: int = 1,
) -> Tuple[List[str], List[str]]:
    """Embeds the given entities and splits the result into train and test.

    Args:
        kg: The knowledge graph from which the neighborhoods of the provided
            entities are extracted.
        entities: The train instances followed by the test instances.
        split: The index separating train embeddings from test embeddings.
        walkers: The walking strategies handed to the transformer.
        sg: The training algorithm. 1 for skip-gram; otherwise CBOW.
            Defaults to 1.

    Returns:
        A pair ``(train, test)``: the first ``split`` items of the
        transformer output and the remaining ones.

    """
    embedder = Word2Vec(sg=sg)
    transformer = RDF2VecTransformer(embedder, walkers=walkers)

    # NOTE(review): the slicing below assumes fit_transform returns one flat
    # sequence with an item per entity; if it returns an
    # (embeddings, literals) pair instead, the split is wrong -- confirm
    # against the RDF2VecTransformer version in use.
    all_embeddings = transformer.fit_transform(kg, entities)

    train_part = all_embeddings[:split]
    test_part = all_embeddings[split:]
    return train_part, test_part
def __init__(
    self,
    embedder: Optional[Embedder] = None,
    walkers: Optional[Sequence[Walker]] = None,
):
    """Stores the embedder and the walkers, falling back to defaults.

    Args:
        embedder: The embedding technique. A new ``Word2Vec()`` is created
            when None is given.
        walkers: The walking strategies. ``[RandomWalker(2, None)]`` is used
            when None is given.

    """
    # Only None triggers a default: any other value, even a falsy one such
    # as an empty list of walkers, is stored as provided.
    self.embedder = Word2Vec() if embedder is None else embedder
    self.walkers = [RandomWalker(2, None)] if walkers is None else walkers
    self.walks_: List[rdflib.URIRef] = []
def test_online_training(self, corpus):
    """Check that an update fit still yields one embedding per root.

    The model is fitted on ``corpus``, the corpus is extended with one extra
    group of walks, the model is fitted again in update mode, and the number
    of embeddings returned for the root entities is checked both times.
    """

    def embed_roots(model):
        return model.transform(
            [f"{URL}#{name}" for name in ROOTS_WITHOUT_URL]
        )

    model = Word2Vec(workers=1)
    model.fit(corpus)
    assert len(embed_roots(model)) == 3

    extra_walk = (
        "http://pyRDF2Vec#Alice",
        "http://pyRDF2Vec#knows",
        "http://pyRDF2Vec#Casper",
        "http://pyRDF2Vec#knows",
        "http://pyRDF2Vec#Mario",
    )
    # The corpus holds one list of walks per entry, hence the wrapping list.
    corpus.append([extra_walk])

    # Second positional argument True requests an update of the model.
    model.fit(corpus, True)
    assert len(embed_roots(model)) == 3
from sklearn.manifold import TSNE from pyrdf2vec import RDF2VecTransformer from pyrdf2vec.embedders import Word2Vec from pyrdf2vec.graphs import KG from pyrdf2vec.walkers import RandomWalker # Ensure the determinism of this script by initializing a pseudo-random number. RANDOM_STATE = 22 data = pd.read_csv("samples/countries-cities/entities.tsv", sep="\t") transformer = RDF2VecTransformer( # Use one worker threads for Word2Vec to ensure random determinism. # Must be used with PYTHONHASHSEED. Word2Vec(workers=1), # Extract a maximum of 10 walks of a maximum depth of 4 for each entity # using two processes and use a random state to ensure that the same walks # are generated for the entities. walkers=[RandomWalker(4, 10, n_jobs=2, random_state=RANDOM_STATE)], verbose=1, ) # Train and save the Word2Vec model according to the KG, the entities, and # a walking strategy. embeddings, _ = transformer.fit_transform( # Defined the DBpedia endpoint server, as well as a set of predicates to # exclude from this KG. KG( "https://dbpedia.org/sparql", skip_predicates={"www.w3.org/1999/02/22-rdf-syntax-ns#type"},
def test_invalid_transform(self):
    """Check that transforming with an unfitted model raises ValueError."""
    with pytest.raises(ValueError):
        # Everything stays inside the context manager so that the scope of
        # the expected exception is unchanged.
        unfitted = Word2Vec()
        targets = [f"{URL}#{name}" for name in ROOTS_WITHOUT_URL]
        unfitted.transform(targets)
def test_fit(self, corpus, root):
    """Check that a root entity gets a vector only once the model is fitted."""
    key = f"{URL}#{root}"
    model = Word2Vec()

    # Before fitting, the entity is unknown to the underlying model.
    with pytest.raises(KeyError):
        model._model.wv[key]

    model.fit(corpus, False)

    vector = model._model.wv[key]
    assert len(vector) > 0
from pyrdf2vec.graphs import KG
from pyrdf2vec.samplers import UniformSampler
from pyrdf2vec.walkers import RandomWalker, Walker
import numpy as np
import logging

# Script: load a knowledge graph and a list of entities from disk, compute
# RDF2Vec embeddings for those entities and save the result to disk, logging
# each step.
#
# NOTE(review): RDF2VecTransformer and Word2Vec are used below, but their
# imports are not visible in this chunk -- confirm they are imported earlier
# in the file.

# Write INFO-level (and above) records to rdf2vec.log, each prefixed with a
# millisecond timestamp, the level, the module and the function name.
logging.basicConfig(
    filename="rdf2vec.log",
    level=logging.INFO,
    format=
    '%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')

# Define the label predicates: all triples with these predicates
# will be excluded from the graph. The list is left empty here, so no
# predicate is passed for exclusion.
logging.info("Read in knowledge graph.")
label_predicates = []
kg = KG(location="data/dbp_graph.ttl", label_predicates=label_predicates)

logging.info("Create walkers and transformers.")
# A single random walker configured with the values 4 and 5 and a uniform
# sampler (presumably maximum depth and maximum number of walks per entity --
# verify against the RandomWalker signature).
walkers = [RandomWalker(4, 5, UniformSampler())]
# sg=1 is passed to Word2Vec (presumably selecting skip-gram -- confirm).
transformer = RDF2VecTransformer(Word2Vec(sg=1), walkers=walkers)

logging.info("Read in entities.")
# Entities should be a list of URIs that can be found in the Knowledge Graph.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only load entity files from a trusted source.
entities = list(np.load("data/entities.npy", allow_pickle=True))

logging.info("Calculate embeddings.")
embeddings = transformer.fit_transform(kg, entities)

logging.info("Write embeddings to disk.")
np.save("data/embeddings.npy", embeddings)

logging.info("Finished job.")
test_data = pd.read_csv("samples/mutag/test.tsv", sep="\t") train_data = pd.read_csv("samples/mutag/train.tsv", sep="\t") train_entities = [entity for entity in train_data["bond"]] train_labels = list(train_data["label_mutagenic"]) test_entities = [entity for entity in test_data["bond"]] test_labels = list(test_data["label_mutagenic"]) entities = train_entities + test_entities labels = train_labels + test_labels embeddings, literals = RDF2VecTransformer( # Ensure random determinism for Word2Vec. # Must be used with PYTHONHASHSEED. Word2Vec(workers=1, epochs=10), # Extract all walks with a maximum depth of 2 for each entity using two # processes and use a random state to ensure that the same walks are # generated for the entities. walkers=[RandomWalker(2, None, n_jobs=2, random_state=RANDOM_STATE)], verbose=1, ).fit_transform( KG( "samples/mutag/mutag.owl", skip_predicates={"http://dl-learner.org/carcinogenesis#isMutagenic"}, literals=[ [ "http://dl-learner.org/carcinogenesis#hasAtom", "http://dl-learner.org/carcinogenesis#charge", ], ],
class RDF2VecTransformer:
    """Transforms nodes in a Knowledge Graph into an embedding.

    Attributes:
        _embeddings: All the embeddings of the model.
            Defaults to [].
        _entities: All the entities of the model.
            Defaults to [].
        _is_extract_walks_literals: True while `fit_transform` is running, so
            that `get_walks` leaves the remote connector open and `transform`
            is the one closing it. False, otherwise.
            Defaults to False.
        _literals: All the literals of the model.
            Defaults to [].
        _pos_entities: The positions, in `_entities`, of the entities of the
            latest update that were already known.
            Defaults to [].
        _pos_walks: The positions of those same entities within the latest
            provided batch of entities.
            Defaults to [].
        _walks: All the walks of the model.
            Defaults to [].
        embedder: The embedding technique.
            Defaults to Word2Vec.
        walkers: The walking strategies.
            Defaults to [RandomWalker(2)]
        verbose: The verbosity level.
            0: does not display anything;
            1: display of the progress of extraction and training of walks;
            2: debugging.
            Defaults to 0.

    """

    # NOTE(review): the attr.ib declarations below only take effect when the
    # class is decorated with attr.s; the decorator is not visible in this
    # chunk -- confirm it sits directly above the class statement.

    embedder = attr.ib(
        factory=lambda: Word2Vec(),
        type=Embedder,
        validator=attr.validators.instance_of(Embedder),  # type: ignore
    )

    walkers = attr.ib(
        factory=lambda: [RandomWalker(2)],  # type: ignore
        type=Sequence[Walker],
        validator=attr.validators.deep_iterable(
            member_validator=attr.validators.instance_of(
                Walker  # type: ignore
            ),
            iterable_validator=attr.validators.instance_of(list),
        ),
    )

    verbose = attr.ib(
        kw_only=True,
        default=0,
        type=int,
        validator=attr.validators.in_([0, 1, 2]),
    )

    _is_extract_walks_literals = attr.ib(
        init=False,
        default=False,
        type=bool,
        repr=False,
        validator=attr.validators.instance_of(bool),
    )

    _embeddings = attr.ib(init=False, type=Embeddings, factory=list)
    _entities = attr.ib(init=False, type=Entities, factory=list)
    _literals = attr.ib(init=False, type=Literals, factory=list)
    _walks = attr.ib(init=False, type=List[List[SWalk]], factory=list)

    _pos_entities = attr.ib(init=False, type=List[str], factory=list)
    _pos_walks = attr.ib(init=False, type=List[int], factory=list)

    def fit(
        self, walks: List[List[SWalk]], is_update: bool = False
    ) -> RDF2VecTransformer:
        """Fits the embedder on the provided walks.

        Args:
            walks: The walks to fit.
            is_update: True if the new corpus should be added to old model's
                corpus, False otherwise.
                Defaults to False.

        Returns:
            The RDF2VecTransformer.

        """
        if self.verbose == 2:
            print(self.embedder)

        tic = time.perf_counter()
        self.embedder.fit(walks, is_update)
        toc = time.perf_counter()

        if self.verbose >= 1:
            n_walks = sum(len(entity_walks) for entity_walks in walks)
            print(f"Fitted {n_walks} walks ({toc - tic:0.4f}s)")
            # When the stored walks differ in number from the fitted ones
            # (e.g. after several updates), also report the stored total.
            if len(self._walks) != len(walks):
                n_walks = sum(
                    len(entity_walks) for entity_walks in self._walks
                )
                print(
                    f"> {n_walks} walks extracted "
                    + f"for {len(self._entities)} entities."
                )
        return self

    def fit_transform(
        self, kg: KG, entities: Entities, is_update: bool = False
    ) -> Tuple[Embeddings, Literals]:
        """Creates a model and generates embeddings and literals for the
        provided entities.

        Args:
            kg: The Knowledge Graph.
            entities: The entities including test entities to create the
                embeddings. Since RDF2Vec is unsupervised, there is no label
                leakage.
            is_update: True if the new corpus should be added to old model's
                corpus, False otherwise.
                Defaults to False.

        Returns:
            The embeddings and the literals as returned by `transform`.

        """
        # Tells get_walks not to close the remote connector: transform still
        # needs it and closes it afterwards.
        self._is_extract_walks_literals = True
        self.fit(self.get_walks(kg, entities), is_update)
        return self.transform(kg, entities)

    def get_walks(self, kg: KG, entities: Entities) -> List[List[SWalk]]:
        """Gets the walks of the entities based on a Knowledge Graph and the
        walkers, and records entities and walks on the transformer.

        Args:
            kg: The Knowledge Graph.
            entities: The entities including test entities to create the
                embeddings. Since RDF2Vec is unsupervised, there is no label
                leakage.

        Returns:
            The extracted walks. Walks that replace stored walks of already
            known entities are moved into `_walks` and are no longer part of
            the returned list.

        Raises:
            ValueError: If the provided entities aren't in the Knowledge
                Graph.

        """
        if not kg._is_remote and not all(
            Vertex(entity) in kg._vertices for entity in entities
        ):
            raise ValueError(
                "The provided entities must be in the Knowledge Graph."
            )

        # Avoids duplicate entities for unnecessary walk extractions. The
        # caller's order is preserved (unlike with a set) so that the batch
        # positions recorded by _update are deterministic and line up with
        # the order in which transform receives the same entities.
        entities = list(dict.fromkeys(entities))

        if self.verbose == 2:
            print(kg)
            print(self.walkers[0])

        walks: List[List[SWalk]] = []
        tic = time.perf_counter()
        for walker in self.walkers:
            walks += walker.extract(kg, entities, self.verbose)
        toc = time.perf_counter()

        # Entities first: this records which of them are already known, and
        # the walks update relies on these positions.
        self._update(self._entities, entities)
        self._update(self._walks, walks)

        if self.verbose >= 1:
            n_walks = sum(len(entity_walks) for entity_walks in walks)
            print(
                f"Extracted {n_walks} walks "
                + f"for {len(entities)} entities ({toc - tic:0.4f}s)"
            )
        if (
            kg._is_remote
            and kg.mul_req
            and not self._is_extract_walks_literals
        ):
            asyncio.run(kg.connector.close())
        return walks

    def transform(
        self, kg: KG, entities: Entities
    ) -> Tuple[Embeddings, Literals]:
        """Transforms the provided entities into embeddings and literals.

        Args:
            kg: The Knowledge Graph.
            entities: The entities including test entities to create the
                embeddings. Since RDF2Vec is unsupervised, there is no label
                leakage.

        Returns:
            The embeddings and the literals of the provided entities. Items
            that replace stored items of already known entities are moved
            into `_embeddings` / `_literals` and are no longer part of the
            returned lists.

        """
        assert self.embedder is not None
        embeddings = self.embedder.transform(entities)

        tic = time.perf_counter()
        literals = kg.get_literals(entities, self.verbose)
        toc = time.perf_counter()

        # NOTE(review): the positions used here were recorded by the last
        # get_walks call on de-duplicated entities; they only match when
        # `entities` is that same batch without duplicates.
        self._update(self._embeddings, embeddings)
        if len(literals) > 0:
            self._update(self._literals, literals)

        if kg._is_remote and kg.mul_req:
            self._is_extract_walks_literals = False
            asyncio.run(kg.connector.close())

        if self.verbose >= 1 and len(literals) > 0:
            print(
                f"Extracted {len(literals)} literals for {len(entities)} "
                + f"entities ({toc - tic:0.4f}s)"
            )
        return embeddings, literals

    def save(self, filename: str = "transformer_data") -> None:
        """Saves a RDF2VecTransformer object.

        Args:
            filename: The binary file to save the RDF2VecTransformer object.

        """
        with open(filename, "wb") as f:
            pickle.dump(self, f)

    def _update(self, attr, values) -> None:
        """Updates an attribute with a variable.

        This method is useful to keep all entities, walks, literals and
        embeddings after several online training.

        When `values` holds entities (strings), unknown entities are appended
        to `attr`, and for each already known one its position in `attr` and
        its position in `values` are recorded in `_pos_entities` and
        `_pos_walks`. Those records are rebuilt on every such call.

        Otherwise, `values` holds per-entity items (walks, embeddings or
        literals): the items of already known entities overwrite the stored
        ones, are removed from `values` **in place**, and the remaining items
        are appended to `attr`.

        Args:
            attr: The attribute to update
            values: The new values to add.

        """
        # Nothing to merge into a missing target or from an empty batch
        # (indexing values[0] below would otherwise raise IndexError).
        if attr is None or len(values) == 0:
            return

        if isinstance(values[0], str):
            # A new batch of entities starts a new update: positions recorded
            # for a previous batch must not be applied again.
            self._pos_entities.clear()
            self._pos_walks.clear()

            # First position of every stored entity, for constant-time
            # lookups instead of scanning the list for each entity.
            first_index = {}
            for pos, known in enumerate(attr):
                first_index.setdefault(known, pos)

            for i, entity in enumerate(values):
                pos = first_index.get(entity)
                if pos is None:
                    first_index[entity] = len(attr)
                    attr.append(entity)
                else:
                    self._pos_entities.append(pos)
                    self._pos_walks.append(i)
            return

        # Overwrite the stored items of already known entities first, while
        # the batch indexes are still valid ...
        for pos, i in zip(self._pos_entities, self._pos_walks):
            attr[pos] = values[i]
        # ... then drop them from the batch, highest index first so that the
        # remaining indexes are not shifted by earlier removals.
        for i in sorted(set(self._pos_walks), reverse=True):
            del values[i]
        attr += values

    @staticmethod
    def load(filename: str = "transformer_data") -> RDF2VecTransformer:
        """Loads a RDF2VecTransformer object.

        Args:
            filename: The binary file to load the RDF2VecTransformer object.

        Returns:
            The loaded RDF2VecTransformer.

        Raises:
            ValueError: If the loaded object isn't a RDF2VecTransformer.

        """
        # NOTE: unpickling can execute arbitrary code; only load files that
        # come from a trusted source. The type check below happens after the
        # file has already been unpickled.
        with open(filename, "rb") as f:
            transformer = pickle.load(f)
            if not isinstance(transformer, RDF2VecTransformer):
                raise ValueError(
                    "Failed to load the RDF2VecTransformer object"
                )
            return transformer