def create_embeddings(
    kg: KG,
    entities: List[rdflib.URIRef],
    split: int,
    walkers: Sequence[Walker],
    sg: int = 1,
) -> Tuple[List[str], List[str]]:
    """Creates embeddings for a list of entities according to a knowledge
    graph and a walking strategy.

    Args:
        kg: The knowledge graph. The graph from which the neighborhoods
            are extracted for the provided instances.
        entities: The train and test instances to create the embedding.
        split: Split value for train and test embeddings.
        walkers: The list of walker strategies.
        sg: The training algorithm. 1 for skip-gram; otherwise CBOW.
            Defaults to 1.

    Returns:
        The embeddings of the provided instances, as the
        (train, test) pair obtained by cutting at ``split``.

    """
    transformer = RDF2VecTransformer(Word2Vec(sg=sg), walkers=walkers)
    # fit_transform returns one embedding per entity, in input order.
    walk_embeddings = transformer.fit_transform(kg, entities)
    return walk_embeddings[:split], walk_embeddings[split:]
def create_embeddings(kg, entities, split, walker=WALKER, sg=1):
    """Creates embeddings for a list of entities according to a knowledge
    graph and a walking strategy, appending each embedding to two CSV
    files (keyed by entity URI and by row index respectively).

    Args:
        kg (graph.KnowledgeGraph): The knowledge graph. The graph from which
            the neighborhoods are extracted for the provided instances.
        entities (array-like): The train and test instances to create the
            embedding.
        split (int): Split value for train and test embeddings.
        walker (walkers.Walker): The walking strategy.
            Defaults to the module-level WALKER.
        sg (int): The training algorithm. 1 for skip-gram; otherwise CBOW.
            Defaults to 1.

    Returns:
        array-like: The (train, test) embeddings of the provided instances.

    """
    transformer = RDF2VecTransformer(sg=sg, walkers=walker)
    walk_embeddings = transformer.fit_transform(kg, entities)

    # Persist the embeddings. Context managers guarantee both files are
    # flushed and closed even if a write fails (the original leaked both
    # handles by never closing them).
    with open("embeddingsR2V.csv", "a", encoding="utf-8") as embedding, open(
        "embeddingsR2V_ID.csv", "a", encoding="utf-8"
    ) as embedding_id:
        for i, row in enumerate(walk_embeddings):
            values = ",".join(str(item) for item in row)
            embedding.write(f"{entities[i]},{values}\n")
            embedding_id.write(f"{i},{values}\n")

    # Cut at the provided split index. The original referenced an undefined
    # ``train_entities`` name here, raising a NameError at runtime.
    return walk_embeddings[:split], walk_embeddings[split:]
def create_embeddings(kg, entities, split, walkers, sg=1):
    """Creates embeddings for a list of entities according to a knowledge
    graph and a walking strategy.

    Args:
        kg (graph.KnowledgeGraph): The knowledge graph. The graph from which
            the neighborhoods are extracted for the provided instances.
        entities (array-like): The train and test instances to create the
            embedding.
        split (int): Split value for train and test embeddings.
        walkers (array-like): The walking strategies to use.
        sg (int): The training algorithm. 1 for skip-gram; otherwise CBOW.
            Defaults to 1.

    Returns:
        array-like: The (train, test) embeddings of the provided instances.

    """
    transformer = RDF2VecTransformer(walkers=walkers, sg=sg)
    # One embedding per entity, in input order; split into train and test.
    walk_embeddings = transformer.fit_transform(kg, entities)
    return walk_embeddings[:split], walk_embeddings[split:]
from pyrdf2vec import RDF2VecTransformer from pyrdf2vec.embedders import Word2Vec from pyrdf2vec.graphs import KG from pyrdf2vec.walkers import RandomWalker # Ensure the determinism of this script by initializing a pseudo-random number. RANDOM_STATE = 22 data = pd.read_csv("samples/countries-cities/entities.tsv", sep="\t") transformer = RDF2VecTransformer( # Use one worker threads for Word2Vec to ensure random determinism. # Must be used with PYTHONHASHSEED. Word2Vec(workers=1), # Extract a maximum of 10 walks of a maximum depth of 4 for each entity # using two processes and use a random state to ensure that the same walks # are generated for the entities. walkers=[RandomWalker(4, 10, n_jobs=2, random_state=RANDOM_STATE)], verbose=1, ) # Train and save the Word2Vec model according to the KG, the entities, and # a walking strategy. embeddings, _ = transformer.fit_transform( # Defined the DBpedia endpoint server, as well as a set of predicates to # exclude from this KG. KG( "https://dbpedia.org/sparql", skip_predicates={"www.w3.org/1999/02/22-rdf-syntax-ns#type"}, literals=[ [
    # Tail of the ``samplers`` list; its opening bracket and the earlier
    # entries are defined above this chunk.
    ("Inverse PageRank Split", PageRankSampler(inverse=True, split=True)),
]

print(f"Prediction of {len(test_entities)} entities:")
# Train one transformer per sampling strategy so their downstream accuracy
# can be compared on identical entities.
for _, sampler in samplers:
    embeddings, _ = RDF2VecTransformer(  # type:ignore
        # Use one worker threads for Word2Vec to ensure random determinism.
        # Must be used with PYTHONHASHSEED.
        Word2Vec(workers=1),
        # Extract a maximum of 100 walks of a maximum depth of 4 for each
        # entity using two processes and use a random state to ensure that the
        # same walks are generated for the entities.
        walkers=[
            RandomWalker(4, 100, sampler, n_jobs=2, random_state=RANDOM_STATE)
        ],
    ).fit_transform(
        KG(
            "samples/mutag/mutag.owl",
            # The target predicate is excluded so labels cannot leak into
            # the walks.
            skip_predicates={
                "http://dl-learner.org/carcinogenesis#isMutagenic"
            },
        ),
        entities,
    )
    # Entities were passed train-first, so slice by the train length.
    train_embeddings = embeddings[:len(train_entities)]
    test_embeddings = embeddings[len(train_entities):]

    # Fit a Support Vector Machine on train embeddings and pick the best
    # C-parameters (regularization strength).
# Missing in the original: RDF2VecTransformer and Word2Vec are used below
# but were never imported, which raised a NameError at runtime.
import logging

import numpy as np

from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.samplers import UniformSampler
from pyrdf2vec.walkers import RandomWalker, Walker

logging.basicConfig(
    filename="rdf2vec.log",
    level=logging.INFO,
    format=(
        "%(asctime)s.%(msecs)03d %(levelname)s %(module)s - "
        "%(funcName)s: %(message)s"
    ),
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Define the label predicates, all triples with these predicates
# will be excluded from the graph
logging.info("Read in knowledge graph.")
label_predicates = []
kg = KG(location="data/dbp_graph.ttl", label_predicates=label_predicates)

logging.info("Create walkers and transformers.")
# Random walks of depth 4, 5 walks per entity, sampled uniformly.
walkers = [RandomWalker(4, 5, UniformSampler())]
transformer = RDF2VecTransformer(Word2Vec(sg=1), walkers=walkers)

logging.info("Read in entities.")
# Entities should be a list of URIs that can be found in the Knowledge Graph
entities = list(np.load("data/entities.npy", allow_pickle=True))

logging.info("Calculate embeddings.")
embeddings = transformer.fit_transform(kg, entities)

logging.info("Write embeddings to disk.")
np.save("data/embeddings.npy", embeddings)

logging.info("Finished job.")
test_labels = list(test_data["label_mutagenic"])

# Fit a single transformer over train + test entities at once; the combined
# lists keep train entries first so the result can be split by length below.
entities = train_entities + test_entities
labels = train_labels + test_labels

embeddings, literals = RDF2VecTransformer(
    # Ensure random determinism for Word2Vec.
    # Must be used with PYTHONHASHSEED.
    Word2Vec(workers=1, epochs=10),
    # Extract all walks with a maximum depth of 2 for each entity using two
    # processes and use a random state to ensure that the same walks are
    # generated for the entities.
    walkers=[RandomWalker(2, None, n_jobs=2, random_state=RANDOM_STATE)],
    verbose=1,
).fit_transform(
    KG(
        "samples/mutag/mutag.owl",
        # Exclude the target predicate so labels cannot leak into the walks.
        skip_predicates={"http://dl-learner.org/carcinogenesis#isMutagenic"},
        # Literal chain: follow hasAtom, then collect the charge values.
        literals=[
            [
                "http://dl-learner.org/carcinogenesis#hasAtom",
                "http://dl-learner.org/carcinogenesis#charge",
            ],
        ],
    ),
    entities,
)

# Entities were passed train-first, so slice by the train length.
train_embeddings = embeddings[: len(train_entities)]
test_embeddings = embeddings[len(train_entities) :]

print("\nWithout using literals:")
test_labels = list(test_data["label_mutagenic"])

# Fit a single transformer over train + test entities at once; the combined
# lists keep train entries first so the result can be split by length below.
entities = train_entities + test_entities
labels = train_labels + test_labels

# Defines the MUTAG KG with the predicates to be skipped.
kg = KG(
    "samples/mutag/mutag.owl",
    skip_predicates={"http://dl-learner.org/carcinogenesis#isMutagenic"},
)

transformer = RDF2VecTransformer(
    # Ensure random determinism for Word2Vec.
    # Must be used with PYTHONHASHSEED.
    Word2Vec(workers=1),
    # Extract all walks with a maximum depth of 2 for each entity by using two
    # processes and a random state to ensure that the same walks are generated
    # for the entities.
    walkers=[RandomWalker(2, None, n_jobs=2, random_state=RANDOM_STATE)],
    verbose=1,
)
embeddings, _ = transformer.fit_transform(kg, entities)
# Persist the fitted transformer (model + walks) for later reuse.
transformer.save("mutag")

# Entities were passed train-first, so slice by the train length.
train_embeddings = embeddings[:len(train_entities)]
test_embeddings = embeddings[len(train_entities):]

# Fit a Support Vector Machine on train embeddings and pick the best
# C-parameters (regularization strength).
clf = GridSearchCV(
    SVC(random_state=RANDOM_STATE), {"C": [10**i for i in range(-3, 4)]}
)
    # Tail of the ``samplers`` list; its opening bracket and the earlier
    # entries are defined above this chunk. Each entry pairs a display name
    # with a configured sampling strategy.
    ("Inverse Object Frequency", ObjFreqSampler(inverse=True)),
    (
        "Inverse Object Frequency Split",
        ObjFreqSampler(inverse=True, split=True),
    ),
    ("Predicate Frequency", PredFreqSampler()),
    ("Inverse Predicate Frequency", PredFreqSampler(inverse=True)),
    ("Predicate + Object Frequency", ObjPredFreqSampler()),
    ("Inverse Predicate + Object Frequency", ObjPredFreqSampler(inverse=True)),
    ("PageRank", PageRankSampler()),
    ("Inverse PageRank", PageRankSampler(inverse=True)),
    ("PageRank Split", PageRankSampler(split=True)),
    ("Inverse PageRank Split", PageRankSampler(inverse=True, split=True)),
]

# Benchmark every sampling strategy with the same walk configuration.
for name, sampler in samplers:
    # Create embeddings with random walks
    transformer = RDF2VecTransformer(walkers=[RandomWalker(4, 100, sampler)])
    walk_embeddings = transformer.fit_transform(kg, entities, verbose=True)

    # Split into train and test embeddings
    train_embeddings = walk_embeddings[:len(train_entities)]
    test_embeddings = walk_embeddings[len(train_entities):]

    # Fit a support vector machine on train embeddings and evaluate on test
    clf = SVC(random_state=42)
    clf.fit(train_embeddings, train_labels)

    print(end=f"[{name}] Support Vector Machine: Accuracy = ")
    print(accuracy_score(test_labels, clf.predict(test_embeddings)))
# Fit over all entities in one pass; ``embeddings`` holds one vector per
# entity and ``literals`` the extracted literal values per entity.
embeddings, literals = RDF2VecTransformer(
    # Ensure random determinism for Word2Vec.
    # Must be used with PYTHONHASHSEED.
    Word2Vec(workers=1, epochs=10),
    # Extract all walks with a maximum depth of 2 for each entity using two
    # processes and use a random state to ensure that the same walks are
    # generated for the entities without hashing as MUTAG is a short KG.
    walkers=[
        HALKWalker(
            2,
            None,
            n_jobs=2,
            sampler=WideSampler(),
            random_state=RANDOM_STATE,
            # md5_bytes=None disables hashing of the walk vertices.
            md5_bytes=None,
        )
    ],
    verbose=1,
).fit_transform(
    KG(
        "samples/mutag/mutag.owl",
        # Exclude the target predicate so labels cannot leak into the walks.
        skip_predicates={"http://dl-learner.org/carcinogenesis#isMutagenic"},
        # Literal chains: each inner list is a predicate path whose terminal
        # literal values are collected per entity.
        literals=[
            [
                "http://dl-learner.org/carcinogenesis#hasAtom",
                "http://dl-learner.org/carcinogenesis#charge",
            ],
            ["http://dl-learner.org/carcinogenesis#salmonella"],
            ["http://dl-learner.org/carcinogenesis#cytogen_sce"],
            ["http://dl-learner.org/carcinogenesis#cytogen_ca"],
            ["http://dl-learner.org/carcinogenesis#mouse_lymph"],
            ["http://dl-learner.org/carcinogenesis#amesTestPositive"],
        ],
    ),
    entities,
)