Esempio n. 1
0
def check_sampler(Sampler):
    """Check the walks a `RandomWalker` extracts with the given sampler.

    The walker is built with a depth of 2 and 5 walks per graph; the result
    of `extract` must be a plain `set` whose size does not exceed the number
    of entities multiplied by the number of walks per graph.

    Args:
        Sampler: The sampler class, instantiated here without arguments.

    """
    walks_per_graph = 5
    walker = RandomWalker(2, walks_per_graph, Sampler())
    canonical_walks = walker.extract(KNOWLEDGE_GRAPH, ENTITIES_SUBSET)

    assert type(canonical_walks) is set
    # Upper bound: `walks_per_graph` walks for every entity of the subset.
    assert len(canonical_walks) <= len(ENTITIES_SUBSET * walks_per_graph)
Esempio n. 2
0
 def test_load_save_transformer(self):
     """Round-trip a transformer through `save` and `load`.

     The transformer is saved under the default file name, loaded back, and
     its two walkers are checked for count and type.

     The saved file is removed in a `finally` clause so that a failing
     assertion does not leave it behind for subsequent tests.
     """
     try:
         RDF2VecTransformer(
             walkers=[RandomWalker(2, None),
                      WeisfeilerLehmanWalker(2, 2)]).save()
         transformer = RDF2VecTransformer.load()
         assert len(transformer.walkers) == 2
         assert isinstance(transformer.walkers[0], RandomWalker)
         assert isinstance(transformer.walkers[1], WeisfeilerLehmanWalker)
     finally:
         # Guarded so that a failure inside `save` (no file written yet) is
         # not masked by a FileNotFoundError raised during cleanup.
         if os.path.exists("transformer_data"):
             os.remove("transformer_data")
Esempio n. 3
0
 def test_dfs(self, setup, kg, root, max_depth, max_walks, is_reverse):
     """Check the length and the root position of every walk from `_dfs`.

     Each walk may hold at most `(max_depth * 2) + 1` elements. The root
     vertex is expected at the end of the walk for a reversed search and at
     the start of the walk otherwise.
     """
     root = f"{URL}#{root}"
     walker = RandomWalker(max_depth, max_walks, random_state=42)
     # Index at which the root vertex is expected inside a walk.
     root_idx = -1 if is_reverse else 0
     for walk in walker._dfs(kg, Vertex(root), is_reverse):
         assert len(walk) <= (max_depth * 2) + 1
         assert walk[root_idx].name == root
Esempio n. 4
0
    def __init__(
        self,
        embedder: Optional[Embedder] = None,
        walkers: Optional[Sequence[Walker]] = None,
    ):
        """Store the embedder and the walkers, falling back to defaults.

        Args:
            embedder: The embedding technique.
                Defaults to a new `Word2Vec()` when None.
            walkers: The walking strategies.
                Defaults to `[RandomWalker(2, None)]` when None.

        """
        self.embedder = Word2Vec() if embedder is None else embedder
        self.walkers = [RandomWalker(2, None)] if walkers is None else walkers
        self.walks_: List[rdflib.URIRef] = []
Esempio n. 5
0
 def __init__(
     self,
     vector_size=500,
     walkers=None,
     n_jobs=1,
     window=5,
     sg=1,
     max_iter=10,
     negative=25,
     min_count=1,
 ):
     """Store the hyper-parameters and the walkers on the instance.

     Every argument is kept unchanged under an attribute of the same name.

     Args:
         vector_size: Stored as `self.vector_size`.
         walkers: The walking strategies. When None, a fresh
             `[RandomWalker(2, float("inf"))]` list is created for this
             instance.
         n_jobs: Stored as `self.n_jobs`.
         window: Stored as `self.window`.
         sg: Stored as `self.sg`.
         max_iter: Stored as `self.max_iter`.
         negative: Stored as `self.negative`.
         min_count: Stored as `self.min_count`.

     """
     # A list literal used as a default value is evaluated once, at function
     # definition time, and would then be shared (and mutated) across all
     # instances. Build the default per call instead.
     if walkers is None:
         walkers = [RandomWalker(2, float("inf"))]

     self.max_iter = max_iter
     self.min_count = min_count
     self.n_jobs = n_jobs
     self.negative = negative
     self.sg = sg
     self.vector_size = vector_size
     self.walkers = walkers
     self.window = window
Esempio n. 6
0
 def test_extract(
     self, setup, kg, root, max_depth, max_walks, with_reverse
 ):
     """Check the number, the length and the content of extracted walks.

     When `max_walks` is set, the number of walks is bounded by `max_walks`
     (or by `max_walks * max_walks` with reverse walks). Every second
     element from index 2 onwards must start with "b'". Without reverse
     walks, each walk starts at the root and holds at most
     `(max_depth * 2) + 1` elements; with reverse walks that bound doubles.
     """
     root = f"{URL}#{root}"
     walker = RandomWalker(
         max_depth, max_walks, with_reverse=with_reverse, random_state=42
     )
     walks = walker._extract(kg, Vertex(root))[root]

     if max_walks is not None:
         walks_limit = max_walks * max_walks if with_reverse else max_walks
         assert len(walks) <= walks_limit

     for walk in walks:
         for obj in walk[2::2]:
             assert obj.startswith("b'")
         if with_reverse:
             assert len(walk) <= ((max_depth * 2) + 1) * 2
         else:
             assert walk[0] == root
             assert len(walk) <= (max_depth * 2) + 1
Esempio n. 7
0
from sklearn.svm import SVC

from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.samplers import UniformSampler
from pyrdf2vec.walkers import RandomWalker, Walker

# Per split: a TSV path followed by two names, presumably the entity column
# and the label column of that file — TODO confirm against the loading code.
DATASET = {
    "test": ["samples/mutag/test.tsv", "bond", "label_mutagenic"],
    "train": ["samples/mutag/train.tsv", "bond", "label_mutagenic"],
}
LABEL_PREDICATES = {"http://dl-learner.org/carcinogenesis#isMutagenic"}
OUTPUT = "samples/mutag/mutag.owl"
# A single random walker using a uniform sampler.
# NOTE(review): the arguments are (2, None); the first is presumably the
# maximum depth and None presumably means "no limit on the number of walks",
# so the previous wording "depth 4 (2 hops)" does not match — confirm.
WALKERS = [RandomWalker(2, None, UniformSampler(inverse=False))]

PLOT_SAVE = "embeddings.png"
PLOT_TITLE = "pyRDF2Vec"

# Silence every warning emitted from here on.
warnings.filterwarnings("ignore")

# Seed NumPy's and Python's global random generators for reproducibility.
np.random.seed(42)
random.seed(42)


def create_embeddings(
    kg: KG,
    entities: List[rdflib.URIRef],
    split: int,
    walkers: Sequence[Walker],
Esempio n. 8
0
from pyrdf2vec.graphs import KG
from pyrdf2vec.samplers import UniformSampler
from pyrdf2vec.walkers import RandomWalker, Walker
import numpy as np
import logging

# Log to the file "rdf2vec.log" at INFO level; each record carries a
# timestamp with milliseconds, the level, the module and the function name.
logging.basicConfig(
    filename="rdf2vec.log",
    level=logging.INFO,
    format=
    '%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')

# Define the label predicates, all triples with these predicates
# will be excluded from the graph
logging.info("Read in knowledge graph.")
# Empty list: no predicate is excluded here.
label_predicates = []
kg = KG(location="data/dbp_graph.ttl", label_predicates=label_predicates)

logging.info("Create walkers and transformers.")
# One random walker with a uniform sampler; the arguments (4, 5) are
# presumably (maximum depth, maximum number of walks) — TODO confirm.
walkers = [RandomWalker(4, 5, UniformSampler())]
transformer = RDF2VecTransformer(Word2Vec(sg=1), walkers=walkers)

logging.info("Read in entities.")
# Entities should be a list of URIs that can be found in the Knowledge Graph
# NOTE: allow_pickle=True lets np.load unpickle object arrays; only load
# files from a trusted source.
entities = list(np.load("data/entities.npy", allow_pickle=True))
logging.info("Calculate embeddings.")
embeddings = transformer.fit_transform(kg, entities)
logging.info("Write embeddings to disk.")
np.save("data/embeddings.npy", embeddings)
logging.info("Finished job.")
Esempio n. 9
0
train_labels = list(train_data["label_mutagenic"])

test_entities = [entity for entity in test_data["bond"]]
test_labels = list(test_data["label_mutagenic"])

# Train entries come first, so a slice at len(train_entities) separates the
# two splits again.
entities = train_entities + test_entities
labels = train_labels + test_labels

# fit_transform returns a pair, unpacked here as (embeddings, literals).
embeddings, literals = RDF2VecTransformer(
    # Ensure random determinism for Word2Vec.
    # Must be used with PYTHONHASHSEED.
    Word2Vec(workers=1, epochs=10),
    # Extract all walks with a maximum depth of 2 for each entity using two
    # processes and use a random state to ensure that the same walks are
    # generated for the entities.
    walkers=[RandomWalker(2, None, n_jobs=2, random_state=RANDOM_STATE)],
    verbose=1,
).fit_transform(
    KG(
        "samples/mutag/mutag.owl",
        # The label predicate is skipped in the graph.
        skip_predicates={"http://dl-learner.org/carcinogenesis#isMutagenic"},
        # One chain of predicates (hasAtom, then charge); presumably followed
        # in order from each entity to reach the literal — TODO confirm.
        literals=[
            [
                "http://dl-learner.org/carcinogenesis#hasAtom",
                "http://dl-learner.org/carcinogenesis#charge",
            ],
        ],
    ),
    entities,
)
Esempio n. 10
0
    ("PageRank Split", PageRankSampler(split=True)),
    ("Inverse PageRank Split", PageRankSampler(inverse=True, split=True)),
]

print(f"Prediction of {len(test_entities)} entities:")

for _, sampler in samplers:
    embeddings, _ = RDF2VecTransformer(  # type:ignore
        # Use one worker threads for Word2Vec to ensure random determinism.
        # Must be used with PYTHONHASHSEED.
        Word2Vec(workers=1),
        # Extract a maximum of 100 walks of a maximum depth of 4 for each
        # entity using two processes and use a random state to ensure that the
        # same walks are generated for the entities.
        walkers=[
            RandomWalker(4, 100, sampler, n_jobs=2, random_state=RANDOM_STATE)
        ],
    ).fit_transform(
        KG(
            "samples/mutag/mutag.owl",
            skip_predicates={
                "http://dl-learner.org/carcinogenesis#isMutagenic"
            },
        ),
        entities,
    )

    train_embeddings = embeddings[:len(train_entities)]
    test_embeddings = embeddings[len(train_entities):]

    # Fit a Support Vector Machine on train embeddings and pick the best
Esempio n. 11
0
from sklearn.svm import SVC

from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker, Walker

# Per split: a TSV path followed by two names, presumably the entity column
# and the label column of that file — TODO confirm against the loading code.
DATASET = {
    "test": ["samples/mutag/test.tsv", "bond", "label_mutagenic"],
    "train": ["samples/mutag/train.tsv", "bond", "label_mutagenic"],
}
LABEL_PREDICATES = {"http://dl-learner.org/carcinogenesis#isMutagenic"}
OUTPUT = "samples/mutag/mutag.owl"

# We'll extract all possible walks of depth 2 with 4 processes.
WALKERS = [RandomWalker(2, None, n_jobs=4)]
# We'll extract all possible walks of depth 2 (without multi-processing)
# WALKERS = [RandomWalker(2, None)]

PLOT_SAVE = "embeddings.png"
PLOT_TITLE = "pyRDF2Vec"

# Silence every warning emitted from here on.
warnings.filterwarnings("ignore")

# Seed NumPy's and Python's global random generators for reproducibility.
np.random.seed(42)
random.seed(42)


def create_embeddings(
    kg: KG,
    entities: List[rdflib.URIRef],
Esempio n. 12
0
    ("Inverse Object Frequency", ObjFreqSampler(inverse=True)),
    (
        "Inverse Object Frequency Split",
        ObjFreqSampler(inverse=True, split=True),
    ),
    ("Predicate Frequency", PredFreqSampler()),
    ("Inverse Predicate Frequency", PredFreqSampler(inverse=True)),
    ("Predicate + Object Frequency", ObjPredFreqSampler()),
    ("Inverse Predicate + Object Frequency", ObjPredFreqSampler(inverse=True)),
    ("PageRank", PageRankSampler()),
    ("Inverse PageRank", PageRankSampler(inverse=True)),
    ("PageRank Split", PageRankSampler(split=True)),
    ("Inverse PageRank Split", PageRankSampler(inverse=True, split=True)),
]

# Train and evaluate one model per (display name, sampler) pair.
for name, sampler in samplers:
    # Create embeddings with random walks
    transformer = RDF2VecTransformer(walkers=[RandomWalker(4, 100, sampler)])
    walk_embeddings = transformer.fit_transform(kg, entities, verbose=True)

    # Split into train and test embeddings
    # (assumes the train entities come first in `entities` — TODO confirm).
    train_embeddings = walk_embeddings[:len(train_entities)]
    test_embeddings = walk_embeddings[len(train_entities):]

    # Fit a support vector machine on train embeddings and evaluate on test
    clf = SVC(random_state=42)
    clf.fit(train_embeddings, train_labels)

    # `print(end=...)` writes the prefix without a trailing newline, so the
    # accuracy printed next lands on the same line.
    print(end=f"[{name}] Support Vector Machine: Accuracy = ")
    print(accuracy_score(test_labels, clf.predict(test_embeddings)))
Esempio n. 13
0
class RDF2VecTransformer:
    """Transforms nodes in a Knowledge Graph into an embedding.

    Attributes:
        _embeddings: All the embeddings of the model.
            Defaults to [].
        _entities: All the entities of the model.
            Defaults to [].
        _is_extract_walks_literals: True if the session must be closed after
            the call to the `transform` function. False, otherwise.
            Defaults to False.
        _literals: All the literals of the model.
            Defaults to [].
        _pos_entities: The positions of existing entities to be updated.
            Defaults to [].
        _pos_walks: The positions of existing walks to be updated.
            Defaults to [].
        _walks: All the walks of the model.
            Defaults to [].
        embedder: The embedding technique.
            Defaults to Word2Vec.
        walkers: The walking strategies.
            Defaults to [RandomWalker(2, None)]
        verbose: The verbosity level.
            0: does not display anything;
            1: display of the progress of extraction and training of walks;
            2: debugging.
            Defaults to 0.

    """

    embedder = attr.ib(
        factory=lambda: Word2Vec(),
        type=Embedder,
        validator=attr.validators.instance_of(Embedder),  # type: ignore
    )

    walkers = attr.ib(
        factory=lambda: [RandomWalker(2)],  # type: ignore
        type=Sequence[Walker],
        validator=attr.validators.deep_iterable(
            member_validator=attr.validators.instance_of(
                Walker  # type: ignore
            ),
            iterable_validator=attr.validators.instance_of(list),
        ),
    )

    verbose = attr.ib(
        kw_only=True,
        default=0,
        type=int,
        validator=attr.validators.in_([0, 1, 2]),
    )

    _is_extract_walks_literals = attr.ib(
        init=False,
        default=False,
        type=bool,
        repr=False,
        validator=attr.validators.instance_of(bool),
    )

    _embeddings = attr.ib(init=False, type=Embeddings, factory=list)
    _entities = attr.ib(init=False, type=Entities, factory=list)
    _literals = attr.ib(init=False, type=Literals, factory=list)
    _walks = attr.ib(init=False, type=List[List[SWalk]], factory=list)

    _pos_entities = attr.ib(init=False, type=List[str], factory=list)
    _pos_walks = attr.ib(init=False, type=List[int], factory=list)

    def fit(self,
            walks: List[List[SWalk]],
            is_update: bool = False) -> RDF2VecTransformer:
        """Fits the embeddings based on the provided entities.

        Args:
            walks: The walks to fit.
            is_update: True if the new corpus should be added to old model's
                corpus, False otherwise.
                Defaults to False.

        Returns:
            The RDF2VecTransformer.

        """
        if self.verbose == 2:
            print(self.embedder)

        tic = time.perf_counter()
        self.embedder.fit(walks, is_update)
        toc = time.perf_counter()

        if self.verbose >= 1:
            n_walks = sum(len(entity_walks) for entity_walks in walks)
            print(f"Fitted {n_walks} walks ({toc - tic:0.4f}s)")
            # Also report the accumulated total when it differs from the
            # batch that has just been fitted.
            if len(self._walks) != len(walks):
                n_walks = sum(
                    len(entity_walks) for entity_walks in self._walks)
                print(f"> {n_walks} walks extracted " +
                      f"for {len(self._entities)} entities.")
        return self

    def fit_transform(self,
                      kg: KG,
                      entities: Entities,
                      is_update: bool = False) -> Tuple[Embeddings, Literals]:
        """Creates a model and generates embeddings and literals for the
        provided entities.

        Args:
            kg: The Knowledge Graph.
            entities: The entities including test entities to create the
                embeddings. Since RDF2Vec is unsupervised, there is no label
                leakage.
            is_update: True if the new corpus should be added to old model's
                corpus, False otherwise.
                Defaults to False.

        Returns:
            The embeddings and the literals of the provided entities.

        """
        # Tells `get_walks` not to close the remote connector: `transform`
        # still needs it and closes it afterwards.
        self._is_extract_walks_literals = True
        self.fit(self.get_walks(kg, entities), is_update)
        return self.transform(kg, entities)

    def get_walks(self, kg: KG, entities: Entities) -> List[List[SWalk]]:
        """Gets the walks of an entity based on a Knowledge Graph and a
        list of walkers

        Args:
            kg: The Knowledge Graph.
            entities: The entities including test entities to create the
                embeddings. Since RDF2Vec is unsupervised, there is no label
                leakage.

        Returns:
            The walks for the given entities.

        Raises:
            ValueError: If the provided entities aren't in the Knowledge Graph.

        """
        # Membership can only be checked locally; remote graphs are skipped.
        if not kg._is_remote and not all(
                Vertex(entity) in kg._vertices for entity in entities):
            raise ValueError(
                "The provided entities must be in the Knowledge Graph.")

        # Avoids duplicate entities for unnecessary walk extractions.
        entities = list(set(entities))

        if self.verbose == 2:
            print(kg)
            print(self.walkers[0])

        walks: List[List[SWalk]] = []
        tic = time.perf_counter()
        for walker in self.walkers:
            walks += walker.extract(kg, entities, self.verbose)
        toc = time.perf_counter()

        # Entities must be registered first: this records which of them
        # already exist, which the walks update then relies on.
        self._update(self._entities, entities)
        self._update(self._walks, walks)

        if self.verbose >= 1:
            n_walks = sum(len(entity_walks) for entity_walks in walks)
            print(f"Extracted {n_walks} walks " +
                  f"for {len(entities)} entities ({toc - tic:0.4f}s)")
        if (kg._is_remote and kg.mul_req
                and not self._is_extract_walks_literals):
            asyncio.run(kg.connector.close())
        return walks

    def transform(self, kg: KG,
                  entities: Entities) -> Tuple[Embeddings, Literals]:
        """Transforms the provided entities into embeddings and literals.

        Args:
            kg: The Knowledge Graph.
            entities: The entities including test entities to create the
                embeddings. Since RDF2Vec is unsupervised, there is no label
                leakage.

        Returns:
            The embeddings and the literals of the provided entities.

        """
        assert self.embedder is not None
        embeddings = self.embedder.transform(entities)

        tic = time.perf_counter()
        literals = kg.get_literals(entities, self.verbose)
        toc = time.perf_counter()

        self._update(self._embeddings, embeddings)
        if len(literals) > 0:
            self._update(self._literals, literals)

        if kg._is_remote and kg.mul_req:
            self._is_extract_walks_literals = False
            asyncio.run(kg.connector.close())

        if self.verbose >= 1 and len(literals) > 0:
            print(f"Extracted {len(literals)} literals for {len(entities)} " +
                  f"entities ({toc - tic:0.4f}s)")
        return embeddings, literals

    def save(self, filename: str = "transformer_data") -> None:
        """Saves a RDF2VecTransformer object.

        Args:
            filename: The binary file to save the RDF2VecTransformer object.

        """
        with open(filename, "wb") as f:
            pickle.dump(self, f)

    def _update(self, attr, values) -> None:
        """Updates an attribute with new values.

        This method is useful to keep all entities, walks, literals and
        embeddings after several online training.

        When `values` holds strings they are treated as entities: unknown
        ones are appended to `attr`, while for known ones the position in
        `attr` and the position in `values` are recorded. Any other kind of
        values (walks, embeddings, literals) reuses these recorded positions:
        the recorded slots of `attr` are overwritten and the remaining
        values are appended.

        `values` is never modified, so callers can keep using the list they
        passed in.

        Args:
            attr: The attribute to update (modified in place).
            values: The new values to add.

        """
        # NOTE: inside this method the `attr` parameter shadows the `attr`
        # module; the module is not needed here.
        if attr is None or len(values) == 0:
            # Nothing to update; also avoids indexing an empty sequence.
            return

        if isinstance(values[0], str):
            # A new batch of entities starts: drop the positions recorded
            # for the previous batch so they cannot be applied again.
            self._pos_entities.clear()
            self._pos_walks.clear()
            for i, entity in enumerate(values):
                if entity not in attr:
                    attr.append(entity)
                else:
                    self._pos_entities.append(attr.index(entity))
                    self._pos_walks.append(i)
        else:
            # Index `values` directly instead of popping from it: popping
            # would shift the later recorded indices and would shrink the
            # list still used (and returned) by the caller.
            replaced = set(self._pos_walks)
            for pos, idx in zip(self._pos_entities, self._pos_walks):
                attr[pos] = values[idx]
            attr += [
                value for i, value in enumerate(values) if i not in replaced
            ]

    @staticmethod
    def load(filename: str = "transformer_data") -> RDF2VecTransformer:
        """Loads a RDF2VecTransformer object.

        Args:
            filename: The binary file to load the RDF2VecTransformer object.

        Returns:
            The loaded RDF2VecTransformer.

        Raises:
            ValueError: If the file does not contain a RDF2VecTransformer.

        """
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that come from a trusted source.
        with open(filename, "rb") as f:
            transformer = pickle.load(f)
            if not isinstance(transformer, RDF2VecTransformer):
                raise ValueError(
                    "Failed to load the RDF2VecTransformer object")
            return transformer
Esempio n. 14
0
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC

from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.samplers import UniformSampler
from pyrdf2vec.walkers import RandomWalker, Walker


# Only a "train" split is declared: a CSV path followed by one name,
# presumably the entity column of that file — TODO confirm.
DATASET = {
    "train": ["samples/products.csv", "product"],
}
LABEL_PREDICATES = ["http://dice-researcher.com/grocery-recommendation/recommendation#list"]
OUTPUT = "samples/dataset.owl"
# NOTE(review): RandomWalker's first two positional arguments are presumably
# (maximum depth, maximum number of walks); a depth of 500 with 4 walks looks
# unusual — confirm the order is intended.
WALKER = [RandomWalker(500, 4, UniformSampler())]

PLOT_SAVE = "embeddings-new.png"
PLOT_TITLE = "pyRDF2Vec"

# Silence every warning emitted from here on.
warnings.filterwarnings("ignore")

def create_embeddings(kg, entities, split, walker=WALKER, sg=1):
    """Creates embeddings for a list of entities according to a knowledge
    graphs and a walking strategy.

    Args:
        kg (graph.KnowledgeGraph): The knowledge graph.
            The graph from which the neighborhoods are extracted for the
            provided instances.
        entities (array-like): The train and test instances to create the
Esempio n. 15
0
import rdflib
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC

from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.converters import rdflib_to_kg
from pyrdf2vec.walkers import RandomWalker

# Per split: a TSV path followed by two names, presumably the entity column
# and the label column of that file — TODO confirm against the loading code.
DATASET = {
    "test": ["samples/mutag-test.tsv", "bond", "label_mutagenic"],
    "train": ["samples/mutag-train.tsv", "bond", "label_mutagenic"],
}
LABEL_PREDICATES = ["http://dl-learner.org/carcinogenesis#isMutagenic"]
OUTPUT = "samples/mutag.owl"
# One random walker; the arguments are presumably (maximum depth, maximum
# number of walks), with float("inf") meaning "no limit" — TODO confirm.
WALKERS = [RandomWalker(4, float("inf"))]

PLOT_SAVE = "embeddings.png"
PLOT_TITLE = "pyRDF2Vec"

# Silence every warning emitted from here on.
warnings.filterwarnings("ignore")


def create_embeddings(kg, entities, split, walkers, sg=1):
    """Creates embeddings for a list of entities according to a knowledge
    graphs and a walking strategy.

    Args:
        kg (graph.KnowledgeGraph): The knowledge graph.
            The graph from which the neighborhoods are extracted for the
            provided instances.
Esempio n. 16
0
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker, Walker

# Silence every warning emitted from here on.
warnings.filterwarnings("ignore")

# Seed NumPy's and Python's global random generators for reproducibility.
np.random.seed(42)
random.seed(42)

FILE = "samples/countries-cities/entities.tsv"
SPARQL_ENDPOINT = "https://dbpedia.org/sparql"
# NOTE(review): this predicate has no "http://" scheme, unlike a full URI —
# confirm that it matches the predicates returned by the endpoint.
LABEL_PREDICATES = {"www.w3.org/1999/02/22-rdf-syntax-ns#type"}

# We'll extract walks of depth 4. The second argument (25) is presumably the
# maximum number of walks per entity rather than a number of hops, in which
# case not all possible walks are extracted — TODO confirm.
WALKERS = [RandomWalker(4, 25)]
# Same walker with multi-processing (two jobs). Using multi-processing
# improves the speed of extraction of walks, but this may conflict with the
# policy of the SPARQL endpoint server.
# WALKERS = [RandomWalker(4, 25, n_jobs=2)]

PLOT_TITLE = "pyRDF2Vec"


def create_embeddings(
    kg: KG,
    entities: List[rdflib.URIRef],
    walkers: Sequence[Walker],
    sg: int = 1,
) -> List[str]:
Esempio n. 17
0
 def test_extract_random_walks(self):
     """Check that `extract_random_walks` returns a plain list."""
     walker = RandomWalker(4, float("inf"))
     root = Vertex(str(generate_entities()))
     walks = walker.extract_random_walks(KG, root)
     assert type(walks) is list
Esempio n. 18
0
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker

# Ensure the determinism of this script by initializing a pseudo-random number.
# Seed shared by the walkers below (passed as `random_state`).
RANDOM_STATE = 22

# Tab-separated file read into a DataFrame.
data = pd.read_csv("samples/countries-cities/entities.tsv", sep="\t")

transformer = RDF2VecTransformer(
    # Use one worker threads for Word2Vec to ensure random determinism.
    # Must be used with PYTHONHASHSEED.
    Word2Vec(workers=1),
    # Extract a maximum of 10 walks of a maximum depth of 4 for each entity
    # using two processes and use a random state to ensure that the same walks
    # are generated for the entities.
    walkers=[RandomWalker(4, 10, n_jobs=2, random_state=RANDOM_STATE)],
    verbose=1,
)

# Train and save the Word2Vec model according to the KG, the entities, and
# a walking strategy.
embeddings, _ = transformer.fit_transform(
    # Defined the DBpedia endpoint server, as well as a set of predicates to
    # exclude from this KG.
    KG(
        "https://dbpedia.org/sparql",
        skip_predicates={"www.w3.org/1999/02/22-rdf-syntax-ns#type"},
        literals=[
            [
                "http://dbpedia.org/ontology/wikiPageWikiLink",
                "http://www.w3.org/2004/02/skos/core#prefLabel",
Esempio n. 19
0
 def test_extract(self):
     """Check that `extract` returns a plain set of canonical walks."""
     walker = RandomWalker(4, float("inf"))
     canonical_walks = walker.extract(KG, generate_entities())
     assert type(canonical_walks) is set