def test_determinism(self, corpus):
     """Two single-worker models fitted on the same corpus must agree."""
     targets = [f"{URL}#{entity}" for entity in ROOTS_WITHOUT_URL]
     first = Word2Vec(workers=1).fit(corpus).transform(targets)
     second = Word2Vec(workers=1).fit(corpus).transform(targets)
     assert np.array_equal(first, second)
 def test_transform(self, corpus):
     """A fitted model returns a non-empty result for the root entities."""
     model = Word2Vec()
     model.fit(corpus)
     targets = [f"{URL}#{entity}" for entity in ROOTS_WITHOUT_URL]
     assert len(model.transform(targets)) > 0
Exemple #3
0
def create_embeddings(
    kg: KG,
    entities: List[rdflib.URIRef],
    split: int,
    walkers: Sequence[Walker],
    sg: int = 1,
) -> Tuple[List[str], List[str]]:
    """Embeds the entities of a knowledge graph and splits the result in two.

    A Word2Vec-based RDF2VecTransformer is fitted on all the provided
    entities at once, then the resulting embeddings are cut at ``split``.

    Args:
        kg: The knowledge graph from which the neighborhoods of the provided
            entities are extracted.
        entities: The train instances followed by the test instances.
        split: The index at which the embeddings are cut: the first
            ``split`` embeddings form the train part, the rest the test part.
        walkers: The walking strategies.
        sg: The training algorithm. 1 for skip-gram; otherwise CBOW.
            Defaults to 1.

    Returns:
        The train embeddings and the test embeddings, in that order.

    """
    embedder = Word2Vec(sg=sg)
    embeddings = RDF2VecTransformer(embedder, walkers=walkers).fit_transform(
        kg, entities
    )
    train_part = embeddings[:split]
    test_part = embeddings[split:]
    return train_part, test_part
Exemple #4
0
    def __init__(
        self,
        embedder: Optional[Embedder] = None,
        walkers: Optional[Sequence[Walker]] = None,
    ):
        """Stores the embedder and the walkers, falling back on defaults.

        Args:
            embedder: The embedding technique.
                Defaults to a Word2Vec instance built without arguments.
            walkers: The walking strategies.
                Defaults to [RandomWalker(2, None)].

        """
        self.embedder = Word2Vec() if embedder is None else embedder
        self.walkers = [RandomWalker(2, None)] if walkers is None else walkers
        # Starts empty; nothing in this constructor fills it.
        self.walks_: List[rdflib.URIRef] = []
 def test_online_training(self, corpus):
     """Refitting with an extended corpus keeps one embedding per root."""
     targets = [f"{URL}#{entity}" for entity in ROOTS_WITHOUT_URL]

     model = Word2Vec(workers=1)
     model.fit(corpus)
     assert len(model.transform(targets)) == 3

     # Extend the corpus with one extra walk, then refit in update mode.
     extra_walk = (
         "http://pyRDF2Vec#Alice",
         "http://pyRDF2Vec#knows",
         "http://pyRDF2Vec#Casper",
         "http://pyRDF2Vec#knows",
         "http://pyRDF2Vec#Mario",
     )
     corpus.append([extra_walk])
     model.fit(corpus, True)
     assert len(model.transform(targets)) == 3
Exemple #6
0
from sklearn.manifold import TSNE

from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker

# Seed of the pseudo-random number generator, fixed to keep this script
# deterministic.
RANDOM_STATE = 22

data = pd.read_csv("samples/countries-cities/entities.tsv", sep="\t")

transformer = RDF2VecTransformer(
    # Use a single worker thread for Word2Vec to make training deterministic.
    # Must be used together with PYTHONHASHSEED.
    Word2Vec(workers=1),
    # Extract a maximum of 10 walks of a maximum depth of 4 for each entity
    # using two processes and use a random state to ensure that the same walks
    # are generated for the entities.
    walkers=[RandomWalker(4, 10, n_jobs=2, random_state=RANDOM_STATE)],
    verbose=1,
)

# Train and save the Word2Vec model according to the KG, the entities, and
# a walking strategy.
embeddings, _ = transformer.fit_transform(
    # Define the DBpedia endpoint server, as well as a set of predicates to
    # exclude from this KG.
    KG(
        "https://dbpedia.org/sparql",
        skip_predicates={"www.w3.org/1999/02/22-rdf-syntax-ns#type"},
 def test_invalid_transform(self):
     """Transforming with a model that was never fitted raises ValueError."""
     targets = [f"{URL}#{entity}" for entity in ROOTS_WITHOUT_URL]
     with pytest.raises(ValueError):
         Word2Vec().transform(targets)
 def test_fit(self, corpus, root):
     """The root vector is missing before fitting and present afterwards."""
     model = Word2Vec()
     key = f"{URL}#{root}"
     with pytest.raises(KeyError):
         model._model.wv[key]
     model.fit(corpus, False)
     assert len(model._model.wv[key]) > 0
Exemple #9
0
from pyrdf2vec.graphs import KG
from pyrdf2vec.samplers import UniformSampler
from pyrdf2vec.walkers import RandomWalker, Walker
import numpy as np
import logging

# Write every record of level INFO and above to "rdf2vec.log", each line
# prefixed with a timestamp (millisecond precision), the level, the module
# and the function name.
logging.basicConfig(
    filename="rdf2vec.log",
    level=logging.INFO,
    format=
    '%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')

# Define the label predicates, all triples with these predicates
# will be excluded from the graph. The list is empty here, so no predicate
# is passed to the KG for exclusion.
logging.info("Read in knowledge graph.")
label_predicates = []
kg = KG(location="data/dbp_graph.ttl", label_predicates=label_predicates)

logging.info("Create walkers and transformers.")
# A single random walker with a uniform sampler.
# NOTE(review): presumably 4 is the maximum walk depth and 5 the maximum
# number of walks per entity -- confirm against the RandomWalker signature.
walkers = [RandomWalker(4, 5, UniformSampler())]
# sg=1 is passed straight to Word2Vec.
# NOTE(review): presumably this selects skip-gram -- confirm.
transformer = RDF2VecTransformer(Word2Vec(sg=1), walkers=walkers)

logging.info("Read in entities.")
# Entities should be a list of URIs that can be found in the Knowledge Graph.
# allow_pickle=True lets NumPy unpickle object arrays, which can execute
# arbitrary code: only load entity files from a trusted source.
entities = list(np.load("data/entities.npy", allow_pickle=True))
logging.info("Calculate embeddings.")
# Whatever `fit_transform` returns is stored as is in the output file below.
embeddings = transformer.fit_transform(kg, entities)
logging.info("Write embeddings to disk.")
np.save("data/embeddings.npy", embeddings)
logging.info("Finished job.")
Exemple #10
0
test_data = pd.read_csv("samples/mutag/test.tsv", sep="\t")
train_data = pd.read_csv("samples/mutag/train.tsv", sep="\t")

train_entities = [entity for entity in train_data["bond"]]
train_labels = list(train_data["label_mutagenic"])

test_entities = [entity for entity in test_data["bond"]]
test_labels = list(test_data["label_mutagenic"])

entities = train_entities + test_entities
labels = train_labels + test_labels

embeddings, literals = RDF2VecTransformer(
    # Ensure random determinism for Word2Vec.
    # Must be used with PYTHONHASHSEED.
    Word2Vec(workers=1, epochs=10),
    # Extract all walks with a maximum depth of 2 for each entity using two
    # processes and use a random state to ensure that the same walks are
    # generated for the entities.
    walkers=[RandomWalker(2, None, n_jobs=2, random_state=RANDOM_STATE)],
    verbose=1,
).fit_transform(
    KG(
        "samples/mutag/mutag.owl",
        skip_predicates={"http://dl-learner.org/carcinogenesis#isMutagenic"},
        literals=[
            [
                "http://dl-learner.org/carcinogenesis#hasAtom",
                "http://dl-learner.org/carcinogenesis#charge",
            ],
        ],
Exemple #11
0
class RDF2VecTransformer:
    """Transforms nodes in a Knowledge Graph into an embedding.

    Attributes:
        _embeddings: All the embeddings of the model.
            Defaults to [].
        _entities: All the entities of the model.
            Defaults to [].
        _is_extract_walks_literals: True if the session must be closed after
            the call to the `transform` function. False, otherwise.
            Defaults to False.
        _literals: All the literals of the model.
            Defaults to [].
        _pos_entities: The positions of existing entities to be updated.
            Defaults to [].
        _pos_walks: The positions of existing walks to be updated.
            Defaults to [].
        _walks: All the walks of the model.
            Defaults to [].
        embedder: The embedding technique.
            Defaults to Word2Vec.
        walkers: The walking strategies.
            Defaults to [RandomWalker(2, None)]
        verbose: The verbosity level.
            0: does not display anything;
            1: display of the progress of extraction and training of walks;
            2: debugging.
            Defaults to 0.

    """

    embedder = attr.ib(
        factory=lambda: Word2Vec(),
        type=Embedder,
        validator=attr.validators.instance_of(Embedder),  # type: ignore
    )

    walkers = attr.ib(
        factory=lambda: [RandomWalker(2)],  # type: ignore
        type=Sequence[Walker],
        validator=attr.validators.deep_iterable(
            member_validator=attr.validators.instance_of(
                Walker  # type: ignore
            ),
            iterable_validator=attr.validators.instance_of(list),
        ),
    )

    verbose = attr.ib(
        kw_only=True,
        default=0,
        type=int,
        validator=attr.validators.in_([0, 1, 2]),
    )

    _is_extract_walks_literals = attr.ib(
        init=False,
        default=False,
        type=bool,
        repr=False,
        validator=attr.validators.instance_of(bool),
    )

    _embeddings = attr.ib(init=False, type=Embeddings, factory=list)
    _entities = attr.ib(init=False, type=Entities, factory=list)
    _literals = attr.ib(init=False, type=Literals, factory=list)
    _walks = attr.ib(init=False, type=List[List[SWalk]], factory=list)

    _pos_entities = attr.ib(init=False, type=List[str], factory=list)
    _pos_walks = attr.ib(init=False, type=List[int], factory=list)

    def fit(self,
            walks: List[List[SWalk]],
            is_update: bool = False) -> RDF2VecTransformer:
        """Trains the embedder on the provided walks.

        Args:
            walks: The walks to fit, grouped in one list per entity.
            is_update: True if the new corpus should be added to old model's
                corpus, False otherwise.
                Defaults to False.

        Returns:
            The RDF2VecTransformer, to allow call chaining.

        """
        if self.verbose == 2:
            print(self.embedder)

        start = time.perf_counter()
        self.embedder.fit(walks, is_update)
        end = time.perf_counter()

        if self.verbose < 1:
            return self

        n_fitted = sum(map(len, walks))
        print(f"Fitted {n_fitted} walks ({end - start:0.4f}s)")
        # Only report the stored total when it differs in size from the
        # batch that was just fitted.
        if len(self._walks) != len(walks):
            n_stored = sum(map(len, self._walks))
            print(f"> {n_stored} walks extracted " +
                  f"for {len(self._entities)} entities.")
        return self

    def fit_transform(self,
                      kg: KG,
                      entities: Entities,
                      is_update: bool = False) -> Tuple[Embeddings, Literals]:
        """Creates a model and generates embeddings and literals for the
        provided entities.

        Args:
            kg: The Knowledge Graph.
            entities: The entities including test entities to create the
                embeddings. Since RDF2Vec is unsupervised, there is no label
                leakage.
            is_update: True if the new corpus should be added to old model's
                corpus, False otherwise.
                Defaults to False.

        Returns:
            The embeddings and the literals of the provided entities.

        """
        # While this flag is set, `get_walks` leaves a remote session open so
        # that `transform` can still use it and close it itself.
        self._is_extract_walks_literals = True
        try:
            self.fit(self.get_walks(kg, entities), is_update)
            return self.transform(kg, entities)
        finally:
            # Always reset the flag, including for local KGs and when an
            # exception is raised. Otherwise a later standalone call to
            # `get_walks` on a remote KG would never close its session.
            self._is_extract_walks_literals = False

    def get_walks(self, kg: KG, entities: Entities) -> List[List[SWalk]]:
        """Gets the walks of an entity based on a Knowledge Graph and a
        list of walkers

        Duplicate entities are removed before the extraction, keeping the
        order in which they were first provided.

        Args:
            kg: The Knowledge Graph.
            entities: The entities including test entities to create the
                embeddings. Since RDF2Vec is unsupervised, there is no label
                leakage.

        Returns:
            The walks for the given entities.

        Raises:
            ValueError: If the Knowledge Graph is not remote and at least one
                of the provided entities isn't in it.

        """
        # A generator lets `all` stop at the first missing entity.
        if not kg._is_remote and not all(
                Vertex(entity) in kg._vertices for entity in entities):
            raise ValueError(
                "The provided entities must be in the Knowledge Graph.")

        # Avoids duplicate entities for unnecessary walk extractions. Unlike
        # a set, dict keys keep the first-seen order, so the result no longer
        # depends on the string hash seed.
        entities = list(dict.fromkeys(entities))

        if self.verbose == 2:
            print(kg)
            if self.walkers:
                print(self.walkers[0])

        walks: List[List[SWalk]] = []
        tic = time.perf_counter()
        for walker in self.walkers:
            walks += walker.extract(kg, entities, self.verbose)
        toc = time.perf_counter()

        # `_update` inspects the first element of the values it receives, so
        # there is nothing to record for an empty batch.
        if entities:
            self._update(self._entities, entities)
        if walks:
            self._update(self._walks, walks)

        if self.verbose >= 1:
            n_walks = sum(len(entity_walks) for entity_walks in walks)
            print(f"Extracted {n_walks} walks " +
                  f"for {len(entities)} entities ({toc - tic:0.4f}s)")
        # Close the remote session here unless `fit_transform` is running,
        # in which case `transform` still needs it.
        if (kg._is_remote and kg.mul_req
                and not self._is_extract_walks_literals):
            asyncio.run(kg.connector.close())
        return walks

    def transform(self, kg: KG,
                  entities: Entities) -> Tuple[Embeddings, Literals]:
        """Transforms the provided entities into embeddings and literals.

        The embeddings come from the embedder and the literals from the
        Knowledge Graph. Both are also recorded on the transformer when they
        are not empty.

        Args:
            kg: The Knowledge Graph.
            entities: The entities including test entities to create the
                embeddings. Since RDF2Vec is unsupervised, there is no label
                leakage.

        Returns:
            The embeddings and the literals of the provided entities.

        """
        assert self.embedder is not None
        embeddings = self.embedder.transform(entities)

        tic = time.perf_counter()
        literals = kg.get_literals(entities, self.verbose)
        toc = time.perf_counter()

        # `_update` inspects the first element of the values it receives, so
        # empty results (e.g., no entity provided) must be skipped instead of
        # raising an IndexError.
        if len(embeddings) > 0:
            self._update(self._embeddings, embeddings)
        if len(literals) > 0:
            self._update(self._literals, literals)

        if kg._is_remote and kg.mul_req:
            self._is_extract_walks_literals = False
            asyncio.run(kg.connector.close())

        if self.verbose >= 1 and len(literals) > 0:
            print(f"Extracted {len(literals)} literals for {len(entities)} " +
                  f"entities ({toc - tic:0.4f}s)")
        return embeddings, literals

    def save(self, filename: str = "transformer_data") -> None:
        """Pickles this RDF2VecTransformer into a binary file.

        Args:
            filename: The path of the binary file to write.
                Defaults to "transformer_data".

        """
        with open(filename, "wb") as handle:
            pickle.dump(self, handle)

    def _update(self, attr, values) -> None:
        """Updates an attribute with new values.

        This method is useful to keep all entities, walks, literals and
        embeddings after several online training.

        Two kinds of values are handled:
            - strings (entities): unknown entities are appended to `attr`.
              For each already known entity, its position in `attr` and its
              index in `values` are recorded in `_pos_entities` and
              `_pos_walks`.
            - anything else (walks, embeddings, literals): the values whose
              index was recorded replace the items of `attr` at the recorded
              positions, and the remaining values are appended to `attr`.

        `values` is never modified, so the caller can keep using it.

        Args:
            attr: The attribute to update, modified in place.
            values: The new values to add.

        """
        # Nothing to update, and `values[0]` below needs at least one item.
        if attr is None or len(values) == 0:
            return

        if isinstance(values[0], str):
            # Positions recorded for a previous batch of entities would make
            # the next updates overwrite the wrong items.
            self._pos_entities.clear()
            self._pos_walks.clear()
            for i, entity in enumerate(values):
                if entity not in attr:
                    attr.append(entity)
                else:
                    self._pos_entities.append(attr.index(entity))
                    self._pos_walks.append(i)
            return

        # Read by index rather than popping: popping would shift the indexes
        # that follow and would remove items from the caller's list.
        for pos, idx in zip(self._pos_entities, self._pos_walks):
            attr[pos] = values[idx]
        replaced = set(self._pos_walks)
        attr.extend(
            value for i, value in enumerate(values) if i not in replaced)

    @staticmethod
    def load(filename: str = "transformer_data") -> RDF2VecTransformer:
        """Restores a RDF2VecTransformer from a binary file.

        Args:
            filename: The path of the binary file to read.
                Defaults to "transformer_data".

        Returns:
            The loaded RDF2VecTransformer.

        Raises:
            ValueError: If the unpickled object isn't a RDF2VecTransformer.

        """
        # NOTE: unpickling can execute arbitrary code, so only load files
        # that come from a trusted source.
        with open(filename, "rb") as handle:
            loaded = pickle.load(handle)
        if isinstance(loaded, RDF2VecTransformer):
            return loaded
        raise ValueError("Failed to load the RDF2VecTransformer object")