Example #1
 def __init__(
     self,
     depth: int,
     walks_per_graph: float,
     sampler: Optional[Sampler] = None,
 ):
     self.depth = depth
     self.walks_per_graph = walks_per_graph
     if sampler is not None:
         self.sampler = sampler
     else:
         self.sampler = UniformSampler()
Example #2
 def __init__(
         self,
         depth: int,
         walks_per_graph: float,
         sampler: Sampler = UniformSampler(),
 ):
     super().__init__(depth, walks_per_graph, sampler)
Example #3
 def __init__(
         self,
         depth: int,
         max_walks: Optional[int] = None,
         sampler: Sampler = UniformSampler(),
         n_jobs: int = 1,
 ):
     super().__init__(depth, max_walks, sampler, n_jobs)
Example #4
 def __init__(
     self,
     depth: int,
     walks_per_graph: float,
     sampler: Sampler = UniformSampler(),
     freq_thresholds: List[float] = [0.001],
 ):
     super().__init__(depth, walks_per_graph, sampler)
     self.freq_thresholds = freq_thresholds
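Note that Examples #2 through #4 (and several below) use mutable default arguments such as `UniformSampler()` and `[0.001]`; Python evaluates these once at function-definition time, so every instance shares the same object. Example #1 and Example #8 use the safer `None`-default idiom instead. A minimal sketch of Example #4's constructor rewritten that way, assuming the base-class signature from Example #1 (the class name `FreqWalker` is hypothetical, for illustration only):

```python
from typing import List, Optional

from pyrdf2vec.samplers import Sampler, UniformSampler
from pyrdf2vec.walkers import Walker


class FreqWalker(Walker):  # hypothetical name, for illustration
    def __init__(
        self,
        depth: int,
        walks_per_graph: float,
        sampler: Optional[Sampler] = None,
        freq_thresholds: Optional[List[float]] = None,
    ):
        # Build the defaults inside the body so each instance gets its
        # own sampler and threshold list instead of sharing one object
        # created at definition time.
        super().__init__(
            depth,
            walks_per_graph,
            sampler if sampler is not None else UniformSampler(),
        )
        self.freq_thresholds = (
            freq_thresholds if freq_thresholds is not None else [0.001]
        )

    def _extract(self, kg, instances):
        # Trivial implementation so the sketch is instantiable; a real
        # walker would extract neighborhood walks here.
        return set()
```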
Example #5
 def __init__(
     self,
     depth: int,
     max_walks: Optional[int] = None,
     sampler: Sampler = UniformSampler(),
     n_jobs: int = 1,
     is_support_remote: bool = True,
 ):
     super().__init__(depth, max_walks, sampler, n_jobs, is_support_remote)
Example #6
 def __init__(
     self,
     depth: int,
     max_walks: Optional[int] = None,
     sampler: Sampler = UniformSampler(),
     wl_iterations: int = 4,
     n_jobs: int = 1,
 ):
     super().__init__(depth, max_walks, sampler, n_jobs, False)
     self.wl_iterations = wl_iterations
Example #7
 def __init__(
     self,
     depth: int,
     max_walks: Optional[int] = None,
     sampler: Sampler = UniformSampler(),
     freq_thresholds: List[float] = [0.001],
     n_jobs: int = 1,
 ):
     super().__init__(depth, max_walks, sampler, n_jobs)
     self.freq_thresholds = freq_thresholds
Example #8
 def __init__(
     self,
     depth: int,
     max_walks: Optional[int] = None,
     sampler: Optional[Sampler] = None,
     n_jobs: int = 1,
     is_support_remote: bool = True,
 ):
     self.depth = depth
     self.is_support_remote = is_support_remote
     if n_jobs == -1:
         self.n_jobs = multiprocessing.cpu_count()
     else:
         self.n_jobs = n_jobs
     self.max_walks = max_walks
     if sampler is not None:
         self.sampler = sampler
     else:
         self.sampler = UniformSampler()
Example #9
 def __init__(
     self,
     depth: int,
     walks_per_graph: float,
     sampler: Sampler = UniformSampler(),
     hop_prob: float = 0.1,
     resolution: int = 1,
 ):
     super().__init__(depth, walks_per_graph, sampler)
     self.hop_prob = hop_prob
     self.resolution = resolution
Example #10
 def __init__(
     self,
     depth: int,
     max_walks: Optional[int] = None,
     sampler: Sampler = UniformSampler(),
     hop_prob: float = 0.1,
     resolution: int = 1,
     n_jobs: int = 1,
 ):
     super().__init__(depth, max_walks, sampler, n_jobs, False)
     self.hop_prob = hop_prob
     self.resolution = resolution
Example #11
 def __init__(
     self,
     depth: int,
     walks_per_graph: float,
     sampler: Sampler = UniformSampler(),
     grams: int = 3,
     wildcards: Optional[list] = None,
 ):
     super().__init__(depth, walks_per_graph, sampler)
     self.grams = grams
     self.n_gram_map = {}  # type: Dict[Tuple, str]
     self.wildcards = wildcards
Example #12
 def __init__(
     self,
     embedder: Optional[Embedder] = None,
     walkers: Optional[Sequence[Walker]] = None,
 ):
     if embedder is not None:
         self.embedder = embedder
     else:
         self.embedder = Word2Vec()
     self.walks_: List[rdflib.URIRef] = []
     if walkers is not None:
         self.walkers = walkers
     else:
         self.walkers = [
             RandomWalker(2, None, UniformSampler(inverse=False))
         ]
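Example #12 wires up the default components when none are given. A short usage sketch, assuming the pyRDF2Vec API used throughout these examples: constructing the transformer with no arguments should be equivalent to passing the defaults by hand.

```python
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.samplers import UniformSampler
from pyrdf2vec.walkers import RandomWalker

# Both transformers end up with the same configuration: a Word2Vec
# embedder and a single RandomWalker of depth 2 with uniform sampling.
default_transformer = RDF2VecTransformer()
explicit_transformer = RDF2VecTransformer(
    embedder=Word2Vec(),
    walkers=[RandomWalker(2, None, UniformSampler(inverse=False))],
)
```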
Example #13
def check_walker(name, Walker):
    walks_per_graph = 5
    depth = 2

    canonical_walks = Walker(depth, walks_per_graph,
                             UniformSampler()).extract(KNOWLEDGE_GRAPH,
                                                       ENTITIES_SUBSET)
    assert type(canonical_walks) == set

    if name == "WeisfeilerLehmanWalker":
        assert len(canonical_walks) <= len(
            ENTITIES_SUBSET * walks_per_graph * 5)

    # Sometimes, WalkletWalker returns one more walk than specified.
    # We need to fix that.
    elif name == "WalkletWalker":
        assert len(canonical_walks) <= len(ENTITIES_SUBSET * walks_per_graph *
                                           (depth + 1))
    else:
        assert len(canonical_walks) <= len(ENTITIES_SUBSET * walks_per_graph)
Example #14
class Walker(metaclass=abc.ABCMeta):
    """Base class for the walking strategies.

    Attributes:
        depth: The depth per entity.
        max_walks: The maximum number of walks per entity.
        sampler: The sampling strategy.
            Defaults to UniformSampler().
        n_jobs: The number of processes to use for multiprocessing. Use -1 to
            allocate as many processes as there are CPU cores available in the
            machine.
            Defaults to 1.
        is_support_remote: If True, indicates that the walking strategy can
            be used to retrieve walks via a SPARQL endpoint server.
            Defaults to True.

    """

    # Global KG used later on for the worker process.
    kg = None

    def __init__(
        self,
        depth: int,
        max_walks: Optional[int] = None,
        sampler: Optional[Sampler] = None,
        n_jobs: int = 1,
        is_support_remote: bool = True,
    ):
        self.depth = depth
        self.is_support_remote = is_support_remote
        if n_jobs == -1:
            self.n_jobs = multiprocessing.cpu_count()
        else:
            self.n_jobs = n_jobs
        self.max_walks = max_walks
        if sampler is not None:
            self.sampler = sampler
        else:
            self.sampler = UniformSampler()

    def extract(
        self, kg: KG, instances: List[rdflib.URIRef], verbose=False
    ) -> Set[Tuple[Any, ...]]:
        """Fits the provided sampling strategy and then calls the
        private _extract method that is implemented for each of the
        walking strategies.

        Args:
            kg: The Knowledge Graph.

                The graph from which the neighborhoods are extracted for the
                provided instances.
            instances: The instances to be extracted from the Knowledge Graph.
            verbose: If true, display a progress bar for the extraction of the
                walks.

        Returns:
            The 2D matrix with its number of rows equal to the number of
            provided instances and its number of columns equal to the
            embedding size.

        """
        if kg.is_remote and not self.is_support_remote:
            raise RemoteNotSupported(
                "Invalid walking strategy. Please, choose a walking strategy "
                + "that can retrieve walks via a SPARQL endpoint server."
            )
        self.sampler.fit(kg)
        canonical_walks = set()

        # To avoid circular imports
        if "CommunityWalker" in str(self):
            self._community_detection(kg)  # type: ignore

        if kg.is_remote:
            asyncio.run(kg._fill_entity_hops(instances))  # type: ignore

        with multiprocessing.Pool(
            self.n_jobs, self._init_worker, [kg]
        ) as pool:
            res = list(
                tqdm(
                    pool.imap_unordered(self._proc, instances),
                    total=len(instances),
                    disable=not verbose,
                )
            )
        res = {k: v for elm in res for k, v in elm.items()}  # type: ignore

        for instance in instances:
            canonical_walks.update(res[instance])
        return canonical_walks

    @abc.abstractmethod
    def _extract(
        self, kg: KG, instance: rdflib.URIRef
    ) -> Dict[Any, Tuple[Tuple[str, ...], ...]]:
        """Extracts walks rooted at the provided instances which are then each
        transformed into a numerical representation.

        Args:
            kg: The Knowledge Graph.

                The graph from which the neighborhoods are extracted for the
                provided instances.
            instance: The instance to be extracted from the Knowledge Graph.

        Returns:
            The 2D matrix with its number of rows equal to the number of
            provided instances and its number of columns equal to the
            embedding size.

        """
        raise NotImplementedError("This must be implemented!")

    def _init_worker(self, init_kg):
        """Initializes each worker process.

        Args:
            init_kg: The Knowledge Graph to provide to each worker process.

        """
        global kg
        kg = init_kg

    def info(self):
        """Gets informations related to a Walker.

        Returns:
            A friendly display of the Walker.

        """
        return (
            f"{type(self).__name__}(depth={self.depth},"
            + f"max_walks={self.max_walks},"
            + f"sampler={type(self.sampler).__name__},"
            + f"n_jobs={self.n_jobs},"
            + f"is_support_remote={self.is_support_remote})"
        )

    def print_walks(
        self,
        kg: KG,
        instances: List[rdflib.URIRef],
        filename: str,
    ) -> None:
        """Prints the walks of a Knowledge Graph.

        Args:
            kg: The Knowledge Graph.

                The graph from which the neighborhoods are extracted for the
                provided instances.
            instances: The instances to be extracted from the Knowledge Graph.
            filename: The name of the file in which to write the walks.

        """
        walks = self.extract(kg, instances)
        walk_strs = []
        for walk in walks:
            s = ""
            for i in range(len(walk)):
                s += f"{walk[i]} "
                if i < len(walk) - 1:
                    s += "--> "
            walk_strs.append(s)

        with open(filename, "w+") as f:
            for s in walk_strs:
                f.write(s)
                f.write("\n\n")

    def _proc(
        self, instance: rdflib.URIRef
    ) -> Dict[Any, Tuple[Tuple[str, ...], ...]]:
        """Executed by each process.

        Args:
            instance: The instance to be extracted from the Knowledge Graph.

        Returns:
            The walks extracted by the process.

        """
        global kg
        return self._extract(kg, instance)  # type:ignore
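Because `_extract` is the only abstract method, a concrete strategy just has to map each instance to its walks; the multiprocessing plumbing in `extract` is inherited. A minimal hypothetical sketch against the base class above (the name `SelfWalker` and its one-walk behavior are illustrative, not part of pyRDF2Vec):

```python
class SelfWalker(Walker):
    """Hypothetical walker emitting the root entity as its only walk."""

    def _extract(
        self, kg: KG, instance: rdflib.URIRef
    ) -> Dict[Any, Tuple[Tuple[str, ...], ...]]:
        # One walk of length one: the instance itself, matching the
        # Dict[instance -> tuple of walks] contract of Walker._extract.
        return {instance: ((str(instance),),)}


# Usage sketch: walks = SelfWalker(depth=1).extract(kg, entities)
```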
Example #15
import pandas as pd

from pyrdf2vec.samplers import (
    ObjFreqSampler,
    ObjPredFreqSampler,
    PageRankSampler,
    PredFreqSampler,
    UniformSampler,
)

# Ensure the determinism of this script with a fixed pseudo-random seed.
RANDOM_STATE = 22

test_data = pd.read_csv("samples/mutag/test.tsv", sep="\t")
train_data = pd.read_csv("samples/mutag/train.tsv", sep="\t")

train_entities = list(train_data["bond"])
train_labels = list(train_data["label_mutagenic"])

test_entities = list(test_data["bond"])
test_labels = list(test_data["label_mutagenic"])

entities = train_entities + test_entities

samplers = [
    ("Uniform", UniformSampler()),
    ("Object Frequency", ObjFreqSampler()),
    ("Inverse Object Frequency", ObjFreqSampler(inverse=True)),
    (
        "Inverse Object Frequency Split",
        ObjFreqSampler(inverse=True, split=True),
    ),
    ("Predicate Frequency", PredFreqSampler()),
    ("Inverse Predicate Frequency", PredFreqSampler(inverse=True)),
    ("Predicate + Object Frequency", ObjPredFreqSampler()),
    ("Inverse Predicate + Object Frequency", ObjPredFreqSampler(inverse=True)),
    ("PageRank", PageRankSampler()),
    ("Inverse PageRank", PageRankSampler(inverse=True)),
    ("PageRank Split", PageRankSampler(split=True)),
    ("Inverse PageRank Split", PageRankSampler(inverse=True, split=True)),
]
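With the samplers listed, a typical comparison loop plugs each one into a walker and embeds the same entities. A sketch under the transformer API used in Example #17, assuming `kg` is a `KG` already loaded for the MUTAG data:

```python
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.walkers import RandomWalker

for name, sampler in samplers:
    # Depth-2 walks, at most 50 per entity, biased by the current sampler.
    transformer = RDF2VecTransformer(
        Word2Vec(), walkers=[RandomWalker(2, 50, sampler)]
    )
    embeddings = transformer.fit_transform(kg, entities)
    print(f"{name}: {len(embeddings)} embeddings computed")
```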
Example #16
import random
import warnings
from typing import List, Sequence

import numpy as np
import rdflib
from sklearn.svm import SVC

from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.samplers import UniformSampler
from pyrdf2vec.walkers import RandomWalker, Walker

DATASET = {
    "test": ["samples/mutag/test.tsv", "bond", "label_mutagenic"],
    "train": ["samples/mutag/train.tsv", "bond", "label_mutagenic"],
}
LABEL_PREDICATES = {"http://dl-learner.org/carcinogenesis#isMutagenic"}
OUTPUT = "samples/mutag/mutag.owl"
# We'll extract all possible walks of depth 2 (i.e., 2 hops per walk)
WALKERS = [RandomWalker(2, None, UniformSampler(inverse=False))]

PLOT_SAVE = "embeddings.png"
PLOT_TITLE = "pyRDF2Vec"

warnings.filterwarnings("ignore")

np.random.seed(42)
random.seed(42)


def create_embeddings(
    kg: KG,
    entities: List[rdflib.URIRef],
    split: int,
    walkers: Sequence[Walker],
Example #17
import logging

import numpy as np

from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.samplers import UniformSampler
from pyrdf2vec.walkers import RandomWalker, Walker

logging.basicConfig(
    filename="rdf2vec.log",
    level=logging.INFO,
    format="%(asctime)s.%(msecs)03d %(levelname)s %(module)s "
    "- %(funcName)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Define the label predicates, all triples with these predicates
# will be excluded from the graph
logging.info("Read in knowledge graph.")
label_predicates = []
kg = KG(location="data/dbp_graph.ttl", label_predicates=label_predicates)

logging.info("Create walkers and transformers.")
walkers = [RandomWalker(4, 5, UniformSampler())]
transformer = RDF2VecTransformer(Word2Vec(sg=1), walkers=walkers)

logging.info("Read in entities.")
# Entities should be a list of URIs that can be found in the Knowledge Graph
entities = list(np.load("data/entities.npy", allow_pickle=True))
logging.info("Calculate embeddings.")
embeddings = transformer.fit_transform(kg, entities)
logging.info("Write embeddings to disk.")
np.save("data/embeddings.npy", embeddings)
logging.info("Finished job.")
Example #18
import warnings

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC

from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.samplers import UniformSampler
from pyrdf2vec.walkers import RandomWalker, Walker


DATASET = {
    "train": ["samples/products.csv", "product"],
}
LABEL_PREDICATES = [
    "http://dice-researcher.com/grocery-recommendation/recommendation#list"
]
OUTPUT = "samples/dataset.owl"
WALKER = [RandomWalker(500, 4, UniformSampler())]

PLOT_SAVE = "embeddings-new.png"
PLOT_TITLE = "pyRDF2Vec"

warnings.filterwarnings("ignore")

def create_embeddings(kg, entities, split, walker=WALKER, sg=1):
    """Creates embeddings for a list of entities according to a knowledge
    graphs and a walking strategy.

    Args:
        kg (graph.KnowledgeGraph): The knowledge graph.
            The graph from which the neighborhoods are extracted for the
            provided instances.
        entities (array-like): The train and test instances to create the
Example #19
 def test_fit(self):
     UniformSampler().fit(None)
Example #20
class Walker(metaclass=abc.ABCMeta):
    """Base class for the walking strategies.

    Attributes:
        depth: The depth per entity.
        walks_per_graph: The maximum number of walks per entity.
        sampler: The sampling strategy.
            Defaults to UniformSampler().

    """

    def __init__(
        self,
        depth: int,
        walks_per_graph: float,
        sampler: Optional[Sampler] = None,
    ):
        self.depth = depth
        self.walks_per_graph = walks_per_graph
        if sampler is not None:
            self.sampler = sampler
        else:
            self.sampler = UniformSampler()

    def extract(
        self, kg: KG, instances: List[rdflib.URIRef]
    ) -> Set[Tuple[Any, ...]]:
        """Fits the provided sampling strategy and then calls the
        private _extract method that is implemented for each of the
        walking strategies.

        Args:
            kg: The knowledge graph.

                The graph from which the neighborhoods are extracted for the
                provided instances.
            instances: The instances to be extracted from the knowledge graph.

        Returns:
            The 2D matrix with its number of rows equal to the number of
            provided instances and its number of columns equal to the
            embedding size.

        """
        self.sampler.fit(kg)
        return self._extract(kg, instances)

    @abc.abstractmethod
    def _extract(
        self, kg: KG, instances: List[rdflib.URIRef]
    ) -> Set[Tuple[Any, ...]]:
        """Extracts walks rooted at the provided instances which are then each
        transformed into a numerical representation.

        Args:
            kg: The knowledge graph.

                The graph from which the neighborhoods are extracted for the
                provided instances.
            instances: The instances to be extracted from the knowledge graph.

        Returns:
            The 2D matrix with its number of rows equal to the number of
            provided instances and its number of columns equal to the
            embedding size.

        """
        raise NotImplementedError("This must be implemented!")

    def print_walks(
        self,
        kg: KG,
        instances: List[rdflib.URIRef],
        file_name: str,
    ) -> None:
        """Prints the walks of a knowledge graph.

        Args:
            kg: The knowledge graph.

                The graph from which the neighborhoods are extracted for the
                provided instances.
            instances: The instances to be extracted from the knowledge graph.
            file_name: The name of the file in which to write the walks.

        """
        walks = self.extract(kg, instances)
        walk_strs = []
        for walk in walks:
            s = ""
            for i in range(len(walk)):
                s += f"{walk[i]} "
                if i < len(walk) - 1:
                    s += "--> "
            walk_strs.append(s)

        with open(file_name, "w+") as f:
            for s in walk_strs:
                f.write(s)
                f.write("\n\n")
Example #21
 def test_weight(self, setup, kg, root, is_reverse):
     sampler = UniformSampler()
     for hop in kg.get_hops(Vertex(f"{URL}#{root}"), is_reverse=is_reverse):
         assert sampler.get_weight(hop) == 1
Example #22
class Walker(ABC):
    """Base class of the walking strategies.

    Attributes:
        _is_support_remote: True if the walking strategy can be used with a
            remote Knowledge Graph, False otherwise.
            Defaults to True.
        kg: The global KG used later on for the worker process.
            Defaults to None.
        max_depth: The maximum depth of one walk.
        max_walks: The maximum number of walks per entity.
            Defaults to None.
        random_state: The random state to use to keep random determinism with
            the walking strategy.
            Defaults to None.
        sampler: The sampling strategy.
            Defaults to UniformSampler.
        with_reverse: True to extract the parent and child hops of an
            entity, creating (max_walks * max_walks) walks of length
            2 * depth and also centering this entity within the walks;
            False otherwise. This does not work with NGramWalker and
            WLWalker.
            Defaults to False.

    """

    kg = attr.ib(init=False, repr=False, type=Optional[KG], default=None)

    max_depth = attr.ib(
        type=int,
        validator=[attr.validators.instance_of(int), _check_max_depth],
    )

    max_walks = attr.ib(  # type: ignore
        default=None,
        type=Optional[int],
        validator=[
            attr.validators.optional(attr.validators.instance_of(int)),
            _check_max_walks,
        ],
    )

    sampler = attr.ib(
        factory=lambda: UniformSampler(),
        type=Sampler,
        validator=attr.validators.instance_of(Sampler),  # type: ignore
    )

    n_jobs = attr.ib(  # type: ignore
        default=None,
        type=Optional[int],
        validator=[
            attr.validators.optional(attr.validators.instance_of(int)),
            _check_jobs,
        ],
    )

    with_reverse = attr.ib(
        kw_only=True,
        type=Optional[bool],
        default=False,
        validator=attr.validators.instance_of(bool),
    )

    random_state = attr.ib(
        kw_only=True,
        type=Optional[int],
        default=None,
        validator=attr.validators.optional(attr.validators.instance_of(int)),
    )

    _is_support_remote = attr.ib(init=False,
                                 repr=False,
                                 type=bool,
                                 default=True)

    def __attrs_post_init__(self):
        if self.n_jobs == -1:
            self.n_jobs = multiprocessing.cpu_count()
        self.sampler.random_state = self.random_state

    def extract(self,
                kg: KG,
                entities: Entities,
                verbose: int = 0) -> List[List[SWalk]]:
        """Fits the provided sampling strategy and then calls the
        private _extract method that is implemented for each of the
        walking strategies.

        Args:
            kg: The Knowledge Graph.
            entities: The entities to be extracted from the Knowledge Graph.
            verbose: The verbosity level.
                0: does not display anything;
                1: displays the progress of walk extraction and training;
                2: debugging.
                Defaults to 0.

        Returns:
            The 2D matrix with its number of rows equal to the number of
            provided entities and its number of columns equal to the
            embedding size.

        Raises:
            WalkerNotSupported: If there is an attempt to use an invalid
                walking strategy to a remote Knowledge Graph.

        """
        if kg._is_remote and not self._is_support_remote:
            raise WalkerNotSupported(
                "Invalid walking strategy. Please, choose a walking strategy "
                + "that can fetch walks via a SPARQL endpoint server.")
        self.sampler.fit(kg)

        process = self.n_jobs if self.n_jobs is not None else 1
        if (kg._is_remote and kg.mul_req) and process >= 2:
            warnings.warn(
                "Using 'mul_req=True' and/or 'n_jobs>=2' speed up the " +
                "extraction of entity's walks, but may violate the policy " +
                "of some SPARQL endpoint servers.",
                category=RuntimeWarning,
                stacklevel=2,
            )

        if kg._is_remote and kg.mul_req:
            kg._fill_hops(entities)

        with multiprocessing.Pool(process, self._init_worker, [kg]) as pool:
            res = list(
                tqdm(
                    pool.imap(self._proc, entities),
                    total=len(entities),
                    disable=verbose == 0,
                ))
        return self._post_extract(res)

    @abstractmethod
    def _extract(self, kg: KG, entity: Vertex) -> EntityWalks:
        """Extracts random walks for an entity based on a Knowledge Graph.

        Args:
            kg: The Knowledge Graph.
            entity: The root node to extract walks.

        Returns:
            A dictionary having the entity as key and a list of tuples as value
            corresponding to the extracted walks.

        Raises:
            NotImplementedError: If this method is called, without having
                provided an implementation.

        """
        raise NotImplementedError("This must be implemented!")

    def _init_worker(self, init_kg: KG) -> None:
        """Initializes each worker process.

        Args:
            init_kg: The Knowledge Graph to provide to each worker process.

        """
        global kg
        kg = init_kg  # type: ignore

    def _post_extract(self, res: List[EntityWalks]) -> List[List[SWalk]]:
        """Post processed walks.

        Args:
            res: the result of the walks extracted with multiprocessing.

        Returns:
            The 2D matrix with its number of rows equal to the number of
            provided entities; number of column equal to the embedding size.

        """
        return list(walks for entity_to_walks in res
                    for walks in entity_to_walks.values())

    def _proc(self, entity: str) -> EntityWalks:
        """Executed by each process.

        Args:
            entity: The entity to be extracted from the Knowledge Graph.

        Returns:
            The walks extracted by the process.

        """
        global kg
        return self._extract(kg, Vertex(entity))  # type: ignore
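In this attrs-based version, `with_reverse` and `random_state` are keyword-only and `n_jobs` defaults to `None` (single process), so a concrete subclass is configured as below. A sketch assuming `RandomWalker` keeps the attribute order declared above:

```python
from pyrdf2vec.samplers import UniformSampler
from pyrdf2vec.walkers import RandomWalker

walker = RandomWalker(
    4,                  # max_depth (positional)
    10,                 # max_walks per entity (positional, optional)
    UniformSampler(),   # sampler; the factory default is a fresh instance
    n_jobs=2,           # -1 expands to multiprocessing.cpu_count()
    with_reverse=True,  # keyword-only: also walk towards parents
    random_state=22,    # keyword-only: propagated to the sampler
)
```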