コード例 #1
0
ファイル: mutag.py プロジェクト: moissinac/pyRDF2Vec
def create_embeddings(
    kg: KG,
    entities: List[rdflib.URIRef],
    split: int,
    walkers: Sequence[Walker],
    sg: int = 1,
) -> Tuple[List[str], List[str]]:
    """Creates embeddings for a list of entities according to a knowledge
    graph and a walking strategy.

    Args:
        kg: The knowledge graph.
            The graph from which the neighborhoods are extracted for the
            provided instances.
        entities: The train and test instances to create the embedding.
        split: Split value for train and test embeddings.
        walkers: The list of walker strategies.
        sg: The training algorithm. 1 for skip-gram; otherwise CBOW.
            Defaults to 1.

    Returns:
        The train and test embeddings of the provided instances, split
        at index `split`.

    """
    transformer = RDF2VecTransformer(Word2Vec(sg=sg), walkers=walkers)
    # fit_transform returns one embedding per entity, in input order, so the
    # first `split` rows correspond to the training entities.
    walk_embeddings = transformer.fit_transform(kg, entities)
    return walk_embeddings[:split], walk_embeddings[split:]
コード例 #2
0
ファイル: lidl.py プロジェクト: Scharfi/KG-preprocessing
def create_embeddings(kg, entities, split, walker=WALKER, sg=1):
    """Creates embeddings for a list of entities according to a knowledge
    graph and a walking strategy, appending each embedding to CSV files.

    Args:
        kg (graph.KnowledgeGraph): The knowledge graph.
            The graph from which the neighborhoods are extracted for the
            provided instances.
        entities (array-like): The train and test instances to create the
            embedding.
        split (int): Split value for train and test embeddings.
        walker (walkers.Walker): The walking strategy.
            Defaults to WALKER.
        sg (int): The training algorithm. 1 for skip-gram; otherwise CBOW.
            Defaults to 1.

    Returns:
        tuple: The train and test embeddings of the provided instances,
            split at index `split`.

    """
    # BUG FIX: the original had a duplicated `transformer = transformer = ...`
    # assignment; a single assignment is sufficient.
    transformer = RDF2VecTransformer(sg=sg, walkers=walker)
    walk_embeddings = transformer.fit_transform(kg, entities)
    # Use context managers so the CSV files are flushed and closed even if an
    # error occurs (the original leaked both file handles).
    with open("embeddingsR2V.csv", "a", encoding="utf-8") as embedding, \
         open("embeddingsR2V_ID.csv", "a", encoding="utf-8") as embedding_id:
        for i, row in enumerate(walk_embeddings):
            # One CSV row per entity: "<entity>,v1,v2,..." and a parallel
            # file keyed by the positional index instead of the entity URI.
            values = ",".join(str(item) for item in row)
            embedding.write(str(entities[i]) + "," + values + "\n")
            embedding_id.write(str(i) + "," + values + "\n")
    # BUG FIX: the original returned slices based on `train_entities`, which
    # is undefined inside this function; the documented contract is `split`.
    return walk_embeddings[:split], walk_embeddings[split:]
コード例 #3
0
ファイル: example.py プロジェクト: wayne9qiu/pyRDF2Vec
def create_embeddings(kg, entities, split, walkers, sg=1):
    """Creates embeddings for a list of entities according to a knowledge
    graph and a walking strategy.

    Args:
        kg (graph.KnowledgeGraph): The knowledge graph.
            The graph from which the neighborhoods are extracted for the
            provided instances.
        entities (array-like): The train and test instances to create the
            embedding.
        split (int): Split value for train and test embeddings.
        walkers (list of walkers.Walker): The walking strategies.
        sg (int): The training algorithm. 1 for skip-gram; otherwise CBOW.
            Defaults to 1.

    Returns:
        tuple: The train and test embeddings of the provided instances,
            split at index `split`.

    """
    transformer = RDF2VecTransformer(walkers=walkers, sg=sg)
    # Embeddings come back in input order, so slicing at `split` separates
    # the train entities from the test entities.
    walk_embeddings = transformer.fit_transform(kg, entities)
    return walk_embeddings[:split], walk_embeddings[split:]
コード例 #4
0
ファイル: countries.py プロジェクト: Qawasmeh-omar/pyRDF2Vec
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker

# Ensure the determinism of this script by initializing a pseudo-random number.
RANDOM_STATE = 22

# NOTE(review): relies on pandas being imported as `pd` earlier in the file —
# confirm the import exists above this chunk.
data = pd.read_csv("samples/countries-cities/entities.tsv", sep="\t")

transformer = RDF2VecTransformer(
    # Use one worker thread for Word2Vec to ensure random determinism.
    # Must be used with PYTHONHASHSEED.
    Word2Vec(workers=1),
    # Extract a maximum of 10 walks of a maximum depth of 4 for each entity
    # using two processes and use a random state to ensure that the same walks
    # are generated for the entities.
    walkers=[RandomWalker(4, 10, n_jobs=2, random_state=RANDOM_STATE)],
    verbose=1,
)

# Train and save the Word2Vec model according to the KG, the entities, and
# a walking strategy.
embeddings, _ = transformer.fit_transform(
    # Defined the DBpedia endpoint server, as well as a set of predicates to
    # exclude from this KG.
    KG(
        "https://dbpedia.org/sparql",
        skip_predicates={"www.w3.org/1999/02/22-rdf-syntax-ns#type"},
        literals=[
            [
コード例 #5
0
ファイル: samplers.py プロジェクト: Qawasmeh-omar/pyRDF2Vec
    ("Inverse PageRank Split", PageRankSampler(inverse=True, split=True)),
]

print(f"Prediction of {len(test_entities)} entities:")

for _, sampler in samplers:
    embeddings, _ = RDF2VecTransformer(  # type:ignore
        # Use one worker threads for Word2Vec to ensure random determinism.
        # Must be used with PYTHONHASHSEED.
        Word2Vec(workers=1),
        # Extract a maximum of 100 walks of a maximum depth of 4 for each
        # entity using two processes and use a random state to ensure that the
        # same walks are generated for the entities.
        walkers=[
            RandomWalker(4, 100, sampler, n_jobs=2, random_state=RANDOM_STATE)
        ],
    ).fit_transform(
        KG(
            "samples/mutag/mutag.owl",
            skip_predicates={
                "http://dl-learner.org/carcinogenesis#isMutagenic"
            },
        ),
        entities,
    )

    train_embeddings = embeddings[:len(train_entities)]
    test_embeddings = embeddings[len(train_entities):]

    # Fit a Support Vector Machine on train embeddings and pick the best
    # C-parameters (regularization strength).
コード例 #6
0
from pyrdf2vec.graphs import KG
from pyrdf2vec.samplers import UniformSampler
from pyrdf2vec.walkers import RandomWalker, Walker
import numpy as np
import logging

# Log to a file with timestamps so this long-running embedding job can be
# traced after the fact.
logging.basicConfig(
    filename="rdf2vec.log",
    level=logging.INFO,
    format=
    '%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')

# Define the label predicates, all triples with these predicates
# will be excluded from the graph.
logging.info("Read in knowledge graph.")
label_predicates = []
# NOTE(review): `label_predicates` is an older pyRDF2Vec KG keyword; newer
# releases use `skip_predicates` — confirm against the installed version.
kg = KG(location="data/dbp_graph.ttl", label_predicates=label_predicates)

logging.info("Create walkers and transformers.")
# Random walks of depth 4, 5 walks per entity, with uniform edge sampling.
walkers = [RandomWalker(4, 5, UniformSampler())]
transformer = RDF2VecTransformer(Word2Vec(sg=1), walkers=walkers)

logging.info("Read in entities.")
# Entities should be a list of URIs that can be found in the Knowledge Graph.
entities = list(np.load("data/entities.npy", allow_pickle=True))
logging.info("Calculate embeddings.")
embeddings = transformer.fit_transform(kg, entities)
logging.info("Write embeddings to disk.")
np.save("data/embeddings.npy", embeddings)
logging.info("Finished job.")
コード例 #7
0
ファイル: literals.py プロジェクト: Qawasmeh-omar/pyRDF2Vec
test_labels = list(test_data["label_mutagenic"])

entities = train_entities + test_entities
labels = train_labels + test_labels

embeddings, literals = RDF2VecTransformer(
    # Ensure random determinism for Word2Vec.
    # Must be used with PYTHONHASHSEED.
    Word2Vec(workers=1, epochs=10),
    # Extract all walks with a maximum depth of 2 for each entity using two
    # processes and use a random state to ensure that the same walks are
    # generated for the entities.
    walkers=[RandomWalker(2, None, n_jobs=2, random_state=RANDOM_STATE)],
    verbose=1,
).fit_transform(
    KG(
        "samples/mutag/mutag.owl",
        skip_predicates={"http://dl-learner.org/carcinogenesis#isMutagenic"},
        literals=[
            [
                "http://dl-learner.org/carcinogenesis#hasAtom",
                "http://dl-learner.org/carcinogenesis#charge",
            ],
        ],
    ),
    entities,
)

train_embeddings = embeddings[: len(train_entities)]
test_embeddings = embeddings[len(train_entities) :]

print("\nWithout using literals:")
コード例 #8
0
test_labels = list(test_data["label_mutagenic"])

entities = train_entities + test_entities
labels = train_labels + test_labels

# Defines the MUTAG KG with the predicates to be skipped.
kg = KG(
    "samples/mutag/mutag.owl",
    skip_predicates={"http://dl-learner.org/carcinogenesis#isMutagenic"},
)

transformer = RDF2VecTransformer(
    # Ensure random determinism for Word2Vec.
    # Must be used with PYTHONHASHSEED.
    Word2Vec(workers=1),
    # Extract all walks with a maximum depth of 2 for each entity by using two
    # processes and a random state to ensure that the same walks are generated
    # for the entities.
    walkers=[RandomWalker(2, None, n_jobs=2, random_state=RANDOM_STATE)],
    verbose=1,
)
embeddings, _ = transformer.fit_transform(kg, entities)

transformer.save("mutag")

train_embeddings = embeddings[:len(train_entities)]
test_embeddings = embeddings[len(train_entities):]

# Fit a Support Vector Machine on train embeddings and pick the best
# C-parameters (regularization strength).
clf = GridSearchCV(SVC(random_state=RANDOM_STATE),
                   {"C": [10**i for i in range(-3, 4)]})
コード例 #9
0
    ("Inverse Object Frequency", ObjFreqSampler(inverse=True)),
    (
        "Inverse Object Frequency Split",
        ObjFreqSampler(inverse=True, split=True),
    ),
    ("Predicate Frequency", PredFreqSampler()),
    ("Inverse Predicate Frequency", PredFreqSampler(inverse=True)),
    ("Predicate + Object Frequency", ObjPredFreqSampler()),
    ("Inverse Predicate + Object Frequency", ObjPredFreqSampler(inverse=True)),
    ("PageRank", PageRankSampler()),
    ("Inverse PageRank", PageRankSampler(inverse=True)),
    ("PageRank Split", PageRankSampler(split=True)),
    ("Inverse PageRank Split", PageRankSampler(inverse=True, split=True)),
]

# Evaluate each sampling strategy end-to-end: embed the entities, train an
# SVM on the train split, and report accuracy on the test split.
for name, sampler in samplers:
    # Create embeddings with random walks (depth 4, up to 100 walks/entity).
    transformer = RDF2VecTransformer(walkers=[RandomWalker(4, 100, sampler)])
    walk_embeddings = transformer.fit_transform(kg, entities, verbose=True)

    # Split into train and test embeddings (entities were concatenated as
    # train entities followed by test entities).
    train_embeddings = walk_embeddings[:len(train_entities)]
    test_embeddings = walk_embeddings[len(train_entities):]

    # Fit a support vector machine on train embeddings and evaluate on test.
    clf = SVC(random_state=42)
    clf.fit(train_embeddings, train_labels)

    print(end=f"[{name}] Support Vector Machine: Accuracy = ")
    print(accuracy_score(test_labels, clf.predict(test_embeddings)))
コード例 #10
0
ファイル: literals.py プロジェクト: IBCNServices/pyRDF2Vec
embeddings, literals = RDF2VecTransformer(
    # Ensure random determinism for Word2Vec.
    # Must be used with PYTHONHASHSEED.
    Word2Vec(workers=1, epochs=10),
    # Extract all walks with a maximum depth of 2 for each entity using two
    # processes and use a random state to ensure that the same walks are
    # generated for the entities without hashing as MUTAG is a short KG.
    walkers=[
        HALKWalker(
            2,
            None,
            n_jobs=2,
            sampler=WideSampler(),
            random_state=RANDOM_STATE,
            # Keep full entity names in the walks instead of MD5 digests.
            md5_bytes=None,
        )
    ],
    verbose=1,
).fit_transform(
    # Skip the target predicate so the label cannot leak into the walks, and
    # extract several literal chains alongside the embeddings.
    KG(
        "samples/mutag/mutag.owl",
        skip_predicates={"http://dl-learner.org/carcinogenesis#isMutagenic"},
        literals=[
            [
                "http://dl-learner.org/carcinogenesis#hasAtom",
                "http://dl-learner.org/carcinogenesis#charge",
            ],
            ["http://dl-learner.org/carcinogenesis#salmonella"],
            ["http://dl-learner.org/carcinogenesis#cytogen_sce"],
            ["http://dl-learner.org/carcinogenesis#cytogen_ca"],
            ["http://dl-learner.org/carcinogenesis#mouse_lymph"],
            ["http://dl-learner.org/carcinogenesis#amesTestPositive"],
        ],
    ),
    entities,
)