Ejemplo n.º 1
0
def test_train_test_split_fast():
    """Exercise ``train_test_split_no_unseen`` on the merged FB15k-237 data.

    Three scenarios are checked, each with a different second positional
    argument / ``allow_duplication`` combination:

    * ``0.90``: the two parts together have exactly as many rows as the
      input, and the training part contains every entity and relation.
    * ``0.99`` with ``allow_duplication=False``: the call raises, and the
      error message matches the expected text exactly.
    * ``0.99`` with ``allow_duplication=True``: the two parts together have
      more rows than the input, and the training part still contains every
      entity and relation.
    """
    def count_vocab(triples):
        # Distinct values in columns 0 and 2 (entities) and column 1 (relations).
        entities = set(triples[:, 0]).union(triples[:, 2])
        relations = set(triples[:, 1])
        return len(entities), len(relations)

    dataset = load_fb15k_237()
    x_all = np.concatenate([dataset[part] for part in ('train', 'valid', 'test')], 0)
    full_vocab = count_vocab(x_all)
    total_rows = x_all.shape[0]

    # Scenario 1: plain split, nothing lost and nothing duplicated.
    x_train, x_test = train_test_split_no_unseen(x_all, 0.90)
    assert x_train.shape[0] + x_test.shape[0] == total_rows
    assert count_vocab(x_train) == full_vocab

    # Scenario 2: the requested split cannot be built without duplication.
    expected_message = (
        "Cannot create a test split of the desired size. "
        "Some entities will not occur in both training and test set. "
        "Set allow_duplication=True,"
        "remove filter on test predicates or "
        "set test_size to a smaller value."
    )
    with pytest.raises(Exception) as excinfo:
        train_test_split_no_unseen(x_all, 0.99, allow_duplication=False)
    assert str(excinfo.value) == expected_message

    # Scenario 3: same request, but duplication is allowed.
    x_train, x_test = train_test_split_no_unseen(x_all, 0.99, allow_duplication=True)
    assert x_train.shape[0] + x_test.shape[0] > total_rows
    assert count_vocab(x_train) == full_vocab
Ejemplo n.º 2
0
def generate_model(X):
    """Train a ComplEx model on a split of ``X`` and save it to disk.

    ``X`` is split with ``train_test_split_no_unseen(X, test_size=100)``.
    The model is fitted on the training part only (``early_stopping=False``)
    and written to ``./best_model.pkl``.

    Returns:
        The test part of the split.
    """
    X_train, X_test = train_test_split_no_unseen(X, test_size=100)

    for label, part in (('Train set size: ', X_train),
                        ('Test set size: ', X_test)):
        print(label, part.shape)

    # Fixed hyper-parameters for the ComplEx model.
    hyperparams = {
        'batches_count': 100,
        'seed': 0,
        'epochs': 10,
        'k': 150,
        'eta': 5,
        'optimizer': 'adam',
        'optimizer_params': {'lr': 1e-3},
        'loss': 'multiclass_nll',
        'regularizer': 'LP',
        'regularizer_params': {'p': 3, 'lambda': 1e-5},
        'verbose': True,
    }
    model = ComplEx(**hyperparams)

    # Only let TensorFlow log at ERROR level from here on.
    tf.logging.set_verbosity(tf.logging.ERROR)

    model.fit(X_train, early_stopping=False)
    print("created the model")

    save_model(model, './best_model.pkl')
    return X_test
Ejemplo n.º 3
0
def kge(triples, kge_name, verbose):
    """Fit a knowledge-graph embedding model and optionally report metrics.

    Args:
        triples: Triples to split and train on; must support ``len()``.
        kge_name: Name of the model to build. Only ``'complex'`` is
            accepted; any other value terminates via ``sys.exit``.
        verbose: Passed to the model as its ``verbose`` flag. When truthy,
            the fitted model is also evaluated on the held-out part and
            MRR, MR and Hits@10/3/1 are printed.

    Returns:
        The model fitted on the training part of the split.
    """
    # Hold out 20% of the triples (rounded up).
    holdout_size = math.ceil(len(triples)*0.2)
    X_train, X_test = train_test_split_no_unseen(triples, test_size=holdout_size)

    # ComplEx is the only supported model.
    if kge_name != 'complex':
        sys.exit('Given kge_name is not valid.')

    hyperparams = {
        'batches_count': 50,
        'epochs': 300,
        'k': 100,
        'eta': 20,
        'optimizer': 'adam',
        'optimizer_params': {'lr': 1e-4},
        'loss': 'multiclass_nll',
        'regularizer': 'LP',
        'regularizer_params': {'p': 3, 'lambda': 1e-5},
        'seed': 0,
        'verbose': verbose,
    }
    model = ComplEx(**hyperparams)
    model.fit(X_train)

    if verbose:
        # Both parts of the split are handed over as the filter set.
        known_triples = np.concatenate((X_train, X_test))
        ranks = evaluate_performance(X_test,
                                     model=model,
                                     filter_triples=known_triples,
                                     use_default_protocol=True,
                                     verbose=True)

        # (output template, metric function, extra keyword arguments)
        report = (
            ("MRR: %.2f", mrr_score, {}),
            ("MR: %.2f", mr_score, {}),
            ("Hits@10: %.2f", hits_at_n_score, {'n': 10}),
            ("Hits@3: %.2f", hits_at_n_score, {'n': 3}),
            ("Hits@1: %.2f", hits_at_n_score, {'n': 1}),
        )
        for template, metric, extra in report:
            print(template % (metric(ranks, **extra)))

        # Fixed reference figures printed for comparison.
        print('''
        - Ampligraph example -
        MRR: 0.25
        MR: 4927.33
        Hits@10: 0.35
        Hits@3: 0.28
        Hits@1: 0.19
        ''')

    return model
Ejemplo n.º 4
0
def kge(triples, kge_name, epochs, batch_size, learning_rate, seed, verbose):
    """Return a KGE model, training and caching it on first use.

    If a file already exists at ``./temp/ampligraph.model`` the model is
    restored from it and returned without any training. Otherwise a model
    built by ``select_kge`` is fitted on all of ``triples``, saved to that
    path and returned.

    Args:
        triples: Triples to train on; must support ``len()`` and be
            convertible with ``np.array``.
        kge_name: Model identifier forwarded to ``select_kge``. It is used
            as passed in; it is no longer overwritten from the module-level
            ``parsed_args``.
        epochs: Forwarded to ``select_kge``.
        batch_size: Forwarded to ``select_kge``.
        learning_rate: Not used by this function; kept so existing callers
            keep working.
        seed: Forwarded to ``select_kge``.
        verbose: Forwarded to ``select_kge``. When truthy (and no saved
            model exists), a separate evaluation model is first fitted on
            an 80% training split and MRR, MR and Hits@10/3/1 on the
            remaining 20% are printed.

    Returns:
        The restored or newly trained model.
    """
    kge_model_savepath = './temp/ampligraph.model'

    if not os.path.isfile(kge_model_savepath):
        # Embedding evaluation
        if verbose:
            # Train test split: hold out 20% of the triples (rounded up).
            t_size = math.ceil(len(triples) * 0.2)
            X_train, X_test = train_test_split_no_unseen(triples,
                                                         test_size=t_size)

            # Separate model used only for evaluation; the model that is
            # returned is trained on the full data further below.
            eval_model = select_kge(kge_name, batch_size, epochs, seed,
                                    verbose)

            eval_model.fit(X_train)
            filter_triples = np.concatenate((X_train, X_test))
            ranks = evaluate_performance(X_test,
                                         model=eval_model,
                                         filter_triples=filter_triples,
                                         use_default_protocol=True,
                                         verbose=True)

            mrr = mrr_score(ranks)
            print("MRR: %.2f" % (mrr))
            mr = mr_score(ranks)
            print("MR: %.2f" % (mr))
            hits_10 = hits_at_n_score(ranks, n=10)
            print("Hits@10: %.2f" % (hits_10))
            hits_3 = hits_at_n_score(ranks, n=3)
            print("Hits@3: %.2f" % (hits_3))
            hits_1 = hits_at_n_score(ranks, n=1)
            print("Hits@1: %.2f" % (hits_1))

            # Fixed reference figures printed for comparison.
            print('''
            - Ampligraph example -
            MRR: 0.25
            MR: 4927.33
            Hits@10: 0.35
            Hits@3: 0.28
            Hits@1: 0.19
            ''')

        model = select_kge(kge_name, batch_size, epochs, seed, verbose)

        print('Training...')
        model.fit(np.array(triples))

        # Make sure the target directory exists so a finished training run
        # is not lost to a missing-directory error while saving.
        os.makedirs(os.path.dirname(kge_model_savepath), exist_ok=True)
        save_model(model, model_name_path=kge_model_savepath)
    else:
        model = restore_model(model_name_path=kge_model_savepath)

    return model
Ejemplo n.º 5
0
def test_train_test_split():
    """Check ``train_test_split_no_unseen`` on a seven-triple toy graph.

    With ``test_size=2`` and ``seed=0`` the split must produce exactly the
    expected training and test arrays, in order.
    """
    def as_triples(*rows):
        # Each row is a whitespace-separated "subject predicate object" string.
        return np.array([row.split() for row in rows])

    # Graph (the last triple appears twice on purpose)
    graph = as_triples('a y b', 'a y c', 'c y a',
                       'd y e', 'e y f', 'f y c',
                       'f y c')

    want_train = as_triples('a y b', 'c y a', 'd y e', 'e y f', 'f y c')
    want_test = as_triples('a y c', 'f y c')

    got_train, got_test = train_test_split_no_unseen(graph, test_size=2, seed=0)

    np.testing.assert_array_equal(got_train, want_train)
    np.testing.assert_array_equal(got_test, want_test)
import tensorflow as tf
# Notebook-style script: bare expressions such as the next line only display
# a value when run interactively.
ampligraph.__version__

# Load the triples from CSV and switch from a DataFrame to its array of values.
x = pd.read_csv('triplet.csv')
x = x.values
# Peek at the first five rows.
x[:5, ]

# Distinct values found in columns 0 and 2 combined.
entities = np.unique(np.concatenate([x[:, 0], x[:, 2]]))
entities

# Distinct values found in column 1.
relations = np.unique(x[:, 1])
relations

from ampligraph.evaluation import train_test_split_no_unseen 

# Two-stage split: first carve a test set out of all triples, then carve a
# validation set out of what remains. Both calls use test_size=3000.
X_train_valid, X_test = train_test_split_no_unseen(x, test_size=3000)
X_train, X_valid = train_test_split_no_unseen(X_train_valid, test_size=3000)
# X_train, X_test = train_test_split_no_unseen(x, test_size=3000)

# Report the shape of each part.
print('Train set size: ', X_train.shape)
print('Test set size: ', X_test.shape)
print('Validation set size: ', X_valid.shape)

from ampligraph.latent_features import ComplEx

model = ComplEx(batches_count=100, 
                seed=0, 
                epochs=300, 
                k=150, 
                eta=5,
                optimizer='adam', 
Ejemplo n.º 7
0
"""Relationships that link subject and object"""

relations = np.unique(X[:, 1])
relations

"""---
# 2. Defining train and test datasets
"""

from ampligraph.evaluation import train_test_split_no_unseen 

# Size of the test set: 20% of the rows of X, truncated to an integer.
num_test = int(len(X) * (20 / 100))

# Keep both parts of the split in one dict under the keys 'train' and 'test'.
data = {}
data['train'], data['test'] = train_test_split_no_unseen(X, test_size=num_test, seed=0, allow_duplication=False)

# Report the shape of each part.
print('Train set size: ', data['train'].shape)
print('Test set size: ', data['test'].shape)

"""---
# 3. Training ComplEx model
"""

import tensorflow
print(tensorflow.__version__)

from ampligraph.latent_features import DistMult

model = DistMult(batches_count=100, 
                seed=0, 
Ejemplo n.º 8
0
  # Client and the SPARQL endpoint.
  # NOTE(review): `timeout`, `default_graph_url`, `max_rows`, `graph`, `s`, `p`
  # and `o` are not defined in this excerpt; they must come from the
  # enclosing code.
  
  # NOTE(review): the endpoint address and port are hard-coded.
  endpoint = 'http://10.161.202.101:8890/sparql/'
  port = 8890
  output_format = HttpClientDataFormat.PANDAS_DF
  client = HttpClient(endpoint_url=endpoint, port=port, return_format=output_format, timeout=timeout,
                      default_graph_uri=default_graph_url, max_rows=max_rows)
  
  # Get all triples where the object is a URI
  dataset = graph.feature_domain_range(s, p, o).filter({o: ['isURI']})
  
  # Execute the query through the client; the result is stored in `df`.
  df = dataset.execute(client, return_format=output_format)
    
  # Train/test split and create ComplEx model from ampligraph library
  
  triples = df.to_numpy()
  X_train, X_test = train_test_split_no_unseen(triples, test_size=10000)
  
  # Use the ComplEx model to build the embedding; it is fitted on the
  # training part only.
  model = ComplEx(batches_count=50,epochs=300,k=100,eta=20, optimizer='adam',optimizer_params={'lr':1e-4}, 
          loss='multiclass_nll',regularizer='LP', regularizer_params={'p':3, 'lambda':1e-5}, seed=0,verbose=True)
  model.fit(X_train)
  
  # Evaluate the embedding model on the held-out part; both parts of the
  # split are passed as the filter set.
  filter_triples = np.concatenate((X_train, X_test))
  ranks = evaluate_performance(X_test, model=model, filter_triples=filter_triples,
                                use_default_protocol=True, verbose=True)
  # Summary scores computed from the ranks.
  mr  = mr_score(ranks)
  mrr = mrr_score(ranks)
Ejemplo n.º 9
0
    tournament = (row["match_id"], "inTournament", row["tournament_id"])
    city = (row["match_id"], "inCity", row["city_id"])
    country = (row["match_id"], "inCountry", row["country_id"])
    neutral = (row["match_id"], "isNeutral", row["neutral"])
    year = (row["match_id"], "atYear", row["date"][:4])

    triples.extend((home_team, away_team, score_home, score_away, tournament,
                    city, country, neutral, year, home_score, away_score))

# Put the collected triples into a DataFrame with named columns.
triples_df = pd.DataFrame(triples, columns=["subject", "predicate", "object"])
# Notebook-style inspection: rows where "Match3129" is the subject or object.
triples_df[(triples_df.subject == "Match3129") |
           (triples_df.object == "Match3129")]

from ampligraph.evaluation import train_test_split_no_unseen

# Split the triples, requesting test_size=10000; the second part is named
# X_valid here.
X_train, X_valid = train_test_split_no_unseen(np.array(triples),
                                              test_size=10000)

print('Train set size: ', X_train.shape)
# NOTE(review): the label says "Test set" but the value printed is X_valid.
print('Test set size: ', X_valid.shape)

from ampligraph.latent_features import ComplEx

import os
from ampligraph.utils import save_model, restore_model

import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
from ampligraph.evaluation import evaluate_performance

ke_model_path = "./models/football_ke.amplimodel"
if not os.path.isfile(ke_model_path):
Ejemplo n.º 10
0
def split(graph: iter, train_portion: float = 0.8):
    """Split ``graph`` into two parts with ``train_test_split_no_unseen``.

    Args:
        graph: Collection of triples. It must support ``len()`` and is
            passed unchanged to ``train_test_split_no_unseen``.
        train_portion: Fraction of ``graph`` to keep for training. The
            remaining ``1 - train_portion`` share, rounded down to a whole
            number of samples, is requested as the test size.

    Returns:
        The result of ``train_test_split_no_unseen`` called with the
        computed test size, ``seed=0`` and ``allow_duplication=False``.
    """
    # Binary floats make ``1 - 0.8`` evaluate to 0.19999999999999996, so a
    # plain ``int(10 * (1 - 0.8))`` truncates 1.9999999999999996 to 1 instead
    # of the intended 2. Rounding the representation noise away before
    # truncating keeps the round-down behaviour and yields the right count.
    test_share = len(graph) * (1 - train_portion)
    n_test_samples = int(round(test_share, 9))
    return train_test_split_no_unseen(graph,
                                      test_size=n_test_samples,
                                      seed=0,
                                      allow_duplication=False)
from ampligraph.evaluation import train_test_split_no_unseen
from ampligraph.latent_features import ComplEx
import tensorflow as tf
from ampligraph.evaluation import evaluate_performance
from ampligraph.utils import save_model, restore_model
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

if __name__ == "__main__":
    graph_file = "sample_Game_KG.nt"
    out_embeddings_file = "Game_KGE_model.pkl"
    embedding_dims = 100
    num_epochs = 300

    X = load_from_rdf('.', graph_file, rdf_format='nt', data_home='.')
    test_size = int(0.05 * X.shape[0])
    X_train, X_test = train_test_split_no_unseen(X, test_size=test_size)

    print('Train set size: ', X_train.shape)
    print('Test set size: ', X_test.shape)

    model = ComplEx(batches_count=100,
                    seed=0,
                    epochs=num_epochs,
                    k=embedding_dims,
                    eta=5,
                    optimizer='adam',
                    optimizer_params={'lr': 1e-3},
                    loss='multiclass_nll',
                    regularizer='LP',
                    regularizer_params={
                        'p': 3,