Ejemplo n.º 1
0
def test_plt_exact_prediction_reproducibility():
    """Check that re-created PLT objects reproduce the trained model's precision@1.

    For every entry of ``model_configs`` a PLT model is trained in
    ``MODEL_PATH`` and its precision@1 on the test split is recorded. A new
    PLT object is then created on the same directory ``repeat`` times for
    every entry of ``representation_configs`` (without calling ``fit``
    again), and each one must yield exactly the same precision@1.

    The model directory is removed after each model config, even when an
    assertion or a training/prediction step fails, so a failure does not
    leave a stale model behind for the next config or test.
    """
    X_train, Y_train = load_dataset(TEST_DATASET, "train", root=TEST_DATA_PATH)
    X_test, Y_test = load_dataset(TEST_DATASET, "test", root=TEST_DATA_PATH)

    print("\n")
    for mc in model_configs:
        print("model config: ", mc)
        try:
            # Reference run: train once and record the score to compare against.
            plt = PLT(MODEL_PATH, **mc)
            plt.fit(X_train, Y_train)
            Y_pred = plt.predict(X_test, top_k=1)
            p_at_1 = precision_at_k(Y_test, Y_pred, k=1)

            for rc in representation_configs:
                print("  prediction config: ", rc)
                for _ in range(repeat):
                    # New object on the same directory, no fit(): the
                    # prediction relies on what the reference run stored.
                    plt = PLT(MODEL_PATH, **mc, **rc)
                    Y_pred = plt.predict(X_test, top_k=1)
                    # Exact equality is intended: the test is about
                    # bit-for-bit reproducibility, not closeness.
                    assert p_at_1 == precision_at_k(Y_test, Y_pred, k=1)
        finally:
            # Always clean up, otherwise a failed run leaks MODEL_PATH.
            shutil.rmtree(MODEL_PATH, ignore_errors=True)
Ejemplo n.º 2
0
def _test_model(model_class, model_config):
    """Train ``model_class`` with ``model_config`` and check its precision@1.

    The model is created in ``MODEL_PATH`` with ``seed=TEST_SEED``, fitted on
    the train split of ``TEST_DATASET`` and evaluated on its test split. The
    resulting precision@1 must lie strictly inside ``SCORE_RANGE``.

    Args:
        model_class: Model constructor, called as
            ``model_class(MODEL_PATH, seed=TEST_SEED, **model_config)``.
        model_config: Mapping of extra keyword arguments for the constructor.

    Raises:
        AssertionError: If precision@1 falls outside ``SCORE_RANGE``.
    """
    X_train, Y_train = load_dataset(TEST_DATASET, "train", root=TEST_DATA_PATH)
    X_test, Y_test = load_dataset(TEST_DATASET, "test", root=TEST_DATA_PATH)

    try:
        model = model_class(MODEL_PATH, seed=TEST_SEED, **model_config)
        model.fit(X_train, Y_train)

        Y_pred = model.predict(X_test, top_k=1)
        p_at_1 = precision_at_k(Y_test, Y_pred, k=1)

        assert SCORE_RANGE[0] < p_at_1 < SCORE_RANGE[1]
    finally:
        # Remove the model directory even when the check above fails, so the
        # next test does not start from a leftover model.
        shutil.rmtree(MODEL_PATH, ignore_errors=True)
Ejemplo n.º 3
0
def test_seed_reproducibility():
    """Check that two PLT models trained with the same seed are identical.

    For each seed in ``range(repeat)`` two models are trained independently
    (in ``MODEL_PATH + "-1"`` and ``MODEL_PATH + "-2"``) with the same
    settings. Their tree structures must contain the same elements and their
    precision@1 on the test split must be exactly equal.

    Both model directories are removed after every iteration, including when
    an assertion fails.
    """
    X_train, Y_train = load_dataset(TEST_DATASET, "train", root=TEST_DATA_PATH)
    X_test, Y_test = load_dataset(TEST_DATASET, "test", root=TEST_DATA_PATH)

    for i in range(repeat):
        try:
            plt_1 = PLT(MODEL_PATH + "-1", optimizer="adagrad", epochs=1, loss="log", seed=i)
            plt_1.fit(X_train, Y_train)
            Y_pred_1 = plt_1.predict(X_test, top_k=1)
            p_at_1_1 = precision_at_k(Y_test, Y_pred_1, k=1)
            tree_structure_1 = plt_1.get_tree_structure()

            plt_2 = PLT(MODEL_PATH + "-2", optimizer="adagrad", epochs=1, loss="log", seed=i)
            plt_2.fit(X_train, Y_train)
            Y_pred_2 = plt_2.predict(X_test, top_k=1)
            p_at_1_2 = precision_at_k(Y_test, Y_pred_2, k=1)
            tree_structure_2 = plt_2.get_tree_structure()

            # Compare in both directions: a one-sided difference would only
            # prove that tree 1 is a subset of tree 2 and would miss extra
            # elements present only in tree 2.
            assert set(tree_structure_1) == set(tree_structure_2)
            assert p_at_1_1 == p_at_1_2
        finally:
            # Clean up both directories even if the comparison fails.
            shutil.rmtree(MODEL_PATH + "-1", ignore_errors=True)
            shutil.rmtree(MODEL_PATH + "-2", ignore_errors=True)
Ejemplo n.º 4
0
def eval_data_for_dataset(X_train, Y_train, model_class):
    """Train ``model_class`` on the given data and check precision@1.

    Prints information about the training inputs, trains a model in
    ``MODEL_PATH`` (adagrad optimizer, one epoch, huffman tree), then
    evaluates it on the test split of ``TEST_DATASET`` and prints the
    training time, prediction time and precision@1.

    Note that the test split is always loaded from ``TEST_DATASET``,
    whatever training data is passed in, and that the lower bound of the
    accepted score is the literal ``0.25`` rather than ``SCORE_RANGE[0]``.

    Args:
        X_train: Training features, passed to ``model.fit``.
        Y_train: Training labels, passed to ``model.fit``.
        model_class: Model constructor, called with ``MODEL_PATH`` plus the
            fixed keyword arguments above.

    Raises:
        AssertionError: If precision@1 is not strictly between ``0.25`` and
            ``SCORE_RANGE[1]``.
    """
    print_data_info(X_train)
    print_data_info(Y_train)

    try:
        model = model_class(MODEL_PATH, optimizer="adagrad", epochs=1, tree_type="huffman")

        t_start = time()
        model.fit(X_train, Y_train)
        print(f"Train time: {time() - t_start}")

        X_test, Y_test = load_dataset(TEST_DATASET, "test", root=TEST_DATA_PATH)

        t_start = time()
        Y_pred = model.predict(X_test, top_k=1)
        print(f"Predict time: {time() - t_start}")

        p_at_1 = precision_at_k(Y_test, Y_pred, k=1)
        print(f"P@1: {p_at_1}")
        assert 0.25 < p_at_1 < SCORE_RANGE[1]
    finally:
        # Remove the model directory even when training, prediction or the
        # score check fails, so no stale model is left for the next call.
        shutil.rmtree(MODEL_PATH, ignore_errors=True)
Ejemplo n.º 5
0
# Use load_dataset function to load one of the benchmark datasets
# from XML Repository (http://manikvarma.org/downloads/XC/XMLRepository.html).
X_train, Y_train = load_dataset("eurlex-4k", "train")
X_test, Y_test = load_dataset("eurlex-4k", "test")

# Create a Probabilistic Labels Tree model.
# The directory "eurlex-model" will be created and used during model training.
# napkinXC stores already trained parts of the model there to save RAM.
# The model directory is the only required argument of the model constructors.
plt = PLT("eurlex-model")

# Fit the model on the training dataset.
# The model weights and additional data will be stored in the "eurlex-model" directory.
# The features matrix X must be a SciPy csr_matrix, a NumPy array, or a list of lists of
# (idx, value) tuples, while the labels matrix Y should be a list of lists or tuples
# containing the positive labels.
plt.fit(X_train, Y_train)

# After training, the model is not loaded into RAM.
# You can preload the model into RAM before performing prediction.
plt.load()

# Predict only the top five labels for each data point in the test dataset.
# This would also load the model if it were not loaded yet.
Y_pred = plt.predict(X_test, top_k=5)

# Evaluate the prediction with the precision at 5 measure.
print("Precision at k:", precision_at_k(Y_test, Y_pred, k=5))

# Unload the model from RAM.
# You can also just delete the object if you do not need it anymore.
plt.unload()
Ejemplo n.º 6
0
# The beginning is the same as in the basic.py example.

# Use load_dataset function to load one of the benchmark datasets
# from XML Repository (http://manikvarma.org/downloads/XC/XMLRepository.html).
X_train, Y_train = load_dataset("eurlex-4k", "train")
X_test, Y_test = load_dataset("eurlex-4k", "test")

# Create a PLT model with the "eurlex-model" directory.
# The directory will be created and used during model training for storing weights.
# napkinXC stores already trained parts of the model there to save RAM.
plt = PLT("eurlex-model")

# Fit the model on the training dataset.
# The model weights and additional data will be stored in the "eurlex-model" directory.
plt.fit(X_train, Y_train)

# Predict the top label for each test data point and report precision at 1.
Y_pred = plt.predict(X_test, top_k=1)
print("Precision at 1:", precision_at_k(Y_test, Y_pred, k=1))

# Delete the plt object; the trained model stays in the "eurlex-model" directory.
del plt

# To load the model, create a new PLT object with the same directory as the previous one.
new_plt = PLT("eurlex-model")

# Predict using the new model object (no fit() call is made on it)
# and report precision at 1 again for comparison with the value above.
Y_pred = new_plt.predict(X_test, top_k=1)
print("Precision at 1 after loading:", precision_at_k(Y_test, Y_pred, k=1))
Ejemplo n.º 7
0
from napkinxc.datasets import load_dataset
from napkinxc.models import PLT
from napkinxc.measures import precision_at_k

# Load the train and test splits of the "eurlex-4k" dataset.
X_train, Y_train = load_dataset("eurlex-4k", "train")
X_test, Y_test = load_dataset("eurlex-4k", "test")
# Create a PLT model in the "output/eurlex" directory with an explicit
# tree_type, arity=2 and a fixed seed.
plt = PLT("output/eurlex", tree_type='randomHyperplane', arity=2, seed=1234)
# Alternative tree_type values kept for quick switching: use one of the lines
# below instead of the line above to try a different tree type.
# plt = PLT("output/eurlex", tree_type='hierarchicalKmeans', arity=2, seed=1234)
# plt = PLT("output/eurlex", tree_type='balancedRandom', arity=2, seed=1234)
# Train the model, predict the top label for each test data point,
# and print the resulting precision at 1.
plt.fit(X_train, Y_train)
Y_pred = plt.predict(X_test, top_k=1)
print(precision_at_k(Y_test, Y_pred, k=1))