def test_plt_exact_prediction_reproducibility():
    """Check that every prediction representation reproduces the trained model's precision@1.

    For each entry of ``model_configs`` a PLT model is trained in ``MODEL_PATH``
    and its precision@1 on the test split is recorded. A fresh ``PLT`` object is
    then created over the same directory for every entry of
    ``representation_configs`` (``repeat`` times each), without refitting, and
    its precision@1 must equal the recorded value exactly.
    """
    X_train, Y_train = load_dataset(TEST_DATASET, "train", root=TEST_DATA_PATH)
    X_test, Y_test = load_dataset(TEST_DATASET, "test", root=TEST_DATA_PATH)

    print("\n")
    for mc in model_configs:
        print("model config: ", mc)
        try:
            # Reference run: train and evaluate with the plain model config.
            plt = PLT(MODEL_PATH, **mc)
            plt.fit(X_train, Y_train)
            Y_pred = plt.predict(X_test, top_k=1)
            p_at_1 = precision_at_k(Y_test, Y_pred, k=1)

            for rc in representation_configs:
                print(" prediction config: ", rc)
                for _ in range(repeat):
                    # New object over the same directory: prediction only, no fit.
                    plt = PLT(MODEL_PATH, **mc, **rc)
                    Y_pred = plt.predict(X_test, top_k=1)
                    assert p_at_1 == precision_at_k(Y_test, Y_pred, k=1)
        finally:
            # Always remove the model directory, even when an assertion fails,
            # so a stale model cannot leak into the next config or test.
            # NOTE(review): cleanup is assumed to run once per model config;
            # the original indentation was lost - confirm.
            shutil.rmtree(MODEL_PATH, ignore_errors=True)
def test_seed_reproducibility():
    """Check that two PLT models trained with the same seed are identical.

    For every seed in ``range(repeat)`` two models are trained in separate
    directories with the same hyper-parameters. Their tree structures must
    contain exactly the same entries and their precision@1 on the test split
    must be equal.
    """
    X_train, Y_train = load_dataset(TEST_DATASET, "train", root=TEST_DATA_PATH)
    X_test, Y_test = load_dataset(TEST_DATASET, "test", root=TEST_DATA_PATH)

    for i in range(repeat):
        try:
            plt_1 = PLT(MODEL_PATH + "-1", optimizer="adagrad", epochs=1, loss="log", seed=i)
            plt_1.fit(X_train, Y_train)
            Y_pred_1 = plt_1.predict(X_test, top_k=1)
            p_at_1_1 = precision_at_k(Y_test, Y_pred_1, k=1)
            tree_structure_1 = plt_1.get_tree_structure()

            plt_2 = PLT(MODEL_PATH + "-2", optimizer="adagrad", epochs=1, loss="log", seed=i)
            plt_2.fit(X_train, Y_train)
            Y_pred_2 = plt_2.predict(X_test, top_k=1)
            p_at_1_2 = precision_at_k(Y_test, Y_pred_2, k=1)
            tree_structure_2 = plt_2.get_tree_structure()

            # Compare in both directions: a one-sided set difference would still
            # pass if the second tree contained entries missing from the first.
            assert set(tree_structure_1) == set(tree_structure_2)
            assert p_at_1_1 == p_at_1_2
        finally:
            # Remove both model directories even when an assertion fails, so
            # the next seed (or test) never starts from leftover files.
            shutil.rmtree(MODEL_PATH + "-1", ignore_errors=True)
            shutil.rmtree(MODEL_PATH + "-2", ignore_errors=True)
# from XML Repository (http://manikvarma.org/downloads/XC/XMLRepository.html).
X_train, Y_train = load_dataset("eurlex-4k", "train")
X_test, Y_test = load_dataset("eurlex-4k", "test")

# Build a Probabilistic Labels Tree model.
# The "eurlex-model" directory will be created and used during model training;
# napkinXC stores already trained parts of the model to save RAM.
# The model directory is the only required argument of the model constructors.
plt = PLT("eurlex-model")

# Train the model on the training dataset.
# The model weights and additional data end up in the "eurlex-model" directory.
# The features matrix X must be a SciPy csr_matrix, a NumPy array, or a list of
# tuples of (idx, value); the labels matrix Y should be a list of lists or
# tuples containing the positive labels.
plt.fit(X_train, Y_train)

# After training, the model is not loaded to RAM.
# It can be preloaded explicitly before performing prediction.
plt.load()

# Predict only the five top labels for each data point in the test dataset.
# This call would also load the model if it were not loaded yet.
Y_pred = plt.predict(X_test, top_k=5)

# Evaluate the prediction with the precision at 5 measure.
precision = precision_at_k(Y_test, Y_pred, k=5)
print("Precision at k:", precision)

# Unload the model from RAM.
# Deleting the object is an alternative when it is no longer needed.
plt.unload()
# napkinXC stores already trained parts of the model to save RAM.
# The model directory is the only required argument of the model constructors.
plt = PLT("eurlex-model")

# Train the model on the training (observed) dataset.
# The model weights and additional data end up in the "eurlex-model" directory.
# The features matrix X must be a SciPy csr_matrix, a NumPy array, or a list of
# tuples of (idx, value); the labels matrix Y should be a list of lists or
# tuples containing the positive labels.
plt.fit(X_train, Y_train)

# After training, the model is not loaded to RAM.
# It can be preloaded explicitly before performing prediction.
plt.load()

# Standard prediction: five top labels for each test data point,
# found with the standard uniform-cost search.
Y_pred = plt.predict(X_test, top_k=5)

# Weighted prediction: compute inverse propensity values (aka propensity scores)
# from the training labels and pass them as label weights.
inv_ps = inverse_propensity(Y_train, A=0.55, B=1.5)
ps_Y_pred = plt.predict(X_test, labels_weights=inv_ps, top_k=5)

# Report vanilla and propensity-scored precision at 5 for both predictions,
# standard first and inverse-propensity-weighted second.
for title, prediction in (
    ("Standard prediction:", Y_pred),
    ("Prediction weighted by inverse propensity:", ps_Y_pred),
):
    print(title)
    print(" Precision at k:", precision_at_k(Y_test, prediction, k=5))
    print(" Propensity-scored precision at k:", psprecision_at_k(Y_test, prediction, inv_ps, k=5))
X_train, Y_train = load_dataset("eurlex-4k", "train") X_test, Y_test = load_dataset("eurlex-4k", "test") # Using sklearn, lets split training dataset into dataset for training and tuning thresholds for macro F-measure. X_train, X_valid, Y_train, Y_valid = train_test_split(X_train, Y_train, test_size=0.2, random_state=0) # Create Probabilistic Labels Tree model and fit it on the training dataset. # The model weights and additional data will be stored in "eurlex-model" directory. plt = PLT("eurlex-model") plt.fit(X_train, Y_train) # First lets check macro F1 measure performance with const. threshold = 0.5 Y_pred_single_th = plt.predict(X_test, threshold=0.5) print("Micro F1 measure with const. threshold = 0.5:", f1_measure(Y_test, Y_pred_single_th, average='micro', zero_division=0)) print("Macro F1 measure with const. threshold = 0.5:", f1_measure(Y_test, Y_pred_single_th, average='macro', zero_division=0)) # Now lets use Online F measure optimization procedure to find better thresholds. # OFO can be used to find optimal threshold/thresholds for micro F1 measure and macro F1 measure. micro_ths = plt.ofo(X_valid, Y_valid, type="micro", a=1, b=2, epochs=5) macro_ths = plt.ofo(X_valid, Y_valid, type="macro", a=1, b=2, epochs=10) # Lets predict with the new thresholds and compare the results Y_pred_micro_ths = plt.predict(X_test, threshold=micro_ths) Y_pred_macro_ths = plt.predict(X_test, threshold=macro_ths) print("Micro F1 measure with thresholds from OFO procedure:",
# This script starts the same way as the basic.py example.
# load_dataset loads one of the benchmark datasets
# from XML Repository (http://manikvarma.org/downloads/XC/XMLRepository.html).
X_train, Y_train = load_dataset("eurlex-4k", "train")
X_test, Y_test = load_dataset("eurlex-4k", "test")

# Build a PLT model over the "eurlex-model" directory.
# The directory will be created and used during model training for storing weights;
# napkinXC stores already trained parts of the models to save RAM.
plt = PLT("eurlex-model")

# Train the model on the training dataset.
# The model weights and additional data end up in the "eurlex-model" directory.
plt.fit(X_train, Y_train)

# Predict with the freshly trained model and report precision at 1.
Y_pred = plt.predict(X_test, top_k=1)
precision_before = precision_at_k(Y_test, Y_pred, k=1)
print("Precision at 1:", precision_before)

# Drop the plt object.
del plt

# Loading the model only takes a new PLT object
# created with the same directory as the previous one.
new_plt = PLT("eurlex-model")

# Predict with the new model object and report precision at 1 again.
Y_pred = new_plt.predict(X_test, top_k=1)
precision_after = precision_at_k(Y_test, Y_pred, k=1)
print("Precision at 1 after loading:", precision_after)