Example #1

# Imports assumed by this excerpt (the original omits them). create_model,
# CNNModel, FPSR, FNSR, compute_mse_compare, compute_nme, calculate_accuracy,
# and show_image are project-local helpers defined elsewhere in the repo.
import time

import numpy as np
import pandas as pd
import shap
from pygam import GAM, s
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from xbart import XBART


def run_feature_selector_algo(args, S, X_train, X_test, T_train, T_test, i,
                              model_fpsr, model_fnsr, model_msfe, model_mspe,
                              model_card, model_nme_train, model_nme_test):
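    # Runs the feature selector named by args.algo for Monte-Carlo iteration i
    # and fills column i of the metric arrays: selection errors (FPSR/FNSR),
    # selected-set cardinality, and train/test errors (MSFE/MSPE, NME).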
    log_params = False
    file_path_prefix = "./parameters/"
    feature_percentage = args.feature_percentage

    start_time = time.time()
    if args.algo == "RF":
        file_path = file_path_prefix + args.data + "/" + args.algo + "-" + str(
            i) + ".joblib"

        model = RandomForestRegressor(n_estimators=100)
        model = create_model(args, file_path, model, X_train, T_train)
        importance_vals = model.feature_importances_

        # Keep features with more than 1% importance, following this paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6660200/
        S_hat = np.argwhere(importance_vals > 0.01).flatten()

        if args.data == "MNIST" or args.data == "CIFAR-10":  # Take 40% features of MNIST only
            file_path = file_path_prefix + args.data + "/" + args.algo + "-" + feature_percentage + "_percent_features-" + str(
                i) + ".joblib"
            n_sub_feat_size = int(X_train.shape[1] * int(feature_percentage) /
                                  100)
            # For RF use this because of the already trained saved model in Sandipan's laptop
            # n_sub_feat_size = 315
            S_hat = np.argsort(
                importance_vals)[::-1][:n_sub_feat_size].flatten(
                )  #40% features
            model = RandomForestRegressor(n_estimators=100)
            model = create_model(args, file_path, model, X_train[:, S_hat],
                                 T_train)
            X_train = X_train[:, S_hat]
            X_test = X_test[:, S_hat]
        log_params = True

    elif args.algo == "DEEPLIFT":
        # Implemented using DeepExplain in SHAP: https://github.com/slundberg/shap
        #-------------------------------------------------------------------------#
        # Keep flat copies for column indexing later
        x_train = X_train
        x_test = X_test

        # Reshape flat vectors to images; this assumes MNIST-shaped
        # 28x28 grayscale inputs (CIFAR-10 would need a different shape)
        X_train = X_train.reshape(X_train.shape[0], 28, 28)
        X_test = X_test.reshape(X_test.shape[0], 28, 28)
        # Make sure images have shape (28, 28, 1)
        X_train = np.expand_dims(X_train, -1)
        X_test = np.expand_dims(X_test, -1)
        print("X_train shape:", X_train.shape)
        print(X_train.shape[0], "train samples")
        print(X_test.shape[0], "test samples")

        # Model / data parameters
        num_classes = 10
        input_shape = (28, 28, 1)
        """
        ## Build the model
        """

        model = CNNModel(num_classes, input_shape).create_cnn_model()
        model.summary()

        file_path = file_path_prefix + args.data + "/" + args.algo + "-" + str(
            i) + ".h5"
        """
        ## Train the model
        """

        batch_size = 128
        epochs = 15

        model.compile(loss="categorical_crossentropy",
                      optimizer="adam",
                      metrics=["accuracy"])
        model = create_model(args, file_path, model, X_train, T_train)

        # Sanity checks
        score_train = model.evaluate(X_train, T_train, verbose=0)
        score_test = model.evaluate(X_test, T_test, verbose=0)
        print("Train loss:", score_train[0], "- train accuracy:", score_train[1])
        print("Test loss:", score_test[0])
        print("Test accuracy:", score_test[1])

        # Background set of 100 training images used by DeepExplainer to
        # integrate out missing features
        background = X_train[np.random.choice(X_train.shape[0],
                                              100,
                                              replace=False)]
        e = shap.DeepExplainer(model, background)

        # Explain the model's predictions on a random sample of test images
        x_test_sample = X_test[np.random.choice(
            X_test.shape[0], int(args.deeplift_sample_size), replace=False), :]

        shap_values = e.shap_values(x_test_sample)

        total_val = np.sum(np.sum(np.abs(shap_values), axis=0),
                           axis=0).flatten()
        S_hat = total_val.argsort()[::-1]
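        # Note: S_hat is currently a full ranking (a permutation of every
        # input pixel ordered by aggregate |SHAP| value); it is truncated to
        # the top slice inside the branch below.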

        if args.data == "MNIST" or args.data == "CIFAR-10":  # Take 40% features of MNIST only
            X_train = x_train[:, S_hat]
            X_test = x_test[:, S_hat]
            X_train = X_train.reshape(X_train.shape[0], 28, 28)
            X_test = X_test.reshape(X_test.shape[0], 28, 28)
            # Make sure images have shape (28, 28, 1)
            X_train = np.expand_dims(X_train, -1)
            X_test = np.expand_dims(X_test, -1)
            file_path = file_path_prefix + args.data + "/" + args.algo + "-" + feature_percentage + "percent_features-" + str(
                i) + ".h5"
            n_sub_feat_size = int(X_train.shape[1] * int(feature_percentage) /
                                  100)
            S_hat = total_val.argsort()[::-1][:n_sub_feat_size]  #40% features
            model_new = CNNModel(num_classes, input_shape).create_cnn_model()
            model_new.compile(loss="categorical_crossentropy",
                              optimizer="adam",
                              metrics=["accuracy"])
            model = create_model(args, file_path, model_new, X_train, T_train)

        # Just to compare which global features SHAP with DeepLIFT chooses
        # X_train_ori =  loadmat("./mat_files/MNIST.mat")["train_x"].astype(np.float32)
        # show_image([X_train_ori[:,1],X_train_ori[:,20],X_train_ori[:,30]],S_hat[0:len(S)], (args.algo+str(i)))

        # show_image(x_train[1,:].flatten(),x_train[20,:].flatten(),x_train[30,:].flatten(),S_hat, (args.algo+str(i)))

        log_params = True

    elif args.algo == "BART":
        # Implemented using XBART: https://github.com/JingyuHe/XBART
        #----------------------------------------------------------#
        # Keep the original arrays for column indexing later
        x_train = X_train
        x_test = X_test

        X_train = pd.DataFrame(X_train)
        X_test = pd.DataFrame(X_test)

        # XBART's fit expects 1-D targets, so flatten
        T_train = T_train.flatten()
        T_test = T_test.flatten()

        file_path = file_path_prefix + args.data + "/" + args.algo + str(
            args.tree_size) + "-" + str(i) + ".joblib"
        model = XBART(num_trees=int(args.tree_size),
                      num_sweeps=20,
                      burnin=15,
                      verbose=True,
                      parallel=True)
        model = create_model(args, file_path, model, X_train, T_train)

        # Rank features by XBART importance and keep those above the 1% cut
        ranked = sorted(model.importance, key=model.importance.get, reverse=True)
        imp_vals = np.array([model.importance[f] for f in ranked])
        S_hat = np.array(ranked)[imp_vals > 0.01]

        if args.data == "MNIST" or args.data == "CIFAR-10":  # Take 40% features of MNIST only
            file_path = file_path_prefix + args.data + "/" + args.algo + "-" + feature_percentage + "_percent_features-" + str(
                i) + ".joblib"
            n_sub_feat_size = int(X_train.shape[1] * int(feature_percentage) /
                                  100)
            S_hat = sorted(
                model.importance,
                key=model.importance.get)[::-1][:
                                                n_sub_feat_size]  #40% features
            model = XBART(num_trees=int(args.tree_size),
                          num_sweeps=20,
                          burnin=15,
                          verbose=True,
                          parallel=True)
            X_train = pd.DataFrame(x_train[:, S_hat])
            X_test = pd.DataFrame(x_test[:, S_hat])
            model = create_model(args, file_path, model, X_train, T_train)

        # Restore 2-D targets: the shared metrics code below expects column vectors
        T_train = T_train.reshape(X_train.shape[0], 1)
        T_test = T_test.reshape(X_test.shape[0], 1)

        log_params = True

    elif args.algo == "POINTNET":
        import torch
        from torch.utils.data import DataLoader
        import kaolin as kal
        from kaolin import ClassificationEngine
        from kaolin.datasets import ModelNet
        from kaolin.models.PointNet import PointNetClassifier as PointNet
        import kaolin.transforms as tfs

        modelnet_path = './mat_files/ModelNet10'
        categories = ['chair', 'sofa']
        num_points = 1024
        device = 'cuda'

        transform = tfs.Compose([
            tfs.TriangleMeshToPointCloud(num_samples=num_points),
            tfs.NormalizePointCloud()
        ])

        train_loader = DataLoader(ModelNet(modelnet_path,
                                           categories=categories,
                                           split='train',
                                           transform=transform,
                                           device=device),
                                  batch_size=12,
                                  shuffle=True)
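
        # NOTE: this branch stops after building the training loader; no
        # classifier is fitted and no S_hat is selected, so log_params stays
        # False and the shared metrics block below is skipped.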

    elif args.algo == "GAM":  # Note GAM doesn't work on MNIST properly
        file_path = file_path_prefix + args.data + "/" + args.algo + "-" + str(
            i) + ".joblib"
        thershold = 0.01

        # Build one smoothing-spline term per feature
        gam_fn_form = s(0, n_splines=5)
        for feature in range(1, X_train.shape[1]):
            gam_fn_form += s(feature, n_splines=5)
        # Regression in GAM
        # https://pygam.readthedocs.io/en/latest/notebooks/tour_of_pygam.html#Regression
        model = GAM(gam_fn_form,
                    distribution='normal',
                    link='identity',
                    max_iter=10,
                    tol=0.001)
        model = create_model(args, file_path, model, X_train, T_train)

        # Keep features whose spline terms are statistically significant
        # (p-value below the threshold)
        p_values = np.array(model.statistics_['p_values'])
        S_hat = np.argwhere(p_values < threshold).flatten()

        log_params = True

    elif args.algo == "LASSO":
        file_path = file_path_prefix + args.data + "/" + args.algo + "-" + str(
            i) + ".joblib"

        threshold = 0.01
        #T_train = np.argmax(T_train, axis=1)
        #T_test = np.argmax(T_test, axis=1)

        model = linear_model.Lasso(alpha=0.01, max_iter=5000)
        model = create_model(args, file_path, model, X_train, T_train)

        # Keep features whose LASSO coefficients exceed the threshold
        S_hat = np.argwhere(model.coef_ > threshold).flatten()
        if args.data == "MNIST" or args.data == "CIFAR-10":  # Take 40% features of MNIST only
            file_path = file_path_prefix + args.data + "/" + args.algo + "-" + feature_percentage + "_percent_features-" + str(
                i) + ".joblib"
            n_sub_feat_size = int(X_train.shape[1] * int(feature_percentage) /
                                  100)
            S_hat = np.argsort(
                model.coef_)[::-1][:n_sub_feat_size].flatten()  #40% features
            model = linear_model.Lasso(alpha=0.01, max_iter=5000)
            model = create_model(args, file_path, model, X_train[:, S_hat],
                                 T_train)
            X_train = X_train[:, S_hat]
            X_test = X_test[:, S_hat]

        # Ugly hack otherwise vector norm not calculated
        #T_train = T_train.reshape(X_train.shape[0], 1)
        #T_test = T_test.reshape(X_test.shape[0], 1)

        log_params = True

    elif args.algo == "E-NET":
        file_path = file_path_prefix + args.data + "/" + args.algo + "-" + str(
            i) + ".joblib"

        # Collapse one-hot targets to class labels for the regression fit
        T_train = np.argmax(T_train, axis=1)
        T_test = np.argmax(T_test, axis=1)

        model = ElasticNet(alpha=0.01, l1_ratio=0.7)
        model = create_model(args, file_path, model, X_train, T_train)

        S_hat = np.argsort(model.coef_)

        log_params = False

    elif args.algo == "CORR":
        thershold = 0.01
        importance_vals = abs(np.dot((X_train.T), T_train).T)[::-1]
        S_hat = np.argsort(importance_vals > thershold).flatten()
        model_fpsr[0, i] = FPSR(S, S_hat)
        model_fnsr[0, i] = FNSR(S, S_hat)

        log_params = False
    elif args.algo == "SPINN":
        # https://github.com/jjfeng/spinn
        log_params = False
        print("Not yet implemented!")

    else:
        print("Sorry! No such evaluation exists.")

    if log_params:
        # Mean squared errors
        model_msfe[0, i] = compute_mse_compare(
            model.predict(X_train).reshape(T_train.shape), T_train)
        model_mspe[0, i] = compute_mse_compare(
            model.predict(X_test).reshape(T_test.shape), T_test)
        # Selection rate errors: FPSR = fraction of selected features outside
        # the true support S; FNSR = fraction of true-support features missed
        model_fpsr[0, i] = FPSR(S, S_hat)
        model_fnsr[0, i] = FNSR(S, S_hat)
        # Cardinality of the model
        model_card[0, i] = len(S_hat)
        # Normalized Error (NME)
        model_nme_train[0, i] = compute_nme(
            model.predict(X_train).reshape(T_train.shape), T_train)
        model_nme_test[0, i] = compute_nme(
            model.predict(X_test).reshape(T_test.shape), T_test)

        if args.algo == "BART":
            val = model.predict(X_train)
            normalized = (val - min(val)) / (max(val) - min(val))
            accuracy = np.sum([
                abs(0.9 * normalized - T_train.flatten()) < 0.2
            ]) / len(T_train.flatten())
            print("**********The train accuracy is: ", accuracy)
        else:
            print(
                "**********The train accuracy is: ",
                calculate_accuracy(
                    model.predict(X_train).reshape(T_train.shape).T,
                    T_train.T))

        if args.algo == "BART":
            val = model.predict(X_test)
            normalized = (val - min(val)) / (max(val) - min(val))
            accuracy = np.sum([abs(0.9 * normalized - T_test.flatten()) < 0.2
                               ]) / len(T_test.flatten())
            print("**********The test accuracy is: ", accuracy)
        else:
            print(
                "**********The test accuracy is: ",
                calculate_accuracy(
                    model.predict(X_test).reshape(T_test.shape).T, T_test.T))

    print("Time taken for this MC iteration: ", time.time() - start_time)


Example #2

# Imports assumed by this excerpt; x, y, xtest, and ytest are the train/test
# arrays defined earlier in the source notebook. The layer stack of
# build_model() was lost in extraction, so the Dense layers below are a
# minimal stand-in assumption.
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import mean_squared_error


def build_model():
    model = keras.Sequential([
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(1),
    ])
    optimizer = tf.keras.optimizers.RMSprop(0.001)

    model.compile(loss='mse', optimizer=optimizer, metrics=['mae', 'mse'])
    return model


model = build_model()

# check: run a single batch through the model to verify the output shape
example_batch = x[:10]
example_result = model.predict(example_batch)
print(example_result)

#training the model
EPOCHS = 10

history = model.fit(
    x,
    y,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=0,
)
# loss, mae, mse
model.evaluate(xtest, ytest, verbose=2)

ypreds = model.predict(xtest)
# mean squared error of the predictions
print(mean_squared_error(ytest, ypreds))
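
# For reference, the same quantity computed by hand (a sketch; assumes ytest
# and ypreds are array-likes of matching length):
import numpy as np
mse_manual = np.mean((np.asarray(ytest).ravel() - np.asarray(ypreds).ravel())**2)
print("manual MSE:", mse_manual)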
Example #3
    plt.legend()
    plt.show()

    return model


model = run_ann(train1[:, :10], train1[:, -1])

# ### **_Results of Artificial Neural Network_**

# In[42]:

model.evaluate(train1[:, :10], train1[:, -1])
model.evaluate(test1[:, :10], test1[:, -1])

# ##### In this case too, the test MSE is similar to the training MSE, so the model generalizes well.
#
# ##### The performance of the Neural Network is quite similar to that of Linear Regression, indicating that there is little non-linearity in the data.
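
# In[ ]:

# A minimal numeric check of the gap described above (a sketch; reuses model,
# train1, and test1 from this notebook, and assumes the compiled loss is MSE).
# evaluate() returns a list when extra metrics were compiled, so take the
# first element as the loss.
res_train = model.evaluate(train1[:, :10], train1[:, -1], verbose=0)
res_test = model.evaluate(test1[:, :10], test1[:, -1], verbose=0)
train_mse = res_train[0] if isinstance(res_train, list) else res_train
test_mse = res_test[0] if isinstance(res_test, list) else res_test
print("train MSE:", train_mse, "test MSE:", test_mse,
      "gap:", test_mse - train_mse)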

#
# ## _<u>Propose enhancements to the model: what would you do if you had more time?</u>_
#
# ### 1. Feature Engineering
# * Most of the features used in these models were taken directly from the existing dataset with minimal processing.
# * Creating new features and gauging their predictive power would be an important step toward improving model performance; a sketch follows below.
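
# In[ ]:

# A hypothetical illustration of that step: derive one candidate feature and
# gauge its predictive power with a univariate fit. The ratio feature below is
# an assumption for demonstration only, not part of the original dataset.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

candidate = (train1[:, 0] / (np.abs(train1[:, 1]) + 1e-9)).reshape(-1, 1)
probe = LinearRegression().fit(candidate, train1[:, -1])
print("univariate R^2 of candidate feature:",
      r2_score(train1[:, -1], probe.predict(candidate)))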
Example #4

# Imports assumed by this excerpt (the original omits them); x_train, y_train,
# x_test, and y_test are assumed to be pandas objects defined earlier.
from keras.models import Sequential
from keras.layers import Dense


predict_columns = ['elo', 'elo_recent', 'elo_surf', 'prob_g', 'prob_g_rec', 'lose12', 'p_gamma', 'p_gamma_rec', 'p_gamma_surf', 'p_gamma_time', 'set_score', 'match_score', 'p_gamma_rec_p5', 'p_gamma_rec_m5', 'd_dif', 'freq_home', 'freq_away', 'fatigue_home', 'fatigue_away', 'win_perc', 'set_perc', 'game_perc', '1st_lose_win', '1st_win_lose', 'p_gamma_simple', 'p_gamma_simplest', 'p_gamma_simple_surf', 'p_gamma_simplest_surf', 'age_dif']
model = Sequential()

# input_dim must match len(predict_columns) (29 features)
model.add(Dense(32, input_dim=29, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile model
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])

model.fit(x_train[predict_columns].values, y_train.values, epochs=400, batch_size=10)

scores = model.evaluate(x_train[predict_columns], y_train)

# Sequential.predict_proba was removed in recent Keras/TF versions; for a
# sigmoid output, predict() returns the probabilities directly
y_pred_keras = model.predict(x_test[predict_columns])

print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

x_test['prediction_keras'] = y_pred_keras

x_test.to_csv('x_test.csv', sep=';', decimal=",")

model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")