Exemple #1
0
# Build ml.MLAlgorithm in "regression" mode on the stacked feature columns.
# NOTE(review): den_training, eig_*_training, log_mass and saving_path are not
# defined in the visible part of this file.

# Column order is fixed by the tuple below; log_mass is the last column.
# Presumably ml.MLAlgorithm treats the last column as the target -- confirm.
training_features = np.column_stack(
    (den_training, eig_0_training, eig_1_training, eig_2_training, log_mass))
print(training_features.shape)

cv = True
# One third of the column count excluding the last column, truncated to int;
# offered below as one of the "max_features" candidates.
third_features = int((training_features.shape[1] - 1) / 3)
# Candidate values passed to ml.MLAlgorithm through its param_grid argument.
param_grid = {
    "n_estimators": [1000, 1300, 1600],
    "max_features": [third_features, "sqrt", 25, 40],
    "min_samples_leaf": [5, 15],
    # "criterion" is currently left out of the grid:
    #"criterion": ["mse", "mae"],
}

clf = ml.MLAlgorithm(training_features,
                     method="regression",
                     cross_validation=cv,
                     split_data_method=None,
                     n_jobs=60,
                     save=True,
                     path=saving_path + "classifier/classifier.pkl",
                     param_grid=param_grid)
# The search summary is only printed when cross-validation was requested.
if cv is True:
    print(clf.best_estimator)
    print(clf.algorithm.best_params_)
    print(clf.algorithm.best_score_)

# Persist the feature importances next to the saved classifier.
np.save(saving_path + "f_imp.npy", clf.feature_importances)
Exemple #2
0
# Load the stored training features, then rebuild the array so that its last
# column appears twice: (all columns but the last, last column, last column).
# Presumably the first copy acts as an extra input feature and the second as
# the label -- confirm against ml.MLAlgorithm.
features_training = np.load(
    "/home/lls/mlhalos_code/stored_files/50k_features.npy")
features_training = np.column_stack(
    (features_training[:, :-1], features_training[:,
                                                  -1], features_training[:,
                                                                         -1]))

# Same column duplication for the test features.
features_test = np.load(
    "/home/lls/mlhalos_code/stored_files/features_test.npy")
features_test = np.column_stack(
    (features_test[:, :-1], features_test[:, -1], features_test[:, -1]))

# train algorithm

algo = ml.MLAlgorithm(features_training,
                      split_data_method=None,
                      cross_validation=False,
                      num_cv_folds=10,
                      n_jobs=22)

# predict probabilities
# NOTE(review): this calls predict(), not predict_proba(), although the comment
# above and the output filename both say "probabilities"; another script in
# this file uses algo.classifier.predict_proba for the same step -- confirm
# which one is intended.

# Inputs are every column except the final one; the final column is kept as
# the reference labels.
pred = algo.classifier.predict(features_test[:, :-1])
true = features_test[:, -1]

np.save(
    "/home/lls/mlhalos_code/stored_files/true_label_feature/predicted_probabilities.npy",
    pred)
np.save(
    "/home/lls/mlhalos_code/stored_files/true_label_feature/true_labels.npy",
    true)
Exemple #3
0
# Halo-number range handed to InitialConditionsParameters.
min_halo = 0
max_halo = 400
# The lower mass scale is the summed particle mass of halo `max_halo` and the
# upper one that of halo `min_halo`.
# Presumably halos are indexed by decreasing mass -- confirm.
min_mass = ic_all.halo[max_halo]['mass'].sum()
max_mass = ic_all.halo[min_halo]['mass'].sum()
ic = parameters.InitialConditionsParameters(min_halo_number=min_halo,
                                            max_halo_number=max_halo,
                                            min_mass_scale=min_mass,
                                            max_mass_scale=max_mass)

# NOTE(review): feat_w_EPS is not used anywhere in the visible code.
feat_w_EPS = features.extract_labeled_features(initial_parameters=ic,
                                               add_EPS_label=True,
                                               n_samples=50000)

# Train the algorithm
# NOTE(review): features_training is not defined in the visible code; confirm
# it is the intended input here rather than feat_w_EPS.

algorithm_50k = ml.MLAlgorithm(features_training)

# Store the train/test arrays exposed by the trained object.
np.save('/Users/lls/Documents/CODE/stored_files/all_out/train_set_x.npy',
        algorithm_50k.X_train)
np.save('/Users/lls/Documents/CODE/stored_files/all_out/test_set_x.npy',
        algorithm_50k.X_test)
np.save('/Users/lls/Documents/CODE/stored_files/all_out/train_set_y.npy',
        algorithm_50k.y_train)
# The test-set labels are read from the `y_true` attribute.
np.save('/Users/lls/Documents/CODE/stored_files/all_out/test_set_y.npy',
        algorithm_50k.y_true)

# Make predictions on all particles other than the training set

# Keep the rows whose position is not listed in index_training.
features_left = features_full_mass[~np.in1d(np.arange(len(features_full_mass)
                                                      ), index_training)]
predicted_probabilities = algorithm_50k.classifier.predict_proba(
Exemple #4
0
    testing_ind = np.load("/share/data1/lls/regression/50k_testing_ids.npy")
except IOError:
    print("Generating training/testing indices and saving")
    # Draw 50000 distinct row indices. np.random.choice samples with
    # replacement by default, which would put duplicate particles in the
    # training set and leave it with fewer than 50000 unique rows.
    training_ind = np.random.choice(len(traj), 50000, replace=False)
    np.save("/share/data1/lls/regression/50k_training_ids.npy", training_ind)

    # Testing indices are every row position not selected for training.
    testing_ind = np.arange(len(traj))[~np.in1d(range(len(traj)), training_ind)]
    np.save("/share/data1/lls/regression/50k_testing_ids.npy", testing_ind)

# Training matrix: trajectory rows for the training indices with the matching
# halo mass appended as the last column.
feat_training = np.column_stack((traj[training_ind], halo_mass[training_ind]))
# Held-out inputs and reference values, taken from the testing indices.
X_test = traj[testing_ind]
y_test = halo_mass[testing_ind]
# The full arrays are not used again below (presumably dropped to free memory
# before training).
del traj
del halo_mass

cv = True
path_clf="/share/data1/lls/regression/CV/classifier/classifier.pkl"
clf = ml.MLAlgorithm(feat_training, method="regression", split_data_method=None, n_jobs=60, save=True,
                     cross_validation=cv, path=path_clf)

# The search summary is only printed when cross-validation was requested.
if cv is True:
    print(clf.best_estimator)
    print(clf.algorithm.best_params_)
    print(clf.algorithm.best_score_)
np.save("/share/data1/lls/regression/CV/f_imp.npy", clf.feature_importances)

# Predict on the held-out rows and store predictions alongside the true values.
y_predicted = clf.algorithm.predict(X_test)
np.save("/share/data1/lls/regression/CV/predicted_halo_mass.npy", y_predicted)
np.save("/share/data1/lls/regression/CV/true_halo_mass.npy", y_test)

            training_ind)
    # Testing indices are every row position not selected for training.
    testing_ind = np.arange(
        len(traj))[~np.in1d(range(len(traj)), training_ind)]
    # Store the testing indices under the testing-ids filename; the training
    # indices were previously written here by mistake.
    np.save("/share/data1/lls/try_classifier/50k_testing_ids.npy",
            testing_ind)

# Training matrix: trajectory rows for the training indices with the matching
# true label appended as the last column.
feat_training = np.column_stack(
    (traj[training_ind], true_labels[training_ind]))
# X_test = traj[testing_ind]
# y_test = halo_mass[testing_ind]
# The full arrays are not used again below.
del traj
del true_labels

# NOTE(review): cross_validation is not passed, so ml.MLAlgorithm's default
# applies; the prints below read best_params_/best_score_ unconditionally --
# confirm those attributes exist with the default.
clf = ml.MLAlgorithm(
    feat_training,
    method="classification",
    split_data_method=None,
    n_jobs=60,
    save=True,
    path="/share/data1/lls/regression/try_classifier/classifier.pkl")

print(clf.best_estimator)
print(clf.algorithm.best_params_)
print(clf.algorithm.best_score_)
np.save("/share/data1/lls/regression/try_classifier/f_imp.npy",
        clf.feature_importances)

# Prediction on the held-out rows is currently disabled:
# y_predicted = clf.algorithm.predict(X_test)
# np.save("/share/data1/lls/regression/predicted_halo_mass.npy", y_predicted)
# np.save("/share/data1/lls/regression/true_halo_mass.npy", y_test)
import sys
sys.path.append("/home/lls/mlhalos_code/scripts")
import numpy as np
from mlhalos import machinelearning as ml

density_shear_features = np.load("/home/lls/stored_files/shear_and_density/density_shear_features.npy")

# One row per repetition: column 0 holds the classifier's best_score_
# (stored as auc_validation below), column 1 the AUC on the held-out split.
aucs = np.zeros((10, 2))

for i in range(10):
    # Draw 100000 distinct rows. np.random.choice samples with replacement by
    # default, which could place the same row on both sides of the train/test
    # split made by ml.MLAlgorithm and bias the test AUC.
    index_training_i = np.random.choice(len(density_shear_features), 100000, replace=False)
    features = density_shear_features[index_training_i]

    trained_algo = ml.MLAlgorithm(features, split_data_method="train_test_split", train_size=50000, n_jobs=60)

    print(trained_algo.classifier.best_params_)

    auc_validation = trained_algo.classifier.best_score_
    auc_test = ml.get_auc_score(trained_algo.predicted_proba_test, trained_algo.true_label_test)

    aucs[i, 0] = auc_validation
    aucs[i, 1] = auc_test

np.save("/home/lls/stored_files/shear_and_density/aucs_val_test.npy", aucs)
    (den_features[training_ind], eig_0[training_ind],
     class_labels[training_ind]))
# X_test = np.column_stack((den_features[testing_ids], eig_0[testing_ids]))

cv = True
# Candidate values passed to ml.MLAlgorithm through its param_grid argument.
param_grid = {
    "n_estimators": [800, 1000, 1300],
    "max_features": ["auto", 0.4],
    "min_samples_leaf": [15, 5],
    "criterion": ["gini", "entropy"]
}

# NOTE(review): feat_training and saving_path are not fully defined in the
# visible part of this file.
clf = ml.MLAlgorithm(feat_training,
                     method="classification",
                     cross_validation=cv,
                     split_data_method=None,
                     n_jobs=60,
                     save=True,
                     param_grid=param_grid,
                     path=saving_path + "classifier/classifier.pkl")
# The search summary is only printed when cross-validation was requested.
if cv is True:
    print(clf.best_estimator)
    print(clf.algorithm.best_params_)
    print(clf.algorithm.best_score_)
np.save(saving_path + "f_imp.npy", clf.feature_importances)

# classify

# Rebinds `clf` to the object stored at the path passed to ml.MLAlgorithm above.
# NOTE(review): no joblib import is visible in this part of the file.
clf = joblib.load(saving_path + "classifier/classifier.pkl")
testing_ids = np.load(saving_path + "testing_ids.npy")
# Test inputs: density and eig_0 columns for the testing ids (no label column).
X_test = np.column_stack((den_features[testing_ids], eig_0[testing_ids]))
    # Load the full feature array and the stored training indices, select the
    # training rows and save that subset for later runs.
    density_shear_features = np.load(
        "/home/lls/stored_files/shear_and_density/density_shear_features.npy")
    index_training = np.load("/home/lls/stored_files/50k_features_index.npy")

    training_den_shear_features = density_shear_features[index_training, :]
    np.save(
        "/home/lls/stored_files/shear_and_density/training_density_shear_features.npy",
        training_den_shear_features)

# train only on density+prolateness

# Keep columns 0-49 and columns 100 onwards; columns 50-99 are dropped.
# Presumably the trailing slice still ends with the label column -- confirm.
training_features = np.column_stack(
    (training_den_shear_features[:, :50], training_den_shear_features[:,
                                                                      100:]))

trained_algo = ml.MLAlgorithm(
    training_features,
    split_data_method=None,
    n_jobs=60,
    save=True,
    path=
    "/home/lls/stored_files/shear_and_density/den+prol/classifier/classifier.pkl"
)

# NOTE(review): cross_validation is not passed; these attributes are read
# unconditionally -- confirm they exist with ml.MLAlgorithm's default.
print(trained_algo.classifier.best_score_)
print(trained_algo.classifier.best_estimator_)
print(trained_algo.classifier.best_params_)

np.save(
    "/home/lls/stored_files/shear_and_density/den+prol/feature_importances.npy",
    trained_algo.feature_importances)
# Disabled diagnostic: per-feature histograms of the first 50 columns, split
# by the value of the last column (1 -> "in", -1 -> "out").
# for i in range(50):
#     plt.hist(training_den_shear_features[np.where(training_den_shear_features[:,-1]==1)[0], i], label="in",
#              normed=True, histtype="step", bins=30)
#     plt.hist(training_den_shear_features[np.where(training_den_shear_features[:, -1] == -1)[0], i], label="out",
#              normed=True, histtype="step", bins=30)
#     plt.xlabel("feature " + str(i))
#     plt.legend(loc="best")
#     plt.savefig("/Users/lls/Documents/CODE/stored_files/all_out/distributions_50k/feature_" + str(i) +".pdf")
#     plt.clf()

# Disabled column selection (columns 50-99 plus the last column):
# training_den_shear_features = np.column_stack((training_den_shear_features[:,50:100], training_den_shear_features[:,
#                                                                                       -1]))

# Train on the full training_den_shear_features array with cross-validation.
trained_algo = ml.MLAlgorithm(
    training_den_shear_features,
    cross_validation=True,
    split_data_method=None,
    n_jobs=60,
    save=True,
    path=
    "/home/lls/stored_files/shear_and_density/full_eigenvalues/not_rescaled/"
    "classifier/classifier.pkl")

print(trained_algo.classifier.best_score_)
print(trained_algo.classifier.best_estimator_)
print(trained_algo.classifier.best_params_)

np.save(
    "/home/lls/stored_files/shear_and_density/full_eigenvalues/not_rescaled/feature_importances.npy",
    trained_algo.feature_importances)
Exemple #10
0
def train_algorithm(features):
    """Return an ``ml.MLAlgorithm`` built from ``features`` with default options."""
    return ml.MLAlgorithm(features)
"""
This should be done on hypatia.

"""
import numpy as np
from mlhalos import machinelearning as ml

# load training and test features with EPS label as feature

features_training = np.load("/home/lls/mlhalos_code/stored_files/with_EPS_label/50k_features_w_EPS_label.npy")
features_test = np.load("/home/lls/mlhalos_code/stored_files/with_EPS_label/features_w_EPS_test.npy")

# train algorithm

algo = ml.MLAlgorithm(features_training, split_data_method=None, num_cv_folds=10, n_jobs=22)


# predict probabilities

# Inputs are every column except the last; the last column is kept as the
# reference labels.
pred = algo.classifier.predict_proba(features_test[:, :-1])
true = features_test[:, -1]

np.save("/home/lls/mlhalos_code/stored_files/with_EPS_label/predicted_probabilities.npy", pred)
np.save("/home/lls/mlhalos_code/stored_files/with_EPS_label/true_labels.npy", true)


# save classifier details

# Feature importances are read from the search object's best estimator.
f_imp = algo.classifier.best_estimator_.feature_importances_
np.save("/home/lls/mlhalos_code/stored_files/with_EPS_label/feature_importances.npy", f_imp)
Exemple #12
0
def find_predicted_and_true_labels(trained_algorithm, features):
    """Predict probabilities for ``features`` and return them with the labels.

    The last column of ``features`` is taken as the true labels; every other
    column is passed to ``trained_algorithm.algorithm.predict_proba``.

    Returns:
        A ``(predicted_probabilities, true_labels)`` tuple.
    """
    inputs = features[:, :-1]
    probabilities = trained_algorithm.algorithm.predict_proba(inputs)
    labels = features[:, -1]
    return probabilities, labels


######################## SCRIPT ########################

# Script entry point: train on the "training" features, predict on the "test"
# features and save the predicted probabilities and true labels.
# NOTE(review): load_features is not defined in the visible part of this file.
if __name__ == "__main__":

    # Train the algorithm

    features_training = load_features(features_type="training")
    trained_classifier = ml.MLAlgorithm(features_training)

    # Make predictions on all particles other than the training set

    test_features = load_features(features_type="test")
    predicted_probabilities, true_labels = find_predicted_and_true_labels(
        trained_classifier, test_features)

    np.save(
        '/Users/lls/Documents/CODE/stored_files/all_out/predicted_probabilities.npy',
        predicted_probabilities)
    np.save('/Users/lls/Documents/CODE/stored_files/all_out/true_labels.npy',
            true_labels)

# # Plot feature importance
#
# Load the density trajectories used as input features.
# NOTE(review): training_ind, testing_ind and halo_mass are not defined in the
# visible part of this file.
traj = np.load(
    "/share/data1/lls/shear_quantities/quantities_id_ordered/density_trajectories.npy"
)

# Training matrix: trajectory rows for the training indices with the matching
# halo mass appended as the last column.
feat_training = np.column_stack((traj[training_ind], halo_mass[training_ind]))
X_test = traj[testing_ind]
# The full arrays are not used again below.
del traj
del halo_mass

# Cross-validation is switched off here, so the print block below is skipped.
cv = False
clf = ml.MLAlgorithm(
    feat_training,
    method="regression",
    cross_validation=cv,
    split_data_method=None,
    n_jobs=60,
    save=True,
    path=
    "/share/data1/lls/regression/balanced_training_set/classifier/classifier.pkl"
)

if cv is True:
    print(clf.best_estimator)
    print(clf.algorithm.best_params_)
    print(clf.algorithm.best_score_)
np.save("/share/data1/lls/regression/balanced_training_set/f_imp.npy",
        clf.feature_importances)

# classify

y_predicted = clf.algorithm.predict(X_test)
Exemple #14
0
        features_all)

#### TRAINING #####
# Select 50,000 particles to use as training set

# Sampled without replacement, so the 50000 training rows are distinct.
index_training = np.random.choice(len(features_all), size=50000, replace=False)
features_training = features_all[index_training]

# Save both the chosen indices and the selected rows.
np.save("/home/lls/stored_files/non_rescaled/features_training_index.npy",
        index_training)
np.save("/home/lls/stored_files/non_rescaled/50k_features_training.npy",
        features_training)

# Train the algorithm

RF = ml.MLAlgorithm(features_training, split_data_method=None, n_jobs=24)

#### PREDICT PROBABILITIES ON REMAINING PARTICLES IN THE BOX ######

# Keep the rows whose position is not listed in index_training.
features_left = features_all[~np.in1d(np.arange(len(features_all)
                                                ), index_training)]

# Inputs are every column except the last; the last column is kept as the
# reference labels.
predicted_probabilities = RF.classifier.predict_proba(features_left[:, :-1])
true_labels = features_left[:, -1]

np.save('/home/lls/stored_files/non_rescaled/pred_proba_features_left.npy',
        predicted_probabilities)
# NOTE(review): the filename starts with "rue_labels" -- possibly a typo for
# "true_labels"; check what downstream readers expect before renaming.
np.save('/home/lls/stored_files/non_rescaled/rue_labels_features_left.npy',
        true_labels)

#### EPS PREDICTIONS ON REMAINING PARTICLES IN THE BOX ######
Exemple #15
0
        # NOTE(review): the enclosing loop/function headers, and the names i,
        # f_imp_all, saving_path, den_training, eig_0_training and log_mass,
        # are defined outside the visible part of this file.
        # Column order is fixed by the tuple below; log_mass is the last column.
        training_features = np.column_stack(
            (den_training, eig_0_training, log_mass))
        print(training_features.shape)

        cv = True
        # One third of the column count excluding the last column, truncated
        # to int; offered below as one of the "max_features" candidates.
        third_features = int((training_features.shape[1] - 1) / 3)
        param_grid = {
            "n_estimators": [1000, 1300],
            "max_features": [third_features, "sqrt", 5, 10],
            "min_samples_leaf": [5, 15],
            # "criterion": ["mse", "mae"],
        }

        clf = ml.MLAlgorithm(training_features,
                             method="regression",
                             cross_validation=cv,
                             split_data_method=None,
                             n_jobs=60,
                             param_grid=param_grid)
        # The search summary is only printed when cross-validation was requested.
        if cv is True:
            print(clf.best_estimator)
            print(clf.algorithm.best_params_)
            print(clf.algorithm.best_score_)

        # Save this iteration's importances to their own file and also collect
        # them in row i of f_imp_all.
        np.save(saving_path + "f_imp_" + str(i) + ".npy",
                clf.feature_importances)
        f_imp_all[i] = clf.feature_importances

    # Save the importances collected over all iterations.
    np.save(saving_path + "f_imp_all.npy", f_imp_all)
import sys
sys.path.append("/Users/lls/Documents/mlhalos_code/")
import numpy as np
from mlhalos import machinelearning as ml

# Load the density trajectories (inputs) and per-particle halo masses.
traj = np.load("/Users/lls/Documents/mlhalos_files/stored_files/shear/shear_quantities/density_trajectories.npy")
halo_mass = np.load("/Users/lls/Documents/mlhalos_files/stored_files/halo_mass_particles.npy")

# Draw 50000 distinct row indices. np.random.choice samples with replacement
# by default, which would put duplicate particles in the training set.
training_ind = np.random.choice(len(traj), 50000, replace=False)

# Training matrix: trajectory rows with the matching halo mass appended as the
# last column.
feat_training = np.column_stack((traj[training_ind], halo_mass[training_ind]))

# NOTE(review): cross_validation is not passed; the prints below read the
# search attributes unconditionally -- confirm they exist with the default.
clf = ml.MLAlgorithm(feat_training, method="regression", split_data_method=None)
print(clf.best_estimator)
print(clf.algorithm.best_params_)
print(clf.algorithm.best_score_)