def svmClassifier(X, y, cv_size=5):
    """Fit an SVM classifier with randomized hyperparameter search.

    @X This is the feature vector of type numpy
    @y labels of type numpy
    @cv_size The k-fold size for cross validation (default 5)
    @return Tuple of (fitted SVC object, best cross-validated accuracy score)

    Hyperparameters are tuned with RandomizedSearchCV: 5 candidate values
    each for C and gamma, drawn once from a reciprocal (log-uniform)
    distribution on [1, 100]."""

    temp_cls_ = SVC()

    # NOTE(review): .rvs(5) draws 5 *fixed* values up front, so the search
    # space is a 5x5 grid (25 combinations), not a continuous distribution;
    # n_iter=16 samples 16 of those combinations.
    parameters = {
        'C': reciprocal(1, 100).rvs(5),
        'gamma': reciprocal(1, 100).rvs(5),
        'random_state': [0]
    }

    param_tuner_ = RandomizedSearchCV(temp_cls_,
                                      parameters,
                                      cv=cv_size,
                                      n_iter=16)
    param_tuner_.fit(X, y)
    # best_estimator_ is already refit on (X, y) by RandomizedSearchCV's
    # default refit=True; the extra fit here is redundant but harmless.
    cls = param_tuner_.best_estimator_.fit(X, y)
    return cls, param_tuner_.best_score_
# Example #2
    def test_pdf(self):
        """Cross-check dist.pdf against scipy's reciprocal pdf (when scipy
        is available) and verify behaviour at and outside the support."""
        try:
            from scipy.stats import reciprocal
            from numpy.random import randint, uniform

            # Random support [a, b] with b > a >= 1.
            a = randint(1, 100)
            b = a + randint(1, 1000)
            d = dist(a, b)

            # pdf must agree with scipy at random points inside the support.
            for _ in range(100):
                x = uniform(a, b)
                self.assertAlmostEqual(d.pdf(x), reciprocal(a, b).pdf(x))

        except ImportError:
            pass  # ok, no luck checking things with scipy...

        # Outside [a, b] the density is exactly zero...
        d = dist(a=10, b=5000)
        self.assertEqual(d.pdf(0), 0.0)
        self.assertEqual(d.pdf(6000), 0.0)

        # ...and strictly positive at both endpoints.
        self.assertNotEqual(d.pdf(d.a), 0.0)
        self.assertGreater(d.pdf(d.a), 0.0)

        self.assertNotEqual(d.pdf(d.b), 0.0)
        self.assertGreater(d.pdf(d.b), 0.0)
def exercise9():
    """Train an SVC on MNIST (60k train / 10k test) with a randomized
    search over gamma (log-uniform) and C, then print test accuracy.

    The search is fit on a 1000-sample subset to keep runtime manageable.
    """
    # fetch_mldata was deprecated in scikit-learn 0.20 and removed in 0.22;
    # fetch_openml('mnist_784') is the supported replacement for the same
    # 70k-sample dataset (as_frame=False keeps the numpy-array interface).
    dataset = datasets.fetch_openml('mnist_784', version=1, as_frame=False)
    X = dataset['data']
    y = dataset['target']

    # Standard MNIST split: first 60000 train, rest test; shuffle the
    # training portion so cross-validation folds are not class-ordered.
    dv = 60000
    X_train, X_test = X[:dv], X[dv:]
    y_train, y_test = y[:dv], y[dv:]
    rnd_idx = np.random.permutation(dv)
    X_train, y_train = X_train[rnd_idx], y_train[rnd_idx]

    # Scale with statistics from the training set only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # svc = LinearSVC(multi_class='ovr')
    svc = SVC(decision_function_shape='ovr')
    param_distr = {'gamma': reciprocal(0.001, 0.1), 'C': uniform(1, 10)}
    search_cv = RandomizedSearchCV(svc,
                                   param_distributions=param_distr,
                                   n_iter=10)
    search_cv.fit(X_train_scaled[:1000], y_train[:1000])

    y_test_pred = search_cv.predict(X_test_scaled)
    print(accuracy_score(y_test, y_test_pred))
# Example #4
def tuning():
    """Randomized hyperparameter search over the Keras regression model.

    Relies on module-level X_train/X_valid/X_test data and build_model2.
    Prints the best parameters and the best model's test-set evaluation.
    """
    from scipy.stats import reciprocal
    from sklearn.model_selection import RandomizedSearchCV

    # learning_rate is sampled log-uniformly via reciprocal.
    param_distribs = {
        "n_hidden": [0, 1, 2, 3],
        "n_neurons": np.arange(1, 100),
        "learning_rate": reciprocal(3e-4, 3e-2),
    }

    keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model2)

    rnd_search_cv = RandomizedSearchCV(keras_reg,
                                       param_distribs,
                                       n_iter=10,
                                       cv=3,
                                       verbose=2)
    # fit kwargs (epochs, validation_data, callbacks) are forwarded to
    # the underlying Keras model by the scikit-learn wrapper.
    rnd_search_cv.fit(X_train,
                      y_train,
                      epochs=100,
                      validation_data=(X_valid, y_valid),
                      callbacks=[keras.callbacks.EarlyStopping(patience=10)])

    print(rnd_search_cv.best_params_)
    rnd_search_cv.score(X_test, y_test)
    # The raw Keras model inside the best wrapper.
    model3 = rnd_search_cv.best_estimator_.model
    print(model3.evaluate(X_test, y_test))


# tuning()
# model = keras.models.load_model("keras_seq_minst_model.h5") # rollback to best model
# print(model.evaluate(X_test, y_test))
# Example #5
def load_params_nn(p, label, features):
    """Build a hyperparameter search space for the network type *label*.

    :param p: dict of per-model-type configuration (upper bounds etc.)
    :param label: model type key ('cnn', 'lstm', 'gru', ...)
    :param features: '|'-separated feature string; only the count is used
    :return: dict of distributions/lists usable as a randomized-search space
    """
    from scipy.stats import reciprocal

    cfg = p[label]
    n_features = features.count('|') + 1

    # Base space shared by every model type.
    params = {
        'n_hidden': np.arange(1, cfg['n_hidden'] + 1),
        'n_neurons': np.arange(1, cfg['n_neurons'] + 1),
        'input_shape': [cfg['input_shape']],
        'dropout': np.linspace(0, cfg['dropout'], num=4),
        'features': [n_features],
    }

    # CNNs have no tunable learning rate here; everyone else samples it
    # log-uniformly between lr/100 and lr.
    if label != 'cnn':
        lr = cfg['learning_rate']
        params['learning_rate'] = reciprocal(lr / 100, lr)

    # Recurrent models additionally tune the recurrent dropout.
    if label in ('lstm', 'gru'):
        params['dropout_rec'] = np.linspace(0, cfg['dropout_rec'], num=4)

    # Convolutional models: conv-specific knobs, and wider neuron steps.
    if label == 'cnn':
        params['filters'] = cfg['filters']
        params['kernel_size'] = cfg['kernel_size']
        params['pool_size'] = list(range(1, cfg['pool_size']))
        params['n_neurons'] = np.arange(64, cfg['n_neurons'] + 1, 64)

    return params
# Example #6
    def _get_rand_space_vals(param_space):
        """
        Get user defined single hyperparameter and modify it to fit sklearn ParameterSampler input.
        :param param_space: (dict) user defined hyperparams space
        :return: (list/np.array) transformed user space definition to Distribution (scipy) or a list for categorical
        space
        """

        space = None

        if param_space['type'] == "static":
            space = [param_space['search_vals']]
        elif param_space['type'] == "categorical":
            space = param_space['search_vals']
        elif param_space['type'] == "normal":
            space = stats.norm(param_space['search_vals'][0],
                               param_space['search_vals'][1])
        elif param_space['type'] == "exp":
            space = stats.expon(param_space['search_vals'][0])
        elif param_space['type'] == "poisson":
            space = stats.poisson(param_space['search_vals'][0])
        elif param_space['type'] == "log-uniform":
            space = stats.reciprocal(param_space['search_vals'][0],
                                     param_space['search_vals'][1])
        elif param_space['type'] == "uniform":
            space = stats.uniform(param_space['search_vals'][0],
                                  param_space['search_vals'][1])
        elif param_space['type'] == "int-uniform":
            space = stats.randint(param_space['search_vals'][0],
                                  param_space['search_vals'][1])

        return space
# Example #7
def train_with_svr_random_search(data_prepared, labels):
    """
    exercise 2: tune an SVR with RandomizedSearchCV.

    C is sampled log-uniformly (reciprocal) over a wide range and gamma
    from an exponential distribution; the resulting RMSE and the best
    parameter combination are printed.

    :param data_prepared: preprocessed feature matrix
    :param labels: regression targets
    :return: None (results are printed)
    """
    param_distribs = {
        'kernel': ['linear', 'rbf'],
        'C': reciprocal(20, 200000),
        'gamma': expon(scale=1.0)
    }

    svm_reg = SVR()
    random_search = RandomizedSearchCV(svm_reg,
                                       param_distributions=param_distribs,
                                       n_iter=50,
                                       cv=5,
                                       scoring='neg_mean_squared_error',
                                       verbose=2,
                                       n_jobs=4,
                                       random_state=42)
    random_search.fit(data_prepared, labels)
    # best_score_ is a *negative* MSE, hence the sign flip under the sqrt.
    mse = random_search.best_score_
    print("rmse: ", np.sqrt(-mse))
    print("best params: ", random_search.best_params_)
# Example #8
    def __init__(self, name, params_dict):
        """Build a component and its per-parameter priors from config.

        :param name: component name
        :param params_dict: config with "values" mapping each parameter
            name to [fixed_flag, <unused here>, prior_name, arg1, arg2],
            plus optional "latex_names" / "units" lists of length
            ``self.npars``. Exits the process on any invalid config.
        """
        # Get component name
        self.name = name
        # Check that config dictionary has the correct number of entries
        if self.npars != len(params_dict["values"].keys()):
            print(
                f"Component needs to have {self.npars} number of parameters.")
            sys.exit(1)

        if "latex_names" in params_dict:
            if (self.npars != len(params_dict["latex_names"])):
                print(f"Component latex names must have size {self.npars}")
                sys.exit(1)
            self.parameter_latex_names = params_dict["latex_names"]
        if "units" in params_dict:
            if (self.npars != len(params_dict["units"])):
                print(f"Component units must have size {self.npars}")
                sys.exit(1)
            self.parameter_units = params_dict["units"]

        # Setup the priors and init parameters
        self.prior = []
        params_values = params_dict["values"]
        for pname in self.parameter_names:
            if params_values[pname][0]:
                print("Fixing parameter values is not yet supported")
                sys.exit(1)
            if params_values[pname][2] == "uniform":
                # scipy's uniform takes (loc, scale), i.e. (low, high - low).
                self.prior.append(
                    uniform(params_values[pname][3],
                            params_values[pname][4] - params_values[pname][3]))
            elif params_values[pname][2] == "normal":
                self.prior.append(
                    norm(params_values[pname][3], params_values[pname][4]))
            elif params_values[pname][2] == "reciprocal":
                # BUG FIX: the original tested `!= 0`, exiting for every
                # *valid* (non-zero) lower bound and accepting the invalid
                # zero one. The reciprocal (log-uniform) prior is undefined
                # at 0, so reject exactly the zero lower bound.
                if params_values[pname][3] == 0:
                    print(
                        "Reciprocal prior cannot have a starting interval value of 0.0"
                    )
                    sys.exit(1)
                self.prior.append(
                    reciprocal(params_values[pname][3],
                               params_values[pname][4]))
            else:
                print(
                    f"Parameter prior distribution chosen not recognized: {params_values[pname][2]}"
                )
                sys.exit(1)
        self.prior = np.asarray(self.prior)
def exercise5_10():
    """SVR on California housing: scale the features, run a randomized
    search over gamma (log-uniform) and C, then report train/test RMSE."""
    from sklearn.datasets import fetch_california_housing
    from sklearn.model_selection import train_test_split, RandomizedSearchCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVR
    from sklearn.metrics import mean_squared_error
    from scipy.stats import reciprocal, uniform

    housing = fetch_california_housing()
    X, y = housing["data"], housing["target"]

    train_x, test_x, train_y, test_y = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Scale using training-set statistics only.
    scaler = StandardScaler()
    train_x_scaled = scaler.fit_transform(train_x)
    test_x_scaled = scaler.transform(test_x)

    distributions = {
        "gamma": reciprocal(0.001, 0.1),
        "C": uniform(1, 10)
    }
    rnd_search = RandomizedSearchCV(SVR(),
                                    distributions,
                                    cv=3,
                                    n_iter=10,
                                    verbose=2,
                                    random_state=42,
                                    n_jobs=-1)
    rnd_search.fit(train_x_scaled, train_y)

    best_estimator = rnd_search.best_estimator_
    print("Best estimator : \n{}".format(best_estimator))

    def _rmse(actual, features):
        # Root-mean-squared error of the best estimator on a feature set.
        return np.sqrt(mean_squared_error(actual, best_estimator.predict(features)))

    print("Train RMSE : {}".format(_rmse(train_y, train_x_scaled)))

    print("Test RMSE : {}".format(_rmse(test_y, test_x_scaled)))
# Example #10
def random_search(model, cross_valid: int, iterations: int, random_state: int, X, y):
    """Run RandomizedSearchCV on *model* and return the best estimator.

    NOTE(review): param_distribs mixes hyperparameters from different
    estimator families (forest: n_estimators/max_features, SVM: C/gamma,
    linear: alpha). RandomizedSearchCV raises for parameters the given
    model does not accept — confirm the intended estimator supports all
    of these, or filter the dict per model.

    :param model: estimator to tune
    :param cross_valid: number of CV folds
    :param iterations: number of sampled parameter settings
    :param random_state: seed for the parameter sampler
    :param X: feature matrix
    :param y: targets
    :return: the refit best estimator
    """
    param_distribs = {
        'n_estimators': stats.randint(low=1, high=200),
        'max_features': stats.randint(low=1, high=8),
        'C': stats.reciprocal(20, 200000),
        'gamma': stats.expon(scale=1.0),
        'alpha': stats.uniform()
    }
    r_search = RandomizedSearchCV(model, param_distributions=param_distribs, n_iter=iterations, cv=cross_valid,
                                  scoring='neg_mean_squared_error', random_state=random_state)
    r_search.fit(X, y)

    # Scores are negated MSE, so negate again before taking the root.
    for mean_score, params in zip(r_search.cv_results_['mean_test_score'], r_search.cv_results_['params']):
        print(np.sqrt(-mean_score), params)

    return r_search.best_estimator_
# Example #11
    def test_continous_induced_measure_ppf(self):
        """Check induced-measure CDF/PPF round-trips for a Beta variable
        and a log-uniform variable: ppf(cdf(x)) must recover x."""
        degree = 2
        alpha_stat, beta_stat = 3, 3
        ab = jacobi_recurrence(
            degree+1, alpha=beta_stat-1, beta=alpha_stat-1, probability=True)

        tol = 1e-15
        var = stats.beta(alpha_stat, beta_stat, -5, 10)
        can_lb, can_ub = -1, 1
        lb, ub = var.support()
        print(lb, ub)
        cx = np.linspace(can_lb, can_ub, 51)

        def can_pdf(xx):
            # Map the canonical domain [-1, 1] onto the variable's support.
            loc, scale = lb+(ub-lb)/2, (ub-lb)/2
            return var.pdf(xx*scale+loc)*scale

        cdf_vals = continuous_induced_measure_cdf(
            can_pdf, ab, degree, can_lb, can_ub, tol, cx)
        assert np.all(cdf_vals <= 1.0)
        ppf_vals = continuous_induced_measure_ppf(
            var, ab, degree, cdf_vals, 1e-10, 1e-8)
        assert np.allclose(cx, ppf_vals)

        try:
            var = stats.loguniform(1.e-5, 1.e-3)
        except AttributeError:
            # BUG FIX: was a bare `except:`, which would also have hidden
            # real errors (even KeyboardInterrupt). Only scipy < 1.4
            # lacking stats.loguniform should fall back to reciprocal,
            # which is the same distribution under its older name.
            var = stats.reciprocal(1.e-5, 1.e-3)
        ab = get_recursion_coefficients_from_variable(var, degree+5, {})
        can_lb, can_ub = -1, 1
        cx = np.linspace(can_lb, can_ub, 51)
        lb, ub = var.support()

        def can_pdf(xx):
            loc, scale = lb+(ub-lb)/2, (ub-lb)/2
            return var.pdf(xx*scale+loc)*scale
        cdf_vals = continuous_induced_measure_cdf(
            can_pdf, ab, degree, can_lb, can_ub, tol, cx)
        # differences caused by root finding optimization tolerance
        assert np.all(cdf_vals <= 1.0)
        ppf_vals = continuous_induced_measure_ppf(
            var, ab, degree, cdf_vals, 1e-10, 1e-8)
        # import matplotlib.pyplot as plt
        # plt.plot(cx, cdf_vals)
        # plt.plot(ppf_vals, cdf_vals, 'r*', ms=2)
        # plt.show()
        assert np.allclose(cx, ppf_vals)
def exercise10():
    """Tune an RBF-kernel SVR on California housing with a randomized
    search over gamma (log-uniform) and C, then print the test RMSE."""
    dataset = datasets.fetch_california_housing()
    X, y = dataset['data'], dataset['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Scale with training-set statistics; cast to float32 first.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.astype(np.float32))
    X_test_scaled = scaler.transform(X_test.astype(np.float32))

    distributions = {'gamma': reciprocal(0.001, 0.1), 'C': uniform(1, 10)}
    search_cv = RandomizedSearchCV(SVR(),
                                   param_distributions=distributions,
                                   n_iter=10)
    search_cv.fit(X_train_scaled, y_train)

    predictions = search_cv.predict(X_test_scaled)
    print('rmse', np.sqrt(mean_squared_error(y_test, predictions)))
    print('best SVR', search_cv.best_estimator_)
# Example #13
class DBSCAN_Model(ModelRepresentationBase):
    """Search-space registration for the DBSCAN clustering wrapper."""

    klass = DBSCANWrapper
    category = StepCategories.Model

    type_of_variable = None  # TypeOfVariables.NUM

    # is_regression = False

    type_of_model = TypeOfProblem.CLUSTERING

    # Randomized-search space: eps is drawn log-uniformly (reciprocal);
    # the integer-valued hyperparameters use sp_randint ranges.
    custom_hyper = {
        "eps": reciprocal(1e-5, 1),
        "metric": ["minkowski"],
        "leaf_size": sp_randint(10, 100),
        "min_samples": sp_randint(1, 100),
        "p": sp_randint(1, 20),
        "scale_eps": [True],
    }

    # Clustering is unsupervised: y is not used during fitting.
    use_y = False
# Example #14
def reciprocal_dstr_example():
    """Demonstrate that reciprocal samples are uniform over the *log* of
    the value, which is why reciprocal suits scale-like hyperparameters."""
    from scipy.stats import reciprocal
    import matplotlib.pyplot as plt

    samples = reciprocal(20, 200).rvs(10000, random_state=42)

    plt.figure(figsize=(10, 4))
    plt.subplot(121)
    plt.title('Reciprocal distribution (scale=1.0)')
    plt.hist(samples, bins=50)
    plt.subplot(122)
    plt.title('Log of this distribution')
    plt.hist(np.log(samples), bins=50)
    plt.show()

    samples = np.array(samples)
    # Counts in two adjacent unit-wide intervals of log-space should be
    # comparable, illustrating the uniform-in-log property.
    count_log_3_4 = np.sum(np.logical_and(samples < np.exp(4), samples > np.exp(3)))
    count_log_4_5 = np.sum(np.logical_and(samples > np.exp(4), samples < np.exp(5)))
    print(count_log_3_4, count_log_4_5)
#
# ```
# $ tensorboard --logdir=tf_logs
# ```

# Now you can play around with the hyperparameters (e.g. the `batch_size` or the `learning_rate`) and run training again and again, comparing the learning curves. You can even automate this process by implementing grid search or randomized search. Below is a simple implementation of a randomized search on both the batch size and the learning rate. For the sake of simplicity, the checkpoint mechanism was removed.

# In[125]:

from scipy.stats import reciprocal

# Number of random hyperparameter draws to evaluate.
n_search_iterations = 10

for search_iteration in range(n_search_iterations):
    # Sample batch size uniformly and learning rate log-uniformly
    # (reciprocal), seeded per iteration for reproducibility.
    batch_size = np.random.randint(1, 100)
    learning_rate = reciprocal(0.0001, 0.1).rvs(random_state=search_iteration)

    n_inputs = 2 + 4
    logdir = log_dir("logreg")

    print("Iteration", search_iteration)
    print("  logdir:", logdir)
    print("  batch size:", batch_size)
    print("  learning_rate:", learning_rate)
    print("  training: ", end="")

    reset_graph()

    # NOTE(review): the snippet is truncated here — graph construction and
    # the training loop continue beyond this excerpt.
    X = tf.placeholder(tf.float32, shape=(None, n_inputs + 1), name="X")
    y = tf.placeholder(tf.float32, shape=(None, 1), name="y")
# Example #16
              callbacks=[keras.callbacks.EarlyStopping(patience=5)])
mse_test = keras_reg.score(X_test, y_test)
y_pred = keras_reg.predict(X_new)

#%%
# Fix seeds so the randomized search below is reproducible.
np.random.seed(42)
tf.random.set_seed(42)

#%%
from scipy.stats import reciprocal
from sklearn.model_selection import RandomizedSearchCV

# Search space: number of hidden layers, layer width, and a log-uniform
# (reciprocal) learning-rate range.
param_distribs = {
    "n_hidden": [0, 1, 2, 3],
    "n_neurons": np.arange(1, 100),
    "learning_rate": reciprocal(3e-4, 3e-2),
}

rnd_search_cv = RandomizedSearchCV(keras_reg,
                                   param_distribs,
                                   n_iter=10,
                                   cv=3,
                                   verbose=2)
# Keras fit kwargs are forwarded through the scikit-learn wrapper.
rnd_search_cv.fit(X_train,
                  y_train,
                  epochs=100,
                  validation_data=(X_valid, y_valid),
                  callbacks=[keras.callbacks.EarlyStopping(patience=10)])

#%%
rnd_search_cv.best_params_
# Example #17
# Training-set error of the linear SVR fitted earlier in the notebook.
y_pred = lin_svr.predict(X_train_scaled)
mse = mean_squared_error(y_train, y_pred)
mse

#RMSE

np.sqrt(mse)


#%% SVR with RBF kernel

from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

# gamma drawn log-uniformly (reciprocal), C uniformly on [1, 11).
param_distributions = {'gamma': reciprocal(0.001, 0.1), 
                       'C': uniform(1, 10)}

rnd_search_cv = RandomizedSearchCV(SVR(), param_distributions, n_iter=10, verbose=2, cv=3, random_state=42)
rnd_search_cv.fit(X_train_scaled, y_train)

rnd_search_cv.best_score_
rnd_search_cv.best_estimator_

# Training-set RMSE of the best RBF SVR found by the search.
y_pred = rnd_search_cv.best_estimator_.predict(X_train_scaled)
mse = mean_squared_error(y_train, y_pred)
mse
np.sqrt(mse)

#%% Predict the test set
def run_random_cv_for_SVM(X_train,
                          y_train,
                          parameter_svm,
                          pipe_run,
                          scorers,
                          refit_scorer_name,
                          number_of_samples=400,
                          kfolds=5,
                          n_iter_search=2000,
                          plot_best=20):
    '''
    Execute random search cv

    :args:
        :X_train: feature dataframe X
        :y_train: ground truth dataframe y
        :parameter_svm: Variable parameter range for C and gamma
        :pipe_run:  Pipe to run
        :scorers: Scorers
        :refit_scorer_name: Refit scorer name
        :number_of_samples: Number of samples to use from the training data. Default=400
        :kfolds: Number of folds for cross validation. Default=5
        :n_iter_search: Number of random search iterations. Default=2000
        :plot_best: Number of top results selected for narrowing the parameter range. Default=20

    :return: (parameter_svm, results, random_search_run) — the narrowed
        parameter limits, the formatted result table, and the fitted
        RandomizedSearchCV object

    '''

    # Extract data subset to train on
    X_train_subset, y_train_subset = modelutil.extract_data_subset(
        X_train, y_train, number_of_samples)

    # Main set of parameters for the grid search run 2: Select solver parameter
    # Reciprocal for the logarithmic range
    params_run = {
        'model__C':
        reciprocal(parameter_svm.loc['param_model__C']['min'],
                   parameter_svm.loc['param_model__C']['max']),
        'model__gamma':
        reciprocal(parameter_svm.loc['param_model__gamma']['min'],
                   parameter_svm.loc['param_model__gamma']['max'])
    }

    # K-Fold settings (stratified: preserves class proportions per fold)
    skf = StratifiedKFold(n_splits=kfolds)

    # run randomized search
    random_search_run = RandomizedSearchCV(pipe_run,
                                           param_distributions=params_run,
                                           n_jobs=-1,
                                           n_iter=n_iter_search,
                                           cv=skf,
                                           scoring=scorers,
                                           refit=refit_scorer_name,
                                           return_train_score=True,
                                           verbose=5).fit(
                                               X_train_subset, y_train_subset)

    print("Best parameters: ", random_search_run.best_params_)
    print("Best score: {:.3f}".format(random_search_run.best_score_))

    # Create the result table
    results = modelutil.generate_result_table(random_search_run, params_run,
                                              refit_scorer_name)

    # Get limits of the best values and focus in this area
    parameter_svm = generate_parameter_limits_for_SVM(results, plot_best)
    # display(parameter_svm)

    # Display results
    print(results.round(3).head(5))
    print(parameter_svm)

    return parameter_svm, results, random_search_run
# Example #19
def main(args=None):
    """Draw a simulated population of compact-binary sources.

    Builds mass/spin distributions for the requested population
    ('bns_'/'nsbh_'/'bbh_' crossed with '_astro'/'_broad'), draws
    intrinsic and extrinsic parameters out to the maximum detectable
    distance implied by the reference PSDs, and writes the result as a
    sim_inspiral table in a LIGO-LW XML document.

    :param args: optional argv list; parsed by this module's parser()
    """
    from ligo.lw import lsctables
    from ligo.lw import utils as ligolw_utils
    from ligo.lw import ligolw
    import lal.series
    from scipy import stats

    p = parser()
    args = p.parse_args(args)

    xmldoc = ligolw.Document()
    xmlroot = xmldoc.appendChild(ligolw.LIGO_LW())
    process = register_to_xmldoc(xmldoc, p, args)

    gwcosmo = GWCosmo(
        cosmology.default_cosmology.get_cosmology_from_string(args.cosmology))

    # Mass bounds (solar masses) for neutron stars and black holes.
    ns_mass_min = 1.0
    ns_mass_max = 2.0
    bh_mass_min = 5.0
    bh_mass_max = 50.0

    # 'astro' = astrophysically-motivated distributions; 'broad' = wide,
    # weakly-informed distributions over the same ranges.
    ns_astro_spin_min = -0.05
    ns_astro_spin_max = +0.05
    ns_astro_mass_dist = stats.norm(1.33, 0.09)
    ns_astro_spin_dist = stats.uniform(ns_astro_spin_min,
                                       ns_astro_spin_max - ns_astro_spin_min)

    ns_broad_spin_min = -0.4
    ns_broad_spin_max = +0.4
    ns_broad_mass_dist = stats.uniform(ns_mass_min, ns_mass_max - ns_mass_min)
    ns_broad_spin_dist = stats.uniform(ns_broad_spin_min,
                                       ns_broad_spin_max - ns_broad_spin_min)

    bh_astro_spin_min = -0.99
    bh_astro_spin_max = +0.99
    bh_astro_mass_dist = stats.pareto(b=1.3)
    bh_astro_spin_dist = stats.uniform(bh_astro_spin_min,
                                       bh_astro_spin_max - bh_astro_spin_min)

    bh_broad_spin_min = -0.99
    bh_broad_spin_max = +0.99
    # reciprocal = log-uniform over the BH mass range.
    bh_broad_mass_dist = stats.reciprocal(bh_mass_min, bh_mass_max)
    bh_broad_spin_dist = stats.uniform(bh_broad_spin_min,
                                       bh_broad_spin_max - bh_broad_spin_min)

    # Select per-component mass/spin bounds and distributions based on the
    # requested population.
    if args.distribution.startswith('bns_'):
        m1_min = m2_min = ns_mass_min
        m1_max = m2_max = ns_mass_max
        if args.distribution.endswith('_astro'):
            x1_min = x2_min = ns_astro_spin_min
            x1_max = x2_max = ns_astro_spin_max
            m1_dist = m2_dist = ns_astro_mass_dist
            x1_dist = x2_dist = ns_astro_spin_dist
        elif args.distribution.endswith('_broad'):
            x1_min = x2_min = ns_broad_spin_min
            x1_max = x2_max = ns_broad_spin_max
            m1_dist = m2_dist = ns_broad_mass_dist
            x1_dist = x2_dist = ns_broad_spin_dist
        else:  # pragma: no cover
            assert_not_reached()
    elif args.distribution.startswith('nsbh_'):
        m1_min = bh_mass_min
        m1_max = bh_mass_max
        m2_min = ns_mass_min
        m2_max = ns_mass_max
        if args.distribution.endswith('_astro'):
            x1_min = bh_astro_spin_min
            x1_max = bh_astro_spin_max
            x2_min = ns_astro_spin_min
            x2_max = ns_astro_spin_max
            m1_dist = bh_astro_mass_dist
            m2_dist = ns_astro_mass_dist
            x1_dist = bh_astro_spin_dist
            x2_dist = ns_astro_spin_dist
        elif args.distribution.endswith('_broad'):
            x1_min = bh_broad_spin_min
            x1_max = bh_broad_spin_max
            x2_min = ns_broad_spin_min
            x2_max = ns_broad_spin_max
            m1_dist = bh_broad_mass_dist
            m2_dist = ns_broad_mass_dist
            x1_dist = bh_broad_spin_dist
            x2_dist = ns_broad_spin_dist
        else:  # pragma: no cover
            assert_not_reached()
    elif args.distribution.startswith('bbh_'):
        m1_min = m2_min = bh_mass_min
        m1_max = m2_max = bh_mass_max
        if args.distribution.endswith('_astro'):
            x1_min = x2_min = bh_astro_spin_min
            x1_max = x2_max = bh_astro_spin_max
            m1_dist = m2_dist = bh_astro_mass_dist
            x1_dist = x2_dist = bh_astro_spin_dist
        elif args.distribution.endswith('_broad'):
            x1_min = x2_min = bh_broad_spin_min
            x1_max = x2_max = bh_broad_spin_max
            m1_dist = m2_dist = bh_broad_mass_dist
            x1_dist = x2_dist = bh_broad_spin_dist
        else:  # pragma: no cover
            assert_not_reached()
    else:  # pragma: no cover
        assert_not_reached()

    dists = (m1_dist, m2_dist, x1_dist, x2_dist)

    # Read PSDs
    psds = list(
        lal.series.read_psd_xmldoc(
            ligolw_utils.load_fileobj(
                args.reference_psd,
                contenthandler=lal.series.PSDContentHandler)).values())

    # Construct mass1, mass2, spin1z, spin2z grid.
    m1 = np.geomspace(m1_min, m1_max, 10)
    m2 = np.geomspace(m2_min, m2_max, 10)
    x1 = np.linspace(x1_min, x1_max, 10)
    x2 = np.linspace(x2_min, x2_max, 10)
    params = m1, m2, x1, x2

    # Calculate the maximum distance on the grid.
    max_z = gwcosmo.get_max_z(psds,
                              args.waveform,
                              args.f_low,
                              args.min_snr,
                              m1,
                              m2,
                              x1,
                              x2,
                              jobs=args.jobs)
    if args.max_distance is not None:
        new_max_z = cosmology.z_at_value(gwcosmo.cosmo.luminosity_distance,
                                         args.max_distance * units.Mpc)
        max_z[max_z > new_max_z] = new_max_z
    max_distance = gwcosmo.sensitive_distance(max_z).to_value(units.Mpc)

    # Find piecewise constant approximate upper bound on distance.
    max_distance = cell_max(max_distance)

    # Calculate V * T in each grid cell
    cdfs = [dist.cdf(param) for param, dist in zip(params, dists)]
    cdf_los = [cdf[:-1] for cdf in cdfs]
    cdfs = [np.diff(cdf) for cdf in cdfs]
    probs = np.prod(np.meshgrid(*cdfs, indexing='ij'), axis=0)
    probs /= probs.sum()
    probs *= 4 / 3 * np.pi * max_distance**3
    volume = probs.sum()
    probs /= volume
    probs = probs.ravel()

    volumetric_rate = args.nsamples / volume * units.year**-1 * units.Mpc**-3

    # Draw random grid cells
    dist = stats.rv_discrete(values=(np.arange(len(probs)), probs))
    indices = np.unravel_index(dist.rvs(size=args.nsamples),
                               max_distance.shape)

    # Draw random intrinsic params from each cell
    cols = {}
    cols['mass1'], cols['mass2'], cols['spin1z'], cols['spin2z'] = [
        dist.ppf(stats.uniform(cdf_lo[i], cdf[i]).rvs(size=args.nsamples))
        for i, dist, cdf_lo, cdf in zip(indices, dists, cdf_los, cdfs)
    ]

    # Swap binary components as needed to ensure that mass1 >= mass2.
    # Note that the .copy() is important.
    # See https://github.com/numpy/numpy/issues/14428
    swap = cols['mass1'] < cols['mass2']
    cols['mass1'][swap], cols['mass2'][swap] = \
        cols['mass2'][swap].copy(), cols['mass1'][swap].copy()
    cols['spin1z'][swap], cols['spin2z'][swap] = \
        cols['spin2z'][swap].copy(), cols['spin1z'][swap].copy()

    # Draw random extrinsic parameters
    cols['distance'] = stats.powerlaw(
        a=3, scale=max_distance[indices]).rvs(size=args.nsamples)
    cols['longitude'] = stats.uniform(0, 2 * np.pi).rvs(size=args.nsamples)
    cols['latitude'] = np.arcsin(stats.uniform(-1, 2).rvs(size=args.nsamples))
    cols['inclination'] = np.arccos(
        stats.uniform(-1, 2).rvs(size=args.nsamples))
    cols['polarization'] = stats.uniform(0, 2 * np.pi).rvs(size=args.nsamples)
    cols['coa_phase'] = stats.uniform(-np.pi,
                                      2 * np.pi).rvs(size=args.nsamples)
    cols['time_geocent'] = stats.uniform(1e9, units.year.to(
        units.second)).rvs(size=args.nsamples)

    # Convert from sensitive distance to redshift and comoving distance.
    # FIXME: Replace this brute-force lookup table with a solver.
    z = np.linspace(0, max_z.max(), 10000)
    ds = gwcosmo.sensitive_distance(z).to_value(units.Mpc)
    dc = gwcosmo.cosmo.comoving_distance(z).to_value(units.Mpc)
    z_for_ds = interp1d(ds, z, kind='cubic', assume_sorted=True)
    dc_for_ds = interp1d(ds, dc, kind='cubic', assume_sorted=True)
    zp1 = 1 + z_for_ds(cols['distance'])
    cols['distance'] = dc_for_ds(cols['distance'])

    # Apply redshift factor to convert from comoving distance and source frame
    # masses to luminosity distance and observer frame masses.
    for key in ['distance', 'mass1', 'mass2']:
        cols[key] *= zp1

    # Populate sim_inspiral table
    sims = xmlroot.appendChild(lsctables.New(lsctables.SimInspiralTable))
    for row in zip(*cols.values()):
        sims.appendRow(**dict(dict.fromkeys(sims.validcolumns, None),
                              process_id=process.process_id,
                              simulation_id=sims.get_next_id(),
                              waveform=args.waveform,
                              f_lower=args.f_low,
                              **dict(zip(cols.keys(), row))))

    # Record process end time.
    process.comment = str(volumetric_rate)
    process.set_end_time_now()

    # Write output file.
    write_fileobj(xmldoc, args.output)
# Example #20
def get_percentile_distr():
    """Return percentiles concentrated toward the extremes.

    The values are geometrically spaced: the upper range (50-100) comes
    from ``101 - geomspace(1, 51)`` and the lower range (5-50) from
    ``geomspace(5, 50)``, so the middle of the scale is sampled more
    sparsely than the two ends.
    """
    upper_tail = (101.0 - np.geomspace(1, 51, 20)).astype(int)
    lower_tail = np.geomspace(5, 50, 10).astype(int)
    return np.concatenate([upper_tail, lower_tail])


# define all the tunable params for each of them
# (search spaces, presumably consumed by a randomized hyper-parameter
# search over a Pipeline whose classifier step is named 'classify' --
# hence the 'classify__' key prefix)

LOGISTIC_TUNABLE = [{
    'classify__penalty':
    ['l1', 'l2'],  # l1 and l2 regularization, l1 introduces sparsity(lasso)
    # C sampled log-uniformly over [0.1, 1e4] via the reciprocal distribution
    'classify__C': stats.reciprocal(a=1e-1, b=1e4)
}]

SVM_TUNABLE = [{
    'classify__penalty': ['l2'],  # only l2 is tuned for the SVM
    'classify__C': stats.reciprocal(a=1e-1, b=1e4),
}]

# Tree-based models: only tune how many features each split considers.
DECISION_TREE_TUNABLE = [{'classify__max_features': ['sqrt', 'log2']}]

RANDOM_FOREST_TUNABLE = [{'classify__max_features': ['sqrt', 'log2']}]

DEEP_TUNABLE = [{
    'classify__optimizer': ['adagrad', 'adam', 'rmsprop'],
    'classify__activation': ['relu', 'selu', 'sigmoid', 'tanh'],
    'classify__dropout':
# Plot the reciprocal pdf between its 1st and 99th percentiles
# (a, b and ax are defined earlier in this example).
x = np.linspace(reciprocal.ppf(0.01, a, b), reciprocal.ppf(0.99, a, b), 100)
ax.plot(x,
        reciprocal.pdf(x, a, b),
        'r-',
        lw=5,
        alpha=0.6,
        label='reciprocal pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:

rv = reciprocal(a, b)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``:

vals = reciprocal.ppf([0.001, 0.5, 0.999], a, b)
np.allclose([0.001, 0.5, 0.999], reciprocal.cdf(vals, a, b))
# True

# Generate random numbers:

r = reciprocal.rvs(a, b, size=1000)

# And compare the histogram:

# FIX: `normed` was deprecated in matplotlib 2.1 and removed in 3.4;
# `density=True` is the equivalent normalisation.
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
Example #22
0
# Left panel: raw exponential samples; right panel: their logarithms
# (shown for comparison with the reciprocal distribution plotted below,
# whose log is roughly flat).
plt.subplot(121)
plt.title("Exponential distribution (scale=1.0)")
plt.hist(samples, bins=50)

plt.subplot(122)
plt.title("Log of this distribution")
plt.hist(np.log(samples), bins=50)

plt.show()

#%% reciprocal continuous random variable

# use reciprocal distribution when you have no idea what the scale of the hyperparameter should be
# log of the samples roughly constant as scale of the samples picked from a uniform distribution

reciprocal_distrib = reciprocal(20, 200000)
samples = reciprocal_distrib.rvs(10000, random_state=42)

plt.figure(figsize=(10, 4))

plt.subplot(121)
# FIX: the title previously said "(scale=1.0)" -- a copy/paste leftover
# from the exponential plot above; label the actual bounds instead.
plt.title("Reciprocal distribution (a=20, b=200000)")
plt.hist(samples, bins=50)

plt.subplot(122)
plt.title("Log of this distribution")
plt.hist(np.log(samples), bins=50)

plt.show()

#%% linear looking data
# data.
#
# Notes: some combinations of the hyper-parameters proposed above are invalid.
# You can make the parameter search accept such failures by setting `error_score`
# to `np.nan`. The warning messages give more details on which parameter
# combinations are invalid, but the computation will proceed.
#
# Once the computation has completed, print the best combination of parameters
# stored in the `best_params_` attribute.

# %%
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal

# Random-search space for the pipeline; `C` is drawn log-uniformly
# (reciprocal distribution) over [1e-3, 10].
param_distributions = {
    "logisticregression__C": reciprocal(0.001, 10),
    "logisticregression__solver": ["liblinear", "lbfgs"],
    "logisticregression__penalty": ["l2", "l1"],
    "columntransformer__cat-preprocessor__drop": [None, "first"]
}

# Some sampled solver/penalty combinations are invalid;
# error_score=np.nan lets those candidates fail quietly instead of
# aborting the whole search.
model_random_search = RandomizedSearchCV(
    model,
    param_distributions=param_distributions,
    n_iter=20,
    error_score=np.nan,
    n_jobs=2,
    verbose=1)
model_random_search.fit(df_train, target_train)
model_random_search.best_params_
Example #24
0
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems


def nlkt_tokenize(text):
    """Tokenize *text* with NLTK's default word tokenizer.

    NOTE(review): the name looks like a typo for ``nltk_tokenize``, but it
    is referenced by name in the search space below, so it is kept.
    """
    return nltk.word_tokenize(text)


# TF-IDF features -> optional LSA dimensionality reduction -> random forest.
pipe = Pipeline([('tfidf', TfidfVectorizer()), ('lsa', OptionalTruncatedSVD()),
                 ('clf', RandomForestClassifier())])

# Randomized-search space for the tfidf -> lsa -> forest pipeline above.
params = {
    "tfidf__ngram_range": [(1, 1), (1, 2), (2, 2)],
    "tfidf__min_df": stats.randint(1, 3),
    # FIX: uniform(.95, .3) sampled max_df in [0.95, 1.25], but a float
    # max_df must lie in [0.0, 1.0] (modern sklearn raises for >1.0).
    # Sample [0.95, 1.0] instead.
    "tfidf__max_df": stats.uniform(.95, .05),
    "tfidf__sublinear_tf": [True, False],
    "tfidf__tokenizer": [None, stemmer, lemmatizer, nlkt_tokenize],
    # Repeating True biases the sampler toward skipping the LSA step.
    "lsa__passthrough": [True, False, True, True, True, True, True],
    "lsa__n_components": stats.randint(100, 3000),
    'clf__n_estimators': stats.randint(100, 300),
    'clf__criterion': ['gini', 'entropy'],
    # NOTE(review): 'auto' was removed for forests in sklearn >= 1.3;
    # use 'sqrt' there (they were equivalent for classifiers).
    'clf__max_features': ['auto', 'log2', None],
    'clf__max_depth': stats.randint(10, 150),
    'clf__class_weight': [None, 'balanced'],
    'clf__min_samples_split': stats.reciprocal(.0001, .2),
    'clf__min_samples_leaf': stats.reciprocal(.0001, .2)
}
                default="accuracy",
                help="Scoring metric to be used")
args = vars(ap.parse_args())

# TODO: extend the grid to other classifiers (e.g. GaussianNB, LogisticRegression)
# Per-model randomized-search spaces, keyed by the names used in `models`.
randomized_params = dict(
    KNeighborsClassifier=dict(n_neighbors=randint(1, 30)),
    RandomForest=dict(
        n_estimators=randint(1, 200),
        max_features=randint(1, 8),
    ),
    SVM=dict(
        kernel=["linear", "rbf"],
        C=reciprocal(0.1, 200000),   # log-uniform over the C range
        gamma=expon(scale=1.0),
    ),
)

# Estimator instances, keyed to match `randomized_params` above.
models = {
    "KNeighborsClassifier": KNeighborsClassifier(),
    "RandomForest": RandomForestClassifier(),
    "SVM": SVC(),
}

if __name__ == "__main__":
    # Imported here so that merely importing this module does not pull
    # in TensorFlow; presumably used further down (cut off in this view).
    import tensorflow as tf

    # CLI selections parsed by argparse above.
    model_name = args["model"]
    scoring = args["scoring"]
Example #26
0
# FIX: the exists()/mkdir() pair is a check-then-act race and fails if a
# parent directory is missing; makedirs(..., exist_ok=True) is atomic
# with respect to the "already exists" case and creates parents too.
os.makedirs(logdir, exist_ok=True)
# Checkpoint path for the best model (HDF5 format).
output_model_file = os.path.join(logdir,
                                 "model.h5")


# Wrap the Keras model-building function so sklearn's search utilities
# can treat it like a regular estimator.
sklearn_model = keras.wrappers.scikit_learn.KerasRegressor(
    build_fn = build_model)

from scipy.stats import reciprocal
# Reciprocal (log-uniform) density: f(x) = 1/(x*log(b/a)) for a <= x <= b.
# Good for learning rates, where only the order of magnitude is unknown.

param_distribution = dict(
    hidden_layers=[1, 2, 3, 4],
    layer_size=np.arange(1, 100),
    learning_rate=reciprocal(1e-4, 1e-2),
)

from sklearn.model_selection import RandomizedSearchCV
# Save the best checkpoint so far, and stop early once validation loss
# has failed to improve by at least 1e-2 for 5 consecutive epochs.
callbacks = [keras.callbacks.ModelCheckpoint(output_model_file,save_best_only = True),keras.callbacks.EarlyStopping(patience=5, min_delta=1e-2)]
# 10 random draws from param_distribution, 3-fold CV each.
random_search_cv = RandomizedSearchCV(sklearn_model,
                                      param_distribution,
                                      n_iter = 10,
                                      cv = 3,
                                      n_jobs = 1)
# Extra keyword args (epochs, validation_data, callbacks) are forwarded
# to the underlying Keras fit call by the sklearn wrapper.
history = random_search_cv.fit(x_train_scaled, y_train, epochs = 30,
                     validation_data = (x_valid_scaled, y_valid),
                     callbacks = callbacks)

def plot_learning_curves(history):
    """Plot each metric in a Keras ``History`` object against epoch number."""
    pd.DataFrame(history.history).plot(figsize=(8, 5))
Example #27
0
        logger.info(
            "Running SVM algorithm on best combination of features from FLAS subsets..."
        )
        features_flas = self.data.combinations_flas_top10[0][0]
        logger.info(f"These features are : {features_flas}")
        x_train = self.X_train[features_flas].values
        y_train = self.y_train.values
        x_test = self.X_test[features_flas].values
        y_test = self.y_test.values
        scaler = StandardScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)
        svm_flas = svm.SVC(kernel=kernel_choice)
        #Hyperparameter tuning
        param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)}
        rnd_search_cv_flas = RandomizedSearchCV(svm_flas, param_distributions, n_iter=10, verbose=2, cv=3)
        rnd_search_cv_flas.fit(X_train, y_train)
        rnd_search_cv_flas.best_estimator_
        
        logger.info("Fitting train data on SVM model...")
        rnd_search_cv_flas.best_estimator_.fit(x_train, y_train)
        y_pred_flas = rnd_search_cv_flas.best_estimator_.predict(x_test)
        logger.info(
            f"Train accuracy : {accuracy_score(y_train, svm_flas.predict(x_train))}"
        )
        logger.info(f"Test accuracy : {accuracy_score(y_test, y_pred_flas)}")

        logger.info(
            "Saving results for svm with best combination of features from FLAS subsets..."
        )
Example #28
0
# Evaluate the linear classifier trained above on the scaled training set.
y_pred = lin_clf.predict(X_train_scaled)
accuracy_score(y_train, y_pred)

#%% Train scaled data with SVC + RBF kernel

from sklearn.svm import SVC

svm_clf = SVC(gamma="scale")
# Fit on a 10k-sample subset to keep training time manageable.
svm_clf.fit(X_train_scaled[:10000], y_train[:10000])

#%% Narrow down hyperparameters with randomized search + CV

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

# gamma: log-uniform over [1e-3, 1e-1]; C: uniform over [1, 11].
param_distributions = {'gamma': reciprocal(0.001, 0.1), 'C': uniform(1, 10)}

rnd_search_cv = RandomizedSearchCV(svm_clf,
                                   param_distributions,
                                   n_iter=10,
                                   verbose=2,
                                   cv=3)
# Search on an even smaller 1k subset, then refit the winner on all data.
rnd_search_cv.fit(X_train_scaled[:1000], y_train[:1000])

rnd_search_cv.best_score_
rnd_search_cv.best_estimator_.fit(X_train_scaled, y_train)

# Training-set accuracy of the refit best estimator.
y_pred = rnd_search_cv.best_estimator_.predict(X_train_scaled)
accuracy_score(y_train, y_pred)

#%% And now the test set
# Grid-search an SVR over `param_grid` (defined earlier), scoring by
# negative MSE (sklearn maximizes scores, hence the sign flip below).
svm_reg = SVR()
grid_search = GridSearchCV(svm_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', verbose=2, n_jobs=4)
grid_search.fit(housing_prepared, housing_labels)
negative_mse = grid_search.best_score_
rmse = np.sqrt(-negative_mse)  # back to an RMSE in target units
rmse
print(grid_search.best_params_)

# exercise 2
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal
# Search space: kernel choice, log-uniform C, exponential gamma.
param_distribs = dict(
    kernel=['linear', 'rbf'],
    C=reciprocal(20, 200000),
    gamma=expon(scale=1.0),
)

# 50 random draws from param_distribs, 5-fold CV, negative-MSE scoring;
# random_state fixed for reproducibility.
svm_reg = SVR()
rnd_search = RandomizedSearchCV(svm_reg, param_distributions=param_distribs,
                                n_iter=50, cv=5, scoring='neg_mean_squared_error',
                                verbose=2, n_jobs=4, random_state=42)
rnd_search.fit(housing_prepared, housing_labels)
negative_mse = rnd_search.best_score_
rmse = np.sqrt(-negative_mse)  # back to an RMSE in target units
rmse
rnd_search.best_params_

# exercise 3
from sklearn.base import BaseEstimator, TransformerMixin
Example #30
0
    for layer in range(n_hidden):
        model.add(keras.layers.Dense(n_neurons, activation=activation))
    model.add(keras.layers.Dense(1))
    optimizer = keras.optimizers.Adam(lr=lr)
    model.compile(loss='mse', optimizer=optimizer)

    return model


from scipy.stats import reciprocal
from sklearn.model_selection import RandomizedSearchCV

# Search space for the Keras regressor: depth, width, and a log-uniform
# (reciprocal) learning-rate range.
param_distribs = dict(
    n_hidden=[0, 1, 2, 3],
    n_neurons=np.arange(1, 100),
    lr=reciprocal(3e-4, 3e-2),
)

# Wrap the Keras builder for use with sklearn's randomized search.
keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)
# NOTE(review): 'rnd__search_cv' has a doubled underscore -- likely a typo
# for 'rnd_search_cv'; kept because later cells may reference this name.
rnd__search_cv = RandomizedSearchCV(keras_reg,
                                    param_distribs,
                                    n_iter=10,
                                    cv=None)

# fit
# Keras-specific kwargs (epochs, validation_data) are forwarded through
# the sklearn wrapper to the underlying fit call.
rnd__search_cv.fit(x_train,
                   y_train,
                   epochs=2,
                   validation_data=(x_valid, y_valid))

# evaluate, predict