Example #1
def plain_nn(data, order, theta_vec, val):
    """
    Train a neural network that learns the entire dataset at once, as a baseline to
    judge continual-learning algorithms against.
    """
    lambd = 1

    examples = np.array(data[0])
    labels = np.array(utils.make_labels(0, data[0].shape[0]))
    # Stack the first 200 examples (and matching labels) of each remaining class,
    # following the given order
    for i, _ in enumerate(data):
        if i != 0:
            y = order[i]
            m = data[y].shape[0]
            labels = np.append(labels, utils.make_labels(y, 200), axis=0)
            examples = np.append(examples, data[y][:200, :], axis=0)

    res = op.minimize(utils.cost,
                      theta_vec,
                      method='CG',
                      jac=True,
                      options={
                          "disp": True,
                          "maxiter": 100,
                      },
                      args=(m, lambd, labels, examples))

    validation_predict(val, utils.unravel_theta(res["x"]), 9)
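
# Hypothetical sketch of the utils.make_labels(class_index, m) helper used above (an
# assumption, not the project's actual implementation). It appears to return m identical
# one-hot rows for the given class, which is why the loop can stack results with
# np.append(..., axis=0); n_classes=10 is likewise an assumption.
def make_labels_sketch(class_index, m, n_classes=10):
    labels = np.zeros((m, n_classes))
    labels[:, class_index] = 1.0
    return labels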
Example #2
    def train(self, data, batch_size=250, num_epochs=25, eval_size=200):
        losses = []
        train, test = train_test_split(data)
        for epoch in range(num_epochs):
            for i in range(len(train) // batch_size):
                # -------------------
                # Train Discriminator
                # -------------------
                make_trainable(self.discriminator, True)
                # Get some real conformations from the train data
                real_confs = train[i * batch_size:(i + 1) * batch_size]
                real_confs = real_confs.reshape(-1, self.n_atoms, 3, 1)

                # Sample high dimensional noise and generate fake conformations
                noise = make_latent_samples(batch_size, self.noise_dim)
                fake_confs = self.generator.predict_on_batch(noise)

                # Label the conformations accordingly
                real_confs_labels, fake_confs_labels = make_labels(batch_size)

                self.discriminator.train_on_batch(real_confs,
                                                  real_confs_labels)
                self.discriminator.train_on_batch(fake_confs,
                                                  fake_confs_labels)

                # ---------------------------------------------------
                #  Train Generator via GAN (switch off discriminator)
                # ---------------------------------------------------
                noise = make_latent_samples(batch_size, self.noise_dim)
                make_trainable(self.discriminator, False)
                g_loss = self.gan.train_on_batch(noise, real_confs_labels)

            # Evaluate performance after epoch
            conf_eval_real = test[np.random.choice(len(test),
                                                   eval_size,
                                                   replace=False)]
            conf_eval_real = conf_eval_real.reshape(-1, self.n_atoms, 3, 1)
            noise = make_latent_samples(eval_size, self.noise_dim)
            conf_eval_fake = self.generator.predict_on_batch(noise)

            eval_real_labels, eval_fake_labels = make_labels(eval_size)

            d_loss_r = self.discriminator.test_on_batch(
                conf_eval_real, eval_real_labels)
            d_loss_f = self.discriminator.test_on_batch(
                conf_eval_fake, eval_fake_labels)
            d_loss = (d_loss_r + d_loss_f) / 2

            # we want the fake to be realistic!
            g_loss = self.gan.test_on_batch(noise, eval_real_labels)

            print(
                "Epoch: {:>3}/{} Discriminator Loss: {:>6.4f} Generator Loss: {:>6.4f}"
                .format(epoch + 1, num_epochs, d_loss, g_loss))

            losses.append((d_loss, g_loss))
        return losses
Example #3
def prep_data(data):

    data = normalize(data, x_min=0, x_max=1000)

    # Get frame shape for our data
    shape = data[0][0].shape

    # Our targets Y are simply the shifted frames from X (i.e., each frame's target is the frame two steps ahead of it)
    X, Y = make_labels(data, shift_factor=2)

    # Shifting the data sometimes leaves days with no frames; remove these
    X = [dat for dat in X if dat.shape[0] != 0]
    Y = [dat for dat in Y if dat.shape[0] != 0]

    return shape, X, Y
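
# A minimal sketch of the shift-based make_labels used above (an assumption, not the
# project's implementation). For each day's frame stack, the target of frame t is frame
# t + shift_factor, so days with shift_factor or fewer frames end up empty (and are
# filtered out above).
def shift_labels_sketch(data, shift_factor=2):
    X = [frames[:-shift_factor] for frames in data]
    Y = [frames[shift_factor:] for frames in data]
    return X, Y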
Example #4
def theta_distance_reg(data, order, theta_vec, val):
    """
    Train an NN with L2 regularization on theta^l - theta^(l-1), which places a higher
    cost when theta changes more from the previous theta.
    """
    lambd = 100
    for i, _ in enumerate(data):
        y = order[i]
        m = data[y].shape[0]
        labels = utils.make_labels(i, m)

        # On the first loop we use regular L2 regularization, since we only want to
        # enforce that the thetas stay as small as possible. On later loops we switch
        # to theta-difference L2 regularization, which is the goal
        if i == 0:
            res = op.minimize(utils.cost,
                              theta_vec,
                              method='CG',
                              jac=True,
                          options={
                              "disp": True,
                              "maxiter": 50,
                          },
                              args=(m, 1, labels, data[y]))
        else:
            res = op.minimize(utils.theta_diff_cost,
                              theta_vec,
                              method='CG',
                              jac=True,
                              options={
                                  "disp": True,
                                  "maxiter": 50,
                              },
                              args=(theta_vec.copy(), m, lambd, labels,
                                    data[y]))

        # carry the current theta vector into the next loop's regularization term
        theta_vec = res["x"].copy()
        thetas = utils.unravel_theta(res["x"])

        validation_predict(val, thetas, i)
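
# Hypothetical sketch of the theta-difference penalty described in the docstring (an
# assumption, not the actual utils.theta_diff_cost implementation). The regularizer is
# applied to (theta - theta_prev) instead of theta itself, pulling the optimizer toward
# the previous task's weights.
def theta_diff_penalty(theta_vec, theta_prev, m, lambd):
    diff = theta_vec - theta_prev
    penalty = (lambd / (2 * m)) * diff.dot(diff)  # added to the data-fit cost
    gradient = (lambd / m) * diff                 # added to the data-fit gradient
    return penalty, gradient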
Example #5
 def class_histo(y_true, y_prob, bins, colors):
     # histogram the class-0 (signal) probability for each class, in percent of all samples
     h = np.full((len(bins) - 1, n_classes), 0.)
     from utils import make_labels
     class_labels = make_labels(sample, n_classes)
     for n in np.arange(n_classes):
         class_probs = y_prob[:, 0][class_labels == n]
         class_weights = len(class_probs) * [100 / len(y_true)]  # len(class_probs)*[100/len(class_probs)]
         h[:, n] = pylab.hist(class_probs,
                              bins=bins,
                              label='class ' + str(n) + ': ' + label_dict[n],
                              histtype='step',
                              weights=class_weights,
                              log=True,
                              color=colors[n],
                              lw=2)[0]
     if n_classes == 2: colors = len(colors) * ['black']
     if True:
         for n in np.arange(1, n_classes):
             new_y_true = y_true[np.logical_or(y_true == 0,
                                               class_labels == n)]
             new_y_prob = y_prob[np.logical_or(y_true == 0,
                                               class_labels == n)]
             fpr, tpr, threshold = metrics.roc_curve(new_y_true,
                                                     new_y_prob[:, 0],
                                                     pos_label=0)
             axes.axvline(threshold[np.argmax(tpr - fpr)],
                          ymin=0,
                          ymax=1,
                          ls='--',
                          lw=1,
                          color=colors[n])
     for n in np.arange(1, n_classes):
         print_JSD(h[:, 0], h[:, n], n, colors[n], str(n))
     if n_classes > 2:
         print_JSD(h[:, 0], np.sum(h[:, 1:], axis=1), n_classes, 'black',
                   r'\mathrm{bkg}')
Example #6
 def class_histo(y_true, y_prob, bins, colors):
     h = np.full((len(bins)-1,n_classes), 0.)
     from utils import make_labels
     class_labels = make_labels(sample, n_classes)
     for n in label_dict:
         class_probs   = y_prob[class_labels==n]
         class_weights = len(class_probs)*[100/len(y_true)] #len(class_probs)*[100/len(class_probs)]
         h[:,n] = pylab.hist(class_probs, bins=bins, label=label_dict[n], histtype='step',
                             weights=class_weights, log=True, color=colors[n], lw=2)[0]
     if n_classes == 2:
         colors = len(colors)*['black']
     if False:
         for n in set(label_dict)-set([0]):
             new_y_true = y_true[np.logical_or(y_true==0, class_labels==n)]
             new_y_prob = y_prob[np.logical_or(y_true==0, class_labels==n)]
             fpr, tpr, threshold = metrics.roc_curve(new_y_true, new_y_prob, pos_label=0)
             sig_ratio = np.sum(y_true==0)/len(new_y_true)
             max_index = np.argmax(sig_ratio*tpr + (1-fpr)*(1-sig_ratio))
             axes.axvline(threshold[max_index], ymin=0, ymax=1, ls='--', lw=1, color=colors[n])
     for n in set(label_dict)-set([0]):
         print_JSD(h[:,0], h[:,n], n, colors[n], str(n))
     if n_classes > 2:
         print_JSD(h[:,0], np.sum(h[:,1:], axis=1), n_classes, 'black', r'\mathrm{bkg}')
Example #7
import featuretools as ft
import pandas as pd
import utils, os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

es = utils.load_entityset("./featuretools_part_1/")
print(es)
label_times = utils.make_labels(es=es,
                                product_name="Banana",
                                cutoff_time=pd.Timestamp('March 15, 2015'),
                                prediction_window=ft.Timedelta("4 weeks"),
                                training_window=ft.Timedelta("60 days"))

feature_matrix, features = ft.dfs(
    target_entity="users",
    cutoff_time=label_times,
    training_window=ft.Timedelta("60 days"),  # same as above
    entityset=es,
    verbose=True)

# Encode categorical values
fm_encoded, features_encoded = ft.encode_features(feature_matrix, features)

print("Number of features %s" % len(features_encoded))
print(features_encoded)

# Sample the feature matrix by user input

# Train the classifier
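
# A minimal training sketch to go with the placeholder above (an assumption, not the
# original author's code). It assumes label_times carries a boolean "label" column next
# to the user id/time columns, as in the featuretools demo data; adjust the column
# names to your entityset.
X = fm_encoded.reset_index().merge(label_times)
X = X.fillna(0)
y = X.pop("label")
X = X.drop(columns=["user_id", "time"], errors="ignore")

clf = RandomForestClassifier(n_estimators=400, n_jobs=-1)
scores = cross_val_score(clf, X, y, cv=3, scoring="roc_auc")
print("AUC %.2f +/- %.2f" % (scores.mean(), scores.std()))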
Example #8
def main(users_from, users_till):
    # ### DEFINE THE PIPELINE PARAMETERS

    # In[2]:

    show_report = False
    save_model = True

    # the timeframe of extracted users

    # users_from = '2016-10-01'
    # users_till = '2017-09-30'
    cohort_size = 3000

    # the timeframe of extracted behavioral data
    interval = '3 weeks'

    # the type of the prediction problem
    # 'regression', 'binary classification', 'multiclass classification'
    prediction_problem_type = 'binary classification'

    # threshold values for the multiclass case
    medium_value = 5
    high_value = 50

    # number of the most important features to extract
    number_of_features = 20

    print("Pipeline parameters defined")

    # ### CONNECT TO THE DATABASE

    # In[3]:

    conn, cur = utils.connect_to_db()

    # ### BUILD ENTITY TABLES AND LABELS

    # #### Cohorts entity

    # In[4]:

    cohorts = utils_bux.build_cohorts_entity(cur=cur,
                                             users_from=users_from,
                                             users_till=users_till)

    # #### Users entity

    # In[5]:

    users = utils_bux.build_users_entity(cur=cur,
                                         users_from=users_from,
                                         users_till=users_till,
                                         interval=interval,
                                         cohorts=cohorts,
                                         cohort_size=cohort_size)

    # #### Transactions entity

    # In[6]:

    transactions = utils_bux.build_transactions_entity(cur=cur,
                                                       interval=interval)

    # #### Labels

    # In[7]:

    labels = utils_bux.build_target_values(cur=cur,
                                           medium_value=medium_value,
                                           high_value=high_value)

    # ### CREATE THE ENTITY SET

    # In[8]:

    es = utils_bux.create_bux_entity_set(cohorts, users, transactions)
    print(es)

    # ### FEATURE ENGINEERING (DFS) FOR ALL FEATURES

    # In[9]:

    from featuretools.primitives import (Sum, Std, Max, Min, Mean, Count,
                                         PercentTrue, NUnique, Day, Week,
                                         Month, Weekday, Weekend)

    trans_primitives = [Day, Week, Month, Weekday, Weekend]
    agg_primitives = [Sum, Std, Max, Min, Mean, Count, PercentTrue, NUnique]

    fm_encoded, features_encoded = utils.calculate_feature_matrix(
        es,
        "users",
        trans_primitives=trans_primitives,
        agg_primitives=agg_primitives,
        max_depth=2)
    X = fm_encoded.reset_index().merge(labels)

    # ### TRAINING ON ALL FEATURES

    # In[10]:

    # define the labels based on the prediction problem type
    X, y = utils.make_labels(X, prediction_problem_type)
    # split the data into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # train the model
    model = utils.rf_train(X_train, y_train, prediction_problem_type)
    # extract the most important features
    top_features = utils.feature_importances(model,
                                             features_encoded,
                                             n=number_of_features)
    # save the top features
    ft.save_features(top_features, "top_features")
    print("All features built and the most important features saved")

    # ### FEATURE ENGINEERING (DFS) FOR TOP FEATURES

    # In[11]:

    fm = utils.calculate_feature_matrix_top_features(es, top_features)
    X = fm.reset_index().merge(labels)
    print("Top features built")

    # ### TRAINING AND PREDICTION ON TOP FEATURES

    # In[12]:

    # define the labels based on the prediction problem type
    X, y = utils.make_labels(X, prediction_problem_type)
    # split the data into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # fit the model
    model = utils.rf_train(X_train, y_train, prediction_problem_type)
    print("Model trained on top features")

    # ### SAVE THE MODEL

    # In[13]:

    if save_model:
        joblib.dump(model, 'models/model.pkl')
        print("Model saved")
    else:
        print("Model not saved")

    # ### REPORT

    # In[ ]:

    if show_report:
        utils.show_report(model, X, y, X_train, y_train, X_test, y_test,
                          prediction_problem_type, top_features)
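
# Hypothetical sketch of how utils.make_labels might map the merged feature/label frame
# to (X, y) for the three prediction_problem_type values above (an assumption, not the
# project's implementation). The target column name and the use of medium_value/high_value
# as multiclass thresholds are guesses based on the parameters defined in main().
import pandas as pd

def make_labels_sketch(df, prediction_problem_type, target_col="target_value",
                       medium_value=5, high_value=50):
    y = df[target_col]
    X = df.drop(columns=[target_col])
    if prediction_problem_type == "regression":
        return X, y
    if prediction_problem_type == "binary classification":
        return X, (y > 0).astype(int)
    # 'multiclass classification': 0 = low, 1 = medium, 2 = high
    bins = [-float("inf"), medium_value, high_value, float("inf")]
    return X, pd.cut(y, bins=bins, labels=[0, 1, 2])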