import numpy as np
import scipy.optimize as op

import utils


def plain_nn(data, order, theta_vec, val):
    """
    Train a neural network that learns the entire dataset at once, as a
    baseline to judge continuous learning algorithms against.
    """
    lambd = 1
    examples = np.array(data[0])
    labels = np.array(utils.make_labels(0, data[0].shape[0]))
    # Stack every task's examples and labels into one combined training set
    for i, _ in enumerate(data):
        if i != 0:
            y = order[i]
            m = data[y].shape[0]
            labels = np.append(labels, utils.make_labels(y, 200), axis=0)
            examples = np.append(examples, data[y][:200, :], axis=0)
    res = op.minimize(utils.cost,
                      theta_vec,
                      method='CG',
                      jac=True,
                      options={"disp": True, "maxiter": 100},
                      args=(m, lambd, labels, examples))
    validation_predict(val, utils.unravel_theta(res["x"]), 9)
def train(self, data, batch_size=250, num_epochs=25, eval_size=200):
    losses = []
    train, test = train_test_split(data)
    for epoch in range(num_epochs):
        for i in range(len(train) // batch_size):
            # -------------------
            # Train Discriminator
            # -------------------
            make_trainable(self.discriminator, True)

            # Get some real conformations from the train data
            real_confs = train[i * batch_size:(i + 1) * batch_size]
            real_confs = real_confs.reshape(-1, self.n_atoms, 3, 1)

            # Sample high-dimensional noise and generate fake conformations
            noise = make_latent_samples(batch_size, self.noise_dim)
            fake_confs = self.generator.predict_on_batch(noise)

            # Label the conformations accordingly
            real_confs_labels, fake_confs_labels = make_labels(batch_size)
            self.discriminator.train_on_batch(real_confs, real_confs_labels)
            self.discriminator.train_on_batch(fake_confs, fake_confs_labels)

            # ---------------------------------------------------
            # Train Generator via GAN (switch off discriminator)
            # ---------------------------------------------------
            noise = make_latent_samples(batch_size, self.noise_dim)
            make_trainable(self.discriminator, False)
            g_loss = self.gan.train_on_batch(noise, real_confs_labels)

        # Evaluate performance after each epoch
        conf_eval_real = test[np.random.choice(len(test), eval_size, replace=False)]
        conf_eval_real = conf_eval_real.reshape(-1, self.n_atoms, 3, 1)

        noise = make_latent_samples(eval_size, self.noise_dim)
        conf_eval_fake = self.generator.predict_on_batch(noise)

        eval_real_labels, eval_fake_labels = make_labels(eval_size)

        d_loss_r = self.discriminator.test_on_batch(conf_eval_real, eval_real_labels)
        d_loss_f = self.discriminator.test_on_batch(conf_eval_fake, eval_fake_labels)
        d_loss = (d_loss_r + d_loss_f) / 2

        # We want the fakes to look realistic!
        g_loss = self.gan.test_on_batch(noise, eval_real_labels)

        print("Epoch: {:>3}/{} Discriminator Loss: {:>6.4f} Generator Loss: {:>6.4f}"
              .format(epoch + 1, num_epochs, d_loss, g_loss))

        losses.append((d_loss, g_loss))
    return losses
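# The method above relies on three small helpers that are not shown here. The
# sketch below is one plausible shape for them, inferred from how they are
# called (names and exact behaviour are assumptions, not the original source):
# noise is standard-normal, real samples are labelled 1 and fakes 0, and
# make_trainable freezes or unfreezes every layer of a Keras model.
import numpy as np


def make_latent_samples(n_samples, noise_dim):
    # Sample standard-normal noise vectors to feed the generator
    return np.random.normal(loc=0.0, scale=1.0, size=(n_samples, noise_dim))


def make_labels(n_samples):
    # Real conformations -> 1, generated conformations -> 0
    return np.ones((n_samples, 1)), np.zeros((n_samples, 1))


def make_trainable(model, flag):
    # Toggle the trainable flag on a Keras model and all of its layers
    model.trainable = flag
    for layer in model.layers:
        layer.trainable = flag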
def prep_data(data):
    data = normalize(data, x_min=0, x_max=1000)

    # Get the frame shape for our data
    shape = data[0][0].shape

    # Our targets Y are simply the shifted frames from X
    # (i.e., each frame's target is the frame 2 ahead of itself)
    X, Y = make_labels(data, shift_factor=2)

    # Shifting the data sometimes leaves us with days with no frames; remove these
    X = [dat for dat in X if dat.shape[0] != 0]
    Y = [dat for dat in Y if dat.shape[0] != 0]

    return shape, X, Y
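# For reference, a minimal sketch of what a shift-based make_labels could look
# like under the assumptions implied above (data is a list of per-day frame
# arrays, and each target is the frame shift_factor ahead). This is
# illustrative only, not the original helper.
def make_labels_sketch(data, shift_factor=2):
    X, Y = [], []
    for day in data:
        X.append(day[:-shift_factor])  # frames that still have a target
        Y.append(day[shift_factor:])   # the same frames, shift_factor ahead
    return X, Y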
def theta_distance_reg(data, order, theta_vec, val):
    """
    Train a NN with L2 regularization on theta^l - theta^(l-1), which places a
    higher cost on thetas that change more from the previous theta.
    """
    lambd = 100
    for i, _ in enumerate(data):
        y = order[i]
        m = data[y].shape[0]
        labels = utils.make_labels(i, m)

        # On the first loop we use regular L2 regularization, because we want to
        # enforce that the thetas stay as low-valued as possible. After the first
        # loop we switch to theta-difference L2 regularization, which is the goal.
        if i == 0:
            res = op.minimize(utils.cost,
                              theta_vec,
                              method='CG',
                              jac=True,
                              options={"disp": True, "maxiter": 50},
                              args=(m, 1, labels, data[y]))
        else:
            res = op.minimize(utils.theta_diff_cost,
                              theta_vec,
                              method='CG',
                              jac=True,
                              options={"disp": True, "maxiter": 50},
                              args=(theta_vec.copy(), m, lambd, labels, data[y]))

        # Set the current theta vector for the next loop's complexity calculation
        theta_vec = res["x"].copy()
        thetas = utils.unravel_theta(res["x"])
        validation_predict(val, thetas, i)
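# Illustrative only: the extra term that utils.theta_diff_cost is described as
# adding on top of the usual cost (this sketch is not the original code) is an
# L2 penalty on the distance from the previous task's thetas, so with
# lambd = 100 large jumps away from prev_theta_vec dominate the cost and the
# optimizer is pushed toward solutions close to the earlier weights.
def theta_diff_penalty_sketch(theta_vec, prev_theta_vec, m, lambd):
    # Standard L2 scaling, applied to the change in theta rather than theta itself
    return (lambd / (2 * m)) * np.sum((theta_vec - prev_theta_vec) ** 2)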
def class_histo(y_true, y_prob, bins, colors):
    h = np.full((len(bins) - 1, n_classes), 0.)
    from utils import make_labels
    class_labels = make_labels(sample, n_classes)
    for n in np.arange(n_classes):
        class_probs = y_prob[:, 0][class_labels == n]
        class_weights = len(class_probs) * [100 / len(y_true)]
        # alternative: len(class_probs) * [100 / len(class_probs)]
        h[:, n] = pylab.hist(class_probs,
                             bins=bins,
                             label='class ' + str(n) + ': ' + label_dict[n],
                             histtype='step',
                             weights=class_weights,
                             log=True,
                             color=colors[n],
                             lw=2)[0]
    if n_classes == 2:
        colors = len(colors) * ['black']
    if True:  # toggle: draw the optimal-threshold line for each class vs. class 0
        for n in np.arange(1, n_classes):
            new_y_true = y_true[np.logical_or(y_true == 0, class_labels == n)]
            new_y_prob = y_prob[np.logical_or(y_true == 0, class_labels == n)]
            fpr, tpr, threshold = metrics.roc_curve(new_y_true, new_y_prob[:, 0], pos_label=0)
            axes.axvline(threshold[np.argmax(tpr - fpr)],
                         ymin=0, ymax=1, ls='--', lw=1, color=colors[n])
    for n in np.arange(1, n_classes):
        print_JSD(h[:, 0], h[:, n], n, colors[n], str(n))
    if n_classes > 2:
        print_JSD(h[:, 0], np.sum(h[:, 1:], axis=1), n_classes, 'black', r'\mathrm{bkg}')
def class_histo(y_true, y_prob, bins, colors):
    h = np.full((len(bins) - 1, n_classes), 0.)
    from utils import make_labels
    class_labels = make_labels(sample, n_classes)
    for n in label_dict:
        class_probs = y_prob[class_labels == n]
        class_weights = len(class_probs) * [100 / len(y_true)]
        # alternative: len(class_probs) * [100 / len(class_probs)]
        h[:, n] = pylab.hist(class_probs,
                             bins=bins,
                             label=label_dict[n],
                             histtype='step',
                             weights=class_weights,
                             log=True,
                             color=colors[n],
                             lw=2)[0]
    if n_classes == 2:
        colors = len(colors) * ['black']
    if False:  # toggle (disabled): draw significance-weighted optimal-threshold lines
        for n in set(label_dict) - set([0]):
            new_y_true = y_true[np.logical_or(y_true == 0, class_labels == n)]
            new_y_prob = y_prob[np.logical_or(y_true == 0, class_labels == n)]
            fpr, tpr, threshold = metrics.roc_curve(new_y_true, new_y_prob, pos_label=0)
            sig_ratio = np.sum(y_true == 0) / len(new_y_true)
            max_index = np.argmax(sig_ratio * tpr + (1 - fpr) * (1 - sig_ratio))
            axes.axvline(threshold[max_index], ymin=0, ymax=1, ls='--', lw=1, color=colors[n])
    for n in set(label_dict) - set([0]):
        print_JSD(h[:, 0], h[:, n], n, colors[n], str(n))
    if n_classes > 2:
        print_JSD(h[:, 0], np.sum(h[:, 1:], axis=1), n_classes, 'black', r'\mathrm{bkg}')
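# print_JSD is called above but not defined here. The sketch below is one
# plausible, illustrative version (name, placement, and formatting are
# assumptions, not the original helper): it measures how separable the signal
# histogram h_sig is from a background histogram h_bkg via the Jensen-Shannon
# divergence and annotates the module-level axes, as the code above does.
from scipy.spatial.distance import jensenshannon


def print_JSD_sketch(h_sig, h_bkg, pos, color, text):
    # jensenshannon normalizes its inputs and returns the JS distance;
    # square it to get the divergence, here in bits (base 2)
    jsd = jensenshannon(h_sig, h_bkg, base=2) ** 2
    axes.text(0.95, 1.0 - 0.05 * pos, r'JSD$_{0,' + text + r'}$: %.3f' % jsd,
              transform=axes.transAxes, color=color, ha='right')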
import os

import featuretools as ft
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

import utils

es = utils.load_entityset("./featuretools_part_1/")
print(es)

label_times = utils.make_labels(es=es,
                                product_name="Banana",
                                cutoff_time=pd.Timestamp('March 15, 2015'),
                                prediction_window=ft.Timedelta("4 weeks"),
                                training_window=ft.Timedelta("60 days"))

feature_matrix, features = ft.dfs(target_entity="users",
                                  cutoff_time=label_times,
                                  training_window=ft.Timedelta("60 days"),  # same as above
                                  entityset=es,
                                  verbose=True)

# Encode categorical values
fm_encoded, features_encoded = ft.encode_features(feature_matrix, features)

print("Number of features %s" % len(features_encoded))
print(features_encoded)

# Sample the feature matrix by user input
# Train the classifier
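# A minimal sketch of the two remaining steps above, using the already-imported
# RandomForestClassifier and cross_val_score. The column names ("label",
# "user_id", "time") are assumptions about the schema of label_times, not
# guaranteed by the original script.
X = fm_encoded.reset_index().merge(label_times)
X = X.fillna(0)
y = X.pop("label")
X = X.drop(["user_id", "time"], axis=1, errors="ignore")

clf = RandomForestClassifier(n_estimators=400, n_jobs=-1)
scores = cross_val_score(estimator=clf, X=X, y=y, cv=3, scoring="roc_auc")
print("AUC %.2f +/- %.2f" % (scores.mean(), scores.std()))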
def main(users_from, users_till):
    # ### DEFINE THE PIPELINE PARAMETERS

    show_report = False
    save_model = True

    # the timeframe of extracted users
    # users_from = '2016-10-01'
    # users_till = '2017-09-30'
    cohort_size = 3000

    # the timeframe of extracted behavioral data
    interval = '3 weeks'

    # the type of the prediction problem:
    # 'regression', 'binary classification', 'multiclass classification'
    prediction_problem_type = 'binary classification'

    # multiclass values
    medium_value = 5
    high_value = 50

    # number of the most important features to extract
    number_of_features = 20

    print("Pipeline parameters defined")

    # ### CONNECT TO THE DATABASE

    conn, cur = utils.connect_to_db()

    # ### BUILD ENTITY TABLES AND LABELS

    # #### Cohorts entity
    cohorts = utils_bux.build_cohorts_entity(cur=cur,
                                             users_from=users_from,
                                             users_till=users_till)

    # #### Users entity
    users = utils_bux.build_users_entity(cur=cur,
                                         users_from=users_from,
                                         users_till=users_till,
                                         interval=interval,
                                         cohorts=cohorts,
                                         cohort_size=cohort_size)

    # #### Transactions entity
    transactions = utils_bux.build_transactions_entity(cur=cur, interval=interval)

    # #### Labels
    labels = utils_bux.build_target_values(cur=cur,
                                           medium_value=medium_value,
                                           high_value=high_value)

    # ### CREATE THE ENTITY SET

    es = utils_bux.create_bux_entity_set(cohorts, users, transactions)

    # ### FEATURE ENGINEERING (DFS) FOR ALL FEATURES

    from featuretools.primitives import (Sum, Std, Max, Min, Mean, Count,
                                         PercentTrue, NUnique, Day, Week,
                                         Month, Weekday, Weekend)

    trans_primitives = [Day, Week, Month, Weekday, Weekend]
    agg_primitives = [Sum, Std, Max, Min, Mean, Count, PercentTrue, NUnique]

    fm_encoded, features_encoded = utils.calculate_feature_matrix(
        es,
        "users",
        trans_primitives=trans_primitives,
        agg_primitives=agg_primitives,
        max_depth=2)
    X = fm_encoded.reset_index().merge(labels)

    # ### TRAINING ON ALL FEATURES

    # define the labels based on the prediction problem type
    X, y = utils.make_labels(X, prediction_problem_type)
    # split the data into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # train the model
    model = utils.rf_train(X_train, y_train, prediction_problem_type)
    # extract the most important features
    top_features = utils.feature_importances(model, features_encoded, n=number_of_features)
    # save the top features
    ft.save_features(top_features, "top_features")
    print("All features built and the most important features saved")

    # ### FEATURE ENGINEERING (DFS) FOR TOP FEATURES

    fm = utils.calculate_feature_matrix_top_features(es, top_features)
    X = fm.reset_index().merge(labels)
    print("Top features built")

    # ### TRAINING AND PREDICTION ON TOP FEATURES

    # define the labels based on the prediction problem type
    X, y = utils.make_labels(X, prediction_problem_type)
    # split the data into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # fit the model
    model = utils.rf_train(X_train, y_train, prediction_problem_type)
    print("Model trained on top features")

    # ### SAVE THE MODEL

    if save_model:
        joblib.dump(model, 'models/model.pkl')
        print("Model saved")
    else:
        print("Model not saved")

    # ### REPORT

    if show_report:
        utils.show_report(model, X, y, X_train, y_train, X_test, y_test,
                          prediction_problem_type, top_features)
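# Hypothetical entry point (not part of the original pipeline): run the whole
# pipeline for the date range shown in the commented-out defaults above.
if __name__ == "__main__":
    main(users_from='2016-10-01', users_till='2017-09-30')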