def compute_learning_curves(self):
    """Plot learning curves for the baseline and GAN-augmented data sets.

    Uses 100-iteration shuffle-split cross validation, each time holding
    out a random 20% of the data as the validation fold.
    """
    # Both a generated-data file and a classifier are required.
    if self.config.get('GEN_FILENAME') is None:
        print('No filename to use for computing errors, exiting...')
        return
    if self.config.get('CLASSIFIER') is None:
        print('No classifier to train')
        return

    classifier = create_classifier(self.config.get('CLASSIFIER'), None)
    X_gan_final, y_gan_final, _, _ = self.gan_prediction_last_step(classifier)

    splitter = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    # First the curves on the original training data, then on the
    # GAN-augmented data, using the same cross-validation splitter.
    for title, features, labels in (
            ('Initial learning curves', self.X_train_df, self.y_train),
            ('Augmented data learning curves', X_gan_final, y_gan_final)):
        curve_plot = plot_learning_curve(classifier, title, features,
                                         labels.ravel(), ylim=None,
                                         cv=splitter, n_jobs=4)
        curve_plot.show()
def compare_classifier_gan_scores(self):
    """Compare accuracy, precision, recall and F1 scores.

    Contrasts a previously cached baseline classifier against a fresh
    classifier trained on the GAN-augmented data, then plots the metrics.
    """
    if self.config.get('GEN_FILENAME') is None:
        print('No filename to use for comparison, exiting...')
        return
    if self.config.get('CLASSIFIER') is None and self.config.get('SAMPLER') is None:
        print('Both classifier and sampler are not specified, exiting...')
        return

    # The sampler name takes precedence for the cached baseline model.
    if self.config.get('SAMPLER') is not None:
        baseline_name = self.config.get('SAMPLER')
    else:
        baseline_name = self.config.get('CLASSIFIER')
    baseline = load_classifier(baseline_name,
                               self.config.get('CACHE_FOLDER'),
                               self.config.get('APP'))
    scores_baseline = baseline.decision_function(self.X_test)
    y_pred_baseline = baseline.predict(self.X_test)

    # Fresh classifier fitted on REAL + FAKE data by the helper.
    gan_classifier = create_classifier(self.config.get('CLASSIFIER'), None)
    _, _, y_pred_gan, _ = self.gan_prediction_last_step(gan_classifier)
    scores_gan = gan_classifier.decision_function(self.X_test)

    plot_metrics([self.y_test, y_pred_baseline, scores_baseline,
                  y_pred_gan, scores_gan])
def create_embeddings(self):
    """Create TensorBoard projector embeddings contrasting real and generated data.

    Builds a combined frame of real samples and GAN-generated "fraud" samples
    (the generated rows are relabelled Class=2 so they show up as a third
    category), writes a projector metadata file, checkpoints the data tensor,
    and emits a projector config for TensorBoard.
    """
    # NOTE(review): despite the name, this is simply the first
    # AUGMENTED_DATA_SIZE rows of train_df, not a non-fraud filter — confirm
    # train_df ordering makes these non-fraud.
    X_real_non_fraud = self.train_df[0:self.config.get('AUGMENTED_DATA_SIZE')]
    X_real_fraud = self.train_df[self.train_df.Class == self.config.get('FAKE')]
    X_real_data = pd.concat([X_real_non_fraud, X_real_fraud])
    classifier_name = self.config.get('CLASSIFIER')
    classifier = create_classifier(classifier_name, None)
    if self.config.get('GEN_FILENAME') is None:
        print('No filename specify for comparison, exiting...')
        return
    # Only the generated data from the final GAN step is used here.
    _, _, _, gen_data_last_step = self.gan_prediction_last_step(classifier)
    X_gen_fraud = gen_data_last_step[gen_data_last_step.Class == self.config.get('FAKE')]
    # Cap generated samples at half the augmented-data size.
    X_gen_fraud = X_gen_fraud[0:self.config.get('AUGMENTED_DATA_SIZE') // 2]
    """ To visualize differently generated from real data """
    X_gen_fraud['Class'] = 2
    print('X_real_data={}, X_real_fraud={}, X_gen_fraud={}'
          .format(X_real_data.shape, X_real_fraud.shape, X_gen_fraud.shape))
    X_combined = pd.concat([X_real_data, X_gen_fraud])
    # The data tensor the projector will visualize (class column dropped).
    tf_data = tf.Variable(X_combined.drop(columns=self.config.get('CLASS_NAME')), name='data')
    # NOTE(review): the metadata file is tab-separated but named .csv; the
    # projector config below references the same name, so it is internally
    # consistent — renaming to .tsv would require changing both.
    with open(Path(self.config.get('EMB_FOLDER') + 'metadata_data.csv'), 'w') as f:
        f.write('Class' + '\t' + 'Name' + '\n')
        for idx, row_df in X_combined.iterrows():
            value = row_df[self.config.get('CLASS_NAME')]
            if value == self.config.get('REAL'):
                f.write('Normal' + '\t' + 'Normal\n')
            elif value == self.config.get('FAKE'):
                f.write('Fraud' + '\t' + 'Fraud\n')
            elif value == 2:
                f.write('Gen' + '\t' + 'Gen\n')
    print("done creating metadata tsv")
    ## Running TensorFlow Session
    print("Generating embeddings")
    # TF1-style session: initialize the variable and checkpoint it so the
    # projector can load the tensor values from the checkpoint.
    with tf.Session() as sess:
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        saver.save(sess, str(Path(self.config.get('EMB_FOLDER') + 'embedding_data.ckpt')))
    config = projector.ProjectorConfig()
    # One can add multiple embeddings.
    embedding = config.embeddings.add()
    embedding.tensor_name = tf_data.name
    # Link this tensor to its metadata(Labels) file
    embedding.metadata_path = 'metadata_data.csv'
    # Saves a config file that TensorBoard will read during startup.
    projector.visualize_embeddings(tf.summary.FileWriter(str(Path(self.config.get('EMB_FOLDER')))), config)
def plot_augmented_data_learning_curves(self):
    """Plot classifier scores versus the amount of augmented (FAKE) data.

    Trains the configured classifier on REAL + FAKE data, sweeping the number
    of generated samples from 10% to 100% of AUGMENTED_DATA_SIZE across
    NUM_TRAINING_STEPS points, and plots accuracy, precision, recall and F1
    measured on the held-out test set at each point.
    """
    classifier = create_classifier(self.config.get('CLASSIFIER'), None)

    # Generated samples from the final GAN training step, relabelled FAKE.
    gan_X = self.load_fake_data(False)
    last_step = self.config.get('TOTAL_TRAINING_STEPS')
    final_step = pd.DataFrame(gan_X[last_step], columns=self.non_corr_column_names)
    final_step[self.config.get('CLASS_NAME')] = self.config.get('FAKE')

    start = self.config.get('AUGMENTED_DATA_SIZE') * 0.1
    stop = self.config.get('AUGMENTED_DATA_SIZE')
    sample_idx = np.linspace(start, stop, self.config.get('NUM_TRAINING_STEPS'))

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    for idx in sample_idx:
        intidx = int(idx)
        print('Generating data for {}'.format(intidx))
        # Augment the real training data with the first `intidx` fake rows.
        sub_gen_data = final_step[0:intidx]
        gan_data = pd.concat([self.train_df, sub_gen_data], axis='rows')
        X_gan = gan_data.drop(columns=self.config.get('CLASS_NAME')).values
        y_gan = gan_data[self.config.get('CLASS_NAME')].values

        # Fit on the augmented data, score on the held-out test set.
        classifier.fit(X_gan, y_gan)
        y_pred = classifier.predict(self.X_test_df.values)
        # BUG FIX: sklearn metric signatures are (y_true, y_pred); the
        # arguments were previously swapped, which silently exchanged
        # precision and recall (accuracy is symmetric, so it happened to
        # be unaffected there).
        acc = accuracy_score(self.y_test, y_pred)
        accuracy_scores.append(acc)
        precision = precision_score(self.y_test, y_pred)
        precision_scores.append(precision)
        recall = recall_score(self.y_test, y_pred)
        recall_scores.append(recall)
        # Renamed from `f1_score` to avoid shadowing sklearn's function name.
        f1 = compute_F1(precision, recall)
        f1_scores.append(f1)

    plot_scores(sample_idx, accuracy_scores, precision_scores, recall_scores, f1_scores)
def generate_distribution_plots(self):
    """Plot PCA (or t-SNE) projections of Fraud/Non-Fraud samples for the
    real and the GAN-augmented data sets, as two subplots of one figure.
    """
    # Validate configuration up front, before any plotting work is done.
    # (Previously the GEN_FILENAME check ran only after the first subplot
    # had been drawn, wasting work and leaving a half-built figure behind.)
    if self.config.get('CLASSIFIER') is None:
        print('No classifier to use for comparison, exiting...')
        return
    if self.config.get('GEN_FILENAME') is None:
        print('No filename specified for comparison, exiting...')
        return

    target_names = self.train_df[self.config.get('CLASS_NAME')].map(
        lambda x: 'Non-Fraud' if x == 0 else 'Fraud').unique()

    plt.figure(figsize=(6, 5))
    plt.subplot(2, 1, 1)
    if self.config.get('PCA') is not None:
        parameters = [self.config.get('SEED'), self.X_train, self.y_train.reshape(-1, ),
                      None, None, target_names, 'PCA of Fraud/Non-Fraud real data']
        # Keep the real-data axis limits so both subplots share a scale.
        realmin, realmax = plot_pca(parameters)
    else:
        # t-SNE is expensive; cap the sample at the first 5000 points.
        parameters = [self.X_train[0:5000], self.y_train.reshape(-1, ),
                      target_names, 'PCA of Fraud/Non-Fraud real data']
        plot_tsne(parameters)

    classifier = create_classifier(self.config.get('CLASSIFIER'), None)
    X_gan_final, y_gan_final, _, _ = self.gan_prediction_last_step(classifier)

    plt.subplot(2, 1, 2)
    if self.config.get('PCA') is not None:
        parameters = [self.config.get('SEED'), X_gan_final, y_gan_final.reshape(-1, ),
                      realmin, realmax, target_names,
                      'PCA of Fraud/Non-Fraud augmented data']
        plot_pca(parameters)
    else:
        parameters = [X_gan_final[0:5000], y_gan_final.reshape(-1, ),
                      target_names, 'PCA of Fraud/Non-Fraud augmented data']
        plot_tsne(parameters)
    plt.show()
def run_train_classifier(self):
    """Train the configured classifier and, when a sampler is configured,
    an imbalanced-learn sampled variant of it as well.
    """
    if self.config.get('CLASSIFIER') is None and self.config.get('SAMPLER') is None:
        print('No classifier to train')
        return

    classifier_name = self.config.get('CLASSIFIER')
    # NOTE(review): if CLASSIFIER is None while SAMPLER is set, None is
    # passed straight through to create_classifier — confirm that is the
    # intended behaviour.
    classifier = create_classifier(classifier_name, None)
    print("Training {} features with classifier {}".format(
        self.X_train.shape[1], classifier_name))
    self.train_classifier(classifier)

    sampler_name = self.config.get('SAMPLER')
    if sampler_name is None:
        print('No sampler to train')
        return

    # Oversample the minority class (label 1) up to its current count plus
    # the configured augmentation size.
    desired_num_samples = np.sum(self.y_train) + self.config.get('AUGMENTED_DATA_SIZE')
    sampled_classifier = create_imbalanced_learn_classifier(
        classifier, sampler_name, {1: desired_num_samples})
    print("Imbalanced learning classifier {} samples to {} observations".format(
        sampler_name, desired_num_samples))
    self.train_classifier(sampled_classifier)
def plot_decision_boundaries_random_dataset(self):
    """ Plot decision boundaries for SVM ovr, ovo and SVNM trained on augmented data sets.
    This method provides a visual comparison of performance improvements of an SVM
    classifier trained on a balanced data set.

    Draws three stacked subplots on a synthetic random data set:
      1. one-vs-rest SVM (C=10),
      2. one-vs-one SVM (C=0.5),
      3. one-vs-one SVM (C=0.5) refit on GAN-augmented data.
    Side effects: overwrites CLASSIFIER, TOTAL_TRAINING_STEPS and
    AUGMENTED_DATA_SIZE in self.config, and replaces self.train_df.
    """
    # Force a known configuration for this visual experiment.
    self.config.set('CLASSIFIER', 'SVC')
    self.config.set('TOTAL_TRAINING_STEPS', 10)
    self.config.set('AUGMENTED_DATA_SIZE', 500)
    # Populates self.X_rd / self.y_rd and the rd_data_* attributes.
    self.create_random_data_set()
    fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 12))

    # Subplot 1: one-vs-rest SVM with a large C (hard margin).
    svm = create_classifier(self.config.get('CLASSIFIER'), False)
    svm.C = 10
    svm.decision_function_shape = 'ovr'
    clf = svm.fit(self.X_rd, self.y_rd)
    plot_decision_function(self.X_rd, self.y_rd, clf, ax1)
    ax1.set_title('OVR SVM')

    # Subplot 2: one-vs-one SVM with a small C (softer margin).
    svm = create_classifier(self.config.get('CLASSIFIER'), False)
    svm.C = .5
    svm.decision_function_shape = 'ovo'
    clf = svm.fit(self.X_rd, self.y_rd)
    plot_decision_function(self.X_rd, self.y_rd, clf, ax2)
    ax2.set_title('OVO SVM')

    # Train one GAN per class (except the last) and append its generated
    # samples, labelled with the class index, to the training frame.
    batch_size = int(self.rd_data_samples / self.rd_data_classes)
    train_df = pd.DataFrame(self.X_rd)
    non_corr_column_names = train_df.columns
    train_df[self.config.get('CLASS_NAME')] = self.y_rd
    self.train_df = train_df
    gan_data = train_df.copy()
    for class_index in range(self.rd_data_classes - 1):
        print('Generating data for class={}'.format(class_index))
        configuration = GANConfiguration()
        configuration.name = self.config.get('GAN_NAME')
        configuration.batch_size = batch_size
        configuration.X_nodes = self.rd_data_features
        configuration.y_output = self.config.get('Y_OUTPUT')
        configuration.z_dims = self.config.get('Z_DIM')
        gan = create_gan(configuration)
        parameters = [gan, class_index, False]
        gan_X = self.train_gan(parameters)
        # Keep only the samples generated at the final training step.
        last_step = self.config.get('TOTAL_TRAINING_STEPS')
        final_step = pd.DataFrame(gan_X[last_step], columns=non_corr_column_names)
        final_step[self.config.get('CLASS_NAME')] = class_index
        gan_data = pd.concat([gan_data, final_step], axis='rows')
        # Free the GAN before building the next one.
        del gan

    # Log the per-class sample counts of the augmented data set.
    print(str(gan_data.groupby(self.config.get('CLASS_NAME'))[self.config.get('CLASS_NAME')].count()))
    X_gan = gan_data.drop(columns=self.config.get('CLASS_NAME')).values
    y_gan = gan_data[self.config.get('CLASS_NAME')].values

    # Subplot 3: the same OVO SVM, refit on real + generated data.
    svm = create_classifier(self.config.get('CLASSIFIER'), False)
    svm.C = .5
    svm.decision_function_shape = 'ovo'
    clf = svm.fit(X_gan, y_gan)
    plot_decision_function(X_gan, y_gan, clf, ax3)
    ax3.set_title('OVO SVM + Augmented Data ')
    fig.tight_layout()
    plt.show()
def augmented_data_model_scores_report(self):
    """Score the classifier on the augmented data set (REAL + FAKE data).

    Reports prediction metrics for a classifier trained on the real training
    set concatenated with GAN-generated samples. When SAMPLE is configured,
    it additionally reports metrics at a "steady" GAN step — the point where
    the generator/discriminator losses stabilise — before reporting at the
    final training step.
    """
    if self.config.get('CLASSIFIER') is None:
        print('No classifier to use for predictions, exiting...')
        return
    classifier = create_classifier(self.config.get('CLASSIFIER'), None)

    total_steps = self.config.get('TOTAL_TRAINING_STEPS')
    # Loading generated data
    print("Loading data for total_count={}".format(total_steps))
    gan_X = self.load_fake_data(False)

    if self.config.get('SAMPLE'):
        # GAN losses recorded during training; each entry holds
        # (discriminator loss, generator loss).
        gan_loss = self.load_fake_data(True)
        gen_losses = [gen[1] for gen in gan_loss]
        disc_losses_fraud = [disc[0] for disc in gan_loss]
        # Obtain the steady step for the GAN: stabilise within 5% of the
        # total steps ran; all steps in that frame must have losses
        # fluctuating within 0.75 s.d.
        dloss_sdy = compute_steady_frame(disc_losses_fraud, num_steps_ran=total_steps,
                                         sd_fluc_pct=0.75,
                                         scan_frame=int(total_steps * 0.05),
                                         stab_fn=np.median)
        gloss_sdy = compute_steady_frame(gen_losses, num_steps_ran=total_steps,
                                         sd_fluc_pct=0.75,
                                         scan_frame=int(total_steps * 0.05),
                                         stab_fn=np.median)
        # Pick the latest steady step across both losses. compute_steady_frame
        # evidently returns an iterable of step indices (the combined branch
        # always took max() of each result).
        if dloss_sdy is not None and gloss_sdy is not None:
            sdy_count = int(max(max(dloss_sdy), max(gloss_sdy)))
        elif dloss_sdy is not None:
            # BUG FIX: the raw iterable used to be assigned here, which would
            # then be used directly as an index into gan_X below and fail;
            # reduce it to one step, consistent with the combined branch.
            sdy_count = int(max(dloss_sdy))
        elif gloss_sdy is not None:
            sdy_count = int(max(gloss_sdy))
        else:
            sdy_count = None

        if sdy_count is not None:
            print('Steady count: {}'.format(sdy_count))
            print('\n', '############################################# STEADY COUNT #############################################')
            steady_frame = pd.DataFrame(gan_X[sdy_count], columns=self.non_corr_column_names)
            steady_frame[self.config.get('CLASS_NAME')] = self.config.get('FAKE')
            gan_data_steady = pd.concat([self.train_df, steady_frame], axis='rows')
            X_gan_steady = gan_data_steady.drop(columns=self.config.get('CLASS_NAME')).values
            y_gan_steady = gan_data_steady[self.config.get('CLASS_NAME')].values
            # Perform classification: fit and obtain predictions.
            classifier.fit(X_gan_steady, y_gan_steady)
            y_pred_gan_steady = classifier.predict(self.X_test_df.values)
            pred_score_gan_steady = classifier.score(self.X_test_df.values, self.y_test)
            print('{} Prediction Metrics'.format(self.config.get('GAN_NAME')))
            parameters = [self.y_test, y_pred_gan_steady, pred_score_gan_steady, False]
            report_scores(parameters)

    print('\n', '############################################# FINAL COUNT #############################################')
    final_step = pd.DataFrame(gan_X[total_steps], columns=self.non_corr_column_names)
    final_step[self.config.get('CLASS_NAME')] = self.config.get('FAKE')
    gan_data_final = pd.concat([self.train_df, final_step], axis='rows')
    X_gan_final = gan_data_final.drop(columns=self.config.get('CLASS_NAME')).values
    y_gan_final = gan_data_final[self.config.get('CLASS_NAME')].values
    # Perform classification: fit and obtain predictions.
    classifier.fit(X_gan_final, y_gan_final)
    y_pred_gan_final = classifier.predict(self.X_test_df.values)
    pred_score_gan_final = classifier.score(self.X_test_df.values, self.y_test)
    print('Final Count {} Prediction Metrics'.format(self.config.get('GAN_NAME')))
    parameters = [self.y_test, y_pred_gan_final, pred_score_gan_final, False]
    report_scores(parameters)