# --- TensorflowTrainer (variant 1: transfer-learning image classifier) ---
# Imports below are assumed; ModelGenerator, load_generator, parse_classes,
# cast_input_types, TimeHistory and export_labels_dictionary_from_classes_list
# are project-local helpers, and the cnvrg chart import path is a best guess.
import os
import time
import multiprocessing

import numpy as np
import tensorflow as tf
from sklearn.metrics import confusion_matrix

import cnvrg
from cnvrg import Experiment
from cnvrg.charts import Heatmap


class TensorflowTrainer:
    GRAYSCALE_CHANNELS, RGB_CHANNELS = 1, 3
    VERBOSE = 1
    WORKERS = 3
    fully_connected_layers = [1024, 512, 256]

    def __init__(self, arguments, model_name, base_model):
        self.__cnvrg_env = True
        self.__arguments = cast_input_types(arguments)
        self.__shape = (arguments.image_height, arguments.image_width)
        self.__classes = parse_classes(arguments.data)
        self.__channels = TensorflowTrainer.RGB_CHANNELS if arguments.image_color == 'rgb' \
            else TensorflowTrainer.GRAYSCALE_CHANNELS
        self.__model = ModelGenerator(base_model=base_model,
                                      num_of_classes=len(self.__classes),
                                      fully_connected_layers=TensorflowTrainer.fully_connected_layers,
                                      loss_function=arguments.loss,
                                      dropout=arguments.dropout,
                                      activation_hidden_layers=arguments.hidden_layer_activation,
                                      activation_output_layers=arguments.output_layer_activation,
                                      optimizer=arguments.optimizer).get_model()
        try:
            self.__experiment = Experiment()
        except cnvrg.modules.UserError:
            self.__cnvrg_env = False
        self.__metrics = {'tensorflow local version': tf.__version__,
                          'GPUs found': len(tf.config.experimental.list_physical_devices('GPU')),
                          'Model': model_name,
                          'Classes list': self.__classes}

    def run(self):
        if self.__cnvrg_env:
            self.__plot_all(status='pre-training')  # uses cnvrg.
        self.__train()
        self.__test()
        if self.__cnvrg_env:
            self.__plot_all()  # uses cnvrg.
            self.__export_model()  # uses cnvrg.

    def __plot_all(self, status='post-test'):
        if status == 'pre-training':
            self.__plot_metrics(status='pre-training')
        elif status == 'post-test' and self.__arguments.data_test is not None:
            self.__plot_metrics(status='post-test')
            self.__plot_confusion_matrix(self.__labels, self.__predictions)

    def __train(self):
        train_generator, val_generator = load_generator(self.__arguments.data,
                                                        self.__shape,
                                                        self.__arguments.test_size,
                                                        self.__arguments.image_color,
                                                        self.__arguments.batch_size)
        steps_per_epoch_training = self.__arguments.steps_per_epoch
        steps_per_epoch_validation = self.__arguments.steps_per_epoch
        start_time = time.time()
        time_callback = TimeHistory()
        print("--- Start training ---")
        self.__model.fit(train_generator,
                         epochs=self.__arguments.epochs,
                         workers=multiprocessing.cpu_count() - 1,
                         verbose=TensorflowTrainer.VERBOSE,
                         steps_per_epoch=steps_per_epoch_training,
                         validation_data=val_generator,
                         validation_steps=steps_per_epoch_validation,
                         use_multiprocessing=True,
                         callbacks=[time_callback])
        print("--- End training ---")
        training_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
        self.__metrics['training_time'] = training_time
        if self.__cnvrg_env:
            self.__experiment.log_metric(key="Epoch Times",
                                         Ys=time_callback.times,
                                         Xs=[i for i in range(1, self.__arguments.epochs + 1)],
                                         x_axis="Epoch",
                                         y_axis="Time (Seconds)")

    def __test(self):
        if self.__arguments.data_test is None:
            return
        test_gen = load_generator(self.__arguments.data_test,
                                  self.__shape,
                                  image_color=self.__arguments.image_color,
                                  batch_size=self.__arguments.batch_size,
                                  generate_test_set=True)
        self.__predictions = np.argmax(self.__model.predict(test_gen), axis=1)
        self.__labels = test_gen.classes
        # evaluate_generator expects the number of *batches*; the original passed
        # test_gen.n (the number of samples), which over-iterates the generator.
        # len() on a Keras directory iterator yields the batch count.
        steps_per_epoch_testing = len(test_gen)
        test_loss, test_acc = self.__model.evaluate_generator(test_gen,
                                                              workers=TensorflowTrainer.WORKERS,
                                                              verbose=TensorflowTrainer.VERBOSE,
                                                              steps=steps_per_epoch_testing)
        test_acc, test_loss = round(float(test_acc), 3), round(float(test_loss), 3)
        self.__metrics['test_acc'] = test_acc
        self.__metrics['test_loss'] = test_loss

    def __export_model(self):
        output_file_name = os.environ.get("CNVRG_WORKDIR") + "/" + self.__arguments.output_model \
            if os.environ.get("CNVRG_WORKDIR") is not None else self.__arguments.output_model
        self.__model.save(output_file_name)
        export_labels_dictionary_from_classes_list(self.__classes)

    """cnvrg metrics output"""

    def __plot_metrics(self, status='pre-training'):
        """
        :param status: (String) either 'pre-training' or 'post-test'.
        """
        if status == 'pre-training':
            print('Plotting pre-training metrics:')
            for k, v in self.__metrics.items():
                if k not in ['test_acc', 'test_loss']:
                    self.__experiment.log_param(k, v)
        elif status == 'post-test':
            print('Plotting post-test metrics:')
            for k, v in self.__metrics.items():
                if k in ['test_acc', 'test_loss']:
                    self.__experiment.log_param(k, v)
        else:
            raise ValueError('Unrecognized status.')

    def __plot_confusion_matrix(self, labels, predictions):
        """Plots the confusion matrix."""
        confusion_mat_test = confusion_matrix(labels, predictions)  # array
        confusion_mat_test = TensorflowTrainer.__helper_plot_confusion_matrix(
            confusion_mat_test, mat_x_ticks=self.__classes, mat_y_ticks=self.__classes)
        self.__experiment.log_chart("confusion matrix", data=Heatmap(z=confusion_mat_test))

    @staticmethod
    def __helper_plot_confusion_matrix(confusion_matrix, mat_x_ticks=None, mat_y_ticks=None,
                                       digits_to_round=3):
        """
        :param confusion_matrix: the values in the matrix.
        :param mat_x_ticks, mat_y_ticks: ticks for the axes of the matrix.
        """
        output = []
        for y in range(len(confusion_matrix)):
            for x in range(len(confusion_matrix[y])):
                x_val = x if mat_x_ticks is None else mat_x_ticks[x]
                y_val = y if mat_y_ticks is None else mat_y_ticks[y]
                output.append((x_val, y_val, round(float(confusion_matrix[x][y]), digits_to_round)))
        return output
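
# Usage sketch (illustrative, not part of the original source): the Namespace
# fields mirror the attributes the trainer reads in __init__/__train; all
# values here are hypothetical, and the run assumes a cnvrg environment plus
# the project-local data-loading helpers.
#
# from argparse import Namespace
# args = Namespace(data='images/train', data_test='images/test',
#                  image_height=224, image_width=224, image_color='rgb',
#                  test_size=0.2, batch_size=32, epochs=3, steps_per_epoch=None,
#                  loss='categorical_crossentropy', dropout=0.3,
#                  hidden_layer_activation='relu',
#                  output_layer_activation='softmax', optimizer='adam',
#                  output_model='model.h5')
# base = tf.keras.applications.VGG16(weights='imagenet', include_top=False,
#                                    input_shape=(224, 224, 3))
# TensorflowTrainer(args, model_name='VGG16', base_model=base).run()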
# --- SKTrainerRegression (scikit-learn regression trainer) ---
# Imports below are assumed; the cnvrg chart import paths are best guesses.
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import (accuracy_score, mean_absolute_error,
                             mean_squared_error, r2_score)
from sklearn.model_selection import cross_validate

from cnvrg import Experiment
from cnvrg.charts import Bar, MatrixHeatmap, Scatterplot


class SKTrainerRegression:
    DIGITS_TO_ROUND = 3
    REGRESSION_TYPE = ['linear', 'logistic']

    def __init__(self, model, train_set, test_set, output_model_name, testing_mode,
                 folds=None, regression_type=0):
        self.__model = model
        self.__x_train, self.__y_train = train_set
        self.__train_set_size = len(self.__y_train)
        self.__x_test, self.__y_test = test_set
        self.__test_set_size = len(self.__y_test)
        self.__testing_mode = testing_mode
        self.__cross_val_folds = folds
        self.__is_cross_val = (folds is not None)
        self.__features = list(self.__x_train.columns)
        self.__labels = [str(l) for l in list(set(self.__y_train).union(set(self.__y_test)))]
        self.__metrics = {'model': output_model_name}
        self.__y_pred = None
        self.__experiment = Experiment()
        self.__regression_type = SKTrainerRegression.REGRESSION_TYPE[regression_type]
        self.__coef, self.__intercept = None, None

    def run(self):
        self.__model.fit(self.__x_train, self.__y_train)
        try:
            self.__coef = self.__model.coef_
        except AttributeError:
            pass
        try:
            self.__intercept = self.__model.intercept_
        except AttributeError:
            pass
        if self.__is_cross_val:
            self.__metrics['folds'] = self.__cross_val_folds
        if self.__is_cross_val is True:
            self.__train_with_cross_validation()
        else:
            self.__train_without_cross_validation()
        self.__save_model()

    def __plot_all(self, y_test_pred):
        self.__plot_accuracies_and_errors()
        # self.__plot_regression_function()
        self.__plot_feature_importance()
        self.__plot_correlation_matrix()
        # self.__plot_feature_vs_feature()

    """training & testing methods"""

    def __train_with_cross_validation(self):
        """
        Performs KFold cross-validation with sk-learn and initiates the cnvrg
        experiment with all its metrics.
        Note: the 'accuracy' scorer is only meaningful for the 'logistic'
        regression type; with continuous targets cross_validate will fail.
        """
        scores = cross_validate(estimator=self.__model,
                                X=self.__x_train,
                                y=self.__y_train,
                                cv=self.__cross_val_folds,
                                return_train_score=True,
                                scoring=['neg_mean_squared_error',
                                         'neg_mean_absolute_error', 'r2', 'accuracy'],
                                return_estimator=True)
        train_err_cv_mse = (-1) * scores['train_neg_mean_squared_error']
        train_err_cv_mae = (-1) * scores['train_neg_mean_absolute_error']
        train_err_cv_r2 = scores['train_r2']
        val_acc_cv = scores['test_accuracy']
        val_err_cv_mse = (-1) * scores['test_neg_mean_squared_error']
        val_err_cv_mae = (-1) * scores['test_neg_mean_absolute_error']
        val_err_cv_r2 = scores['test_r2']
        self.__model = scores['estimator'][-1]
        self.__y_pred = self.__model.predict(self.__x_test)
        test_acc = accuracy_score(self.__y_test, self.__y_pred)
        test_loss = mean_squared_error(self.__y_test, self.__y_pred)
        self.__metrics.update({'train_loss_mae': train_err_cv_mae,
                               'train_loss_mse': train_err_cv_mse,
                               'train_loss_r2': train_err_cv_r2,
                               'validation_acc': val_acc_cv,
                               'val_loss_mae': val_err_cv_mae,
                               'val_loss_mse': val_err_cv_mse,
                               'val_loss_r2': val_err_cv_r2,
                               'test_acc': test_acc,
                               'test_loss_mse': test_loss})
        self.__plot_all(self.__y_pred)

    def __train_without_cross_validation(self):
        """Trains on the full train set and initiates the cnvrg experiment with all its metrics."""
        y_hat = self.__model.predict(self.__x_train)  # y_hat is a.k.a. y_pred
        train_loss_mse = mean_squared_error(self.__y_train, y_hat)
        train_loss_mae = mean_absolute_error(self.__y_train, y_hat)
        train_loss_r2 = r2_score(self.__y_train, y_hat)
        self.__y_pred = self.__model.predict(self.__x_test)
        test_loss_mse = mean_squared_error(self.__y_test, self.__y_pred)
        test_loss_mae = mean_absolute_error(self.__y_test, self.__y_pred)
        test_loss_r2 = r2_score(self.__y_test, self.__y_pred)
        self.__metrics.update({'train_loss_mae': train_loss_mae,
                               'train_loss_mse': train_loss_mse,
                               'train_loss_r2': train_loss_r2,
                               'test_loss_mse': test_loss_mse,
                               'test_loss_mae': test_loss_mae,
                               'test_loss_r2': test_loss_r2})
        self.__plot_all(self.__y_pred)

    """plotting methods"""

    def __plot_regression_function(self):
        if self.__regression_type == 'linear':
            a, b = self.__coef[0], self.__intercept
            x = np.linspace(-100, 100, 200)
            y = a * x + b
        elif self.__regression_type == 'logistic':
            x = np.linspace(-100, 100, 200)
            y = 1 / (1 + np.exp(-x))
        self.__experiment.log_metric(key="Regression Function",
                                     Xs=x.tolist(),
                                     Ys=y.tolist(),
                                     grouping=['regression line'] * len(x))

    def __plot_feature_importance(self):
        try:
            importance = getattr(self.__model, "feature_importances_")
            if self.__testing_mode is False:
                self.__experiment.log_chart('Feature Importance',
                                            x_axis='Features',
                                            y_axis='Importance',
                                            data=Bar(x=self.__features, y=importance))
            else:
                print(importance)
        except AttributeError:
            pass

    def __plot_accuracies_and_errors(self):
        self.__plot_accuracies_and_errors_helper()
        if self.__testing_mode is True:
            # .get() is used because the regression metrics are keyed
            # 'train_loss_mse' / 'test_loss_mse' etc., so the generic keys
            # below may be absent (the original indexed them directly).
            print("Model: {model}\n"
                  "train_acc={train_acc}\n"
                  "train_loss={train_loss}\n"
                  "test_acc={test_acc}\n"
                  "test_loss={test_loss}".format(model=self.__metrics['model'],
                                                 train_acc=self.__metrics.get('train_acc'),
                                                 train_loss=self.__metrics.get('train_loss'),
                                                 test_acc=self.__metrics.get('test_acc'),
                                                 test_loss=self.__metrics.get('test_loss')))
            if self.__is_cross_val is True:
                print("Folds: {folds}\n".format(folds=self.__metrics['folds']))
        else:  # testing mode is off.
            for k, v in self.__metrics.items():
                if isinstance(v, list):
                    self.__experiment.log_metric(k, v)
                else:
                    self.__experiment.log_param(k, v)

    def __plot_accuracies_and_errors_helper(self):
        """Rounds the float values in self.__metrics."""
        for k, v in self.__metrics.items():
            if isinstance(v, float):
                self.__metrics[k] = round(self.__metrics[k], SKTrainerRegression.DIGITS_TO_ROUND)

    def __save_model(self):
        output_model_name = self.__metrics['model']
        output_file_name = os.environ.get("CNVRG_WORKDIR") + "/" + output_model_name \
            if os.environ.get("CNVRG_WORKDIR") is not None else output_model_name
        pickle.dump(self.__model, open(output_file_name, 'wb'))

    """visualization methods"""

    def __plot_correlation_matrix(self):
        data = pd.concat([pd.concat([self.__x_train, self.__x_test], axis=0),
                          pd.concat([self.__y_train, self.__y_test], axis=0)], axis=1)
        correlation = data.corr()
        self.__experiment.log_chart("correlation",
                                    [MatrixHeatmap(np.round(correlation.values, 2))],
                                    x_ticks=correlation.index.tolist(),
                                    y_ticks=correlation.index.tolist())

    def __plot_feature_vs_feature(self):
        data = pd.concat([pd.concat([self.__x_train, self.__x_test], axis=0),
                          pd.concat([self.__y_train, self.__y_test], axis=0)], axis=1)
        indexes = data.select_dtypes(include=["number"]).columns
        corr = data.corr()
        for idx, i in enumerate(indexes):
            for jdx, j in enumerate(indexes):
                if i == j or jdx < idx:
                    continue
                corr_val = abs(corr[i][j])
                # Plot only moderately-to-strongly correlated pairs, skipping
                # perfectly correlated ones.
                if corr_val == 1 or corr_val < 0.5:
                    continue
                print("create", i, "against", j, "scatter chart")
                droplines = data[[i, j]].notnull().all(1)
                x, y = data[droplines][[i, j]].values.transpose()
                self.__experiment.log_chart("{i}_against_{j}".format(i=i, j=j),
                                            [Scatterplot(x=x.tolist(), y=y.tolist())],
                                            title="{i} against {j}".format(i=i, j=j))
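
if __name__ == '__main__':
    # Illustrative usage (not in the original source). Runs only where
    # Experiment() can be created, i.e. inside a cnvrg job; the dataset and
    # output model name are hypothetical.
    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split

    diabetes = load_diabetes(as_frame=True)
    x_train, x_test, y_train, y_test = train_test_split(
        diabetes.data, diabetes.target, test_size=0.2, random_state=0)
    SKTrainerRegression(model=LinearRegression(),
                        train_set=(x_train, y_train),
                        test_set=(x_test, y_test),
                        output_model_name='linear_regression.sav',
                        testing_mode=True,
                        folds=None,               # no cross-validation
                        regression_type=0).run()  # 0 == 'linear'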
# --- TensorflowTrainer (variant 2: adds per-epoch and per-step time metrics) ---
# Imports below are assumed; ModelGenerator, load_generator, parse_classes,
# TimeHistory and export_labels_dictionary_from_classes_list are project-local
# helpers. `Metric` is also assumed to be a project-local container, roughly:
#   Metric = collections.namedtuple('Metric', ['key', 'Ys', 'Xs', 'x_axis', 'y_axis'])
import os
import time

import numpy as np
import tensorflow as tf
from sklearn.metrics import confusion_matrix

from cnvrg import Experiment
from cnvrg.charts import Heatmap


class TensorflowTrainer:
    GRAYSCALE_CHANNELS = 1
    RGB_CHANNELS = 3
    VERBOSE = 1
    WORKERS = 3
    fully_connected_layers = [1024, 512, 256]
    METRICS = {
        'pre-training': [
            'TensorFlow version',
            'GPUs found',
            'Model',
            # 'Classes list'
        ],
        'post-training': [
            'training_time',
            # 'epochs_duration',
            # 'avg_time_per_epoch',
            # 'time_per_step'
        ],
        'post-test': ['test_acc', 'test_loss']
    }

    def __init__(self, arguments, model_name, base_model):
        self.__cnvrg_env = True
        self.__arguments = arguments
        self.__shape = (arguments.image_height, arguments.image_width)
        self.__classes = parse_classes(arguments.data)
        self.__channels = TensorflowTrainer.RGB_CHANNELS if arguments.image_color == 'rgb' \
            else TensorflowTrainer.GRAYSCALE_CHANNELS
        self.__model = ModelGenerator(base_model=base_model,
                                      num_of_classes=len(self.__classes),
                                      fully_connected_layers=TensorflowTrainer.fully_connected_layers,
                                      loss_function=arguments.loss,
                                      dropout=arguments.dropout,
                                      activation_hidden_layers=arguments.hidden_layer_activation,
                                      activation_output_layers=arguments.output_layer_activation,
                                      optimizer=arguments.optimizer).get_model()
        try:
            print("Trying to launch an experiment in cnvrg environment.")
            self.__experiment = Experiment()
        except Exception:
            print("Not in cnvrg environment.")
            self.__cnvrg_env = False
        self.__metrics = {'TensorFlow version': tf.__version__,
                          'GPUs found': len(tf.config.experimental.list_physical_devices('GPU')),
                          'Model': model_name,
                          'Classes list': self.__classes}

    def run(self):
        self.__plot(status='pre-training')
        self.__train()
        self.__plot(status='post-training')
        self.__test()
        self.__plot(status='post-test')
        self.__export_model()

    def __plot(self, status):
        if status == 'pre-training':
            self.__plot_metrics(status='pre-training')
        elif status == 'post-training':
            self.__plot_metrics(status='post-training')
        elif status == 'post-test' and self.__arguments.data_test is not None:
            self.__plot_metrics(status='post-test')
            self.__plot_confusion_matrix(self.__labels, self.__predictions)

    def __train(self):
        train_generator, val_generator = load_generator(self.__arguments.data,
                                                        self.__shape,
                                                        self.__arguments.test_size,  # test_size == validation_split
                                                        self.__arguments.image_color,
                                                        self.__arguments.batch_size)
        start_time = time.time()
        time_callback = TimeHistory()
        print("--- Start training ---")
        from PIL import ImageFile  # tolerate truncated image files while training.
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        self.__model.fit(train_generator,
                         epochs=self.__arguments.epochs,
                         verbose=self.__arguments.verbose,
                         steps_per_epoch=self.__arguments.steps_per_epoch,
                         validation_data=val_generator
                         if self.__arguments.test_size != 0. else None,
                         validation_steps=self.__arguments.steps_per_epoch
                         if self.__arguments.test_size != 0. else None,
                         callbacks=[time_callback])
        print("--- End training ---")
        training_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
        self.__metrics['training_time'] = training_time
        self.__metrics['epochs_duration'] = Metric(key='Epochs Duration',
                                                   Ys=time_callback.times,
                                                   Xs='from_1',
                                                   x_axis='epochs',
                                                   y_axis='time (seconds)')
        self.__metrics['avg_time_per_epoch'] = round(
            sum(time_callback.times) / len(time_callback.times), 3)
        if self.__arguments.steps_per_epoch is not None:
            self.__metrics['time_per_step'] = Metric(
                key='Time per Step',
                Ys=[round(time_callback.times[i] / self.__arguments.steps_per_epoch, 3)
                    for i in range(self.__arguments.epochs)],
                Xs='from_1',
                x_axis='epochs',
                y_axis='time (ms)/step')

    def __test(self):
        if self.__arguments.data_test is None:
            return
        test_gen = load_generator(self.__arguments.data_test,
                                  self.__shape,
                                  image_color=self.__arguments.image_color,
                                  batch_size=self.__arguments.batch_size,
                                  generate_test_set=True)
        self.__predictions = np.argmax(self.__model.predict(test_gen), axis=1)
        self.__labels = test_gen.classes
        # evaluate_generator expects the number of *batches*; the original passed
        # test_gen.n (the number of samples), which over-iterates the generator.
        steps_per_epoch_testing = len(test_gen)
        test_loss, test_acc = self.__model.evaluate_generator(test_gen,
                                                              workers=TensorflowTrainer.WORKERS,
                                                              verbose=TensorflowTrainer.VERBOSE,
                                                              steps=steps_per_epoch_testing)
        test_acc, test_loss = round(float(test_acc), 3), round(float(test_loss), 3)
        self.__metrics['test_acc'] = test_acc
        self.__metrics['test_loss'] = test_loss

    def __export_model(self):
        output_file_name = os.environ.get("CNVRG_WORKDIR") + "/" + self.__arguments.output_model \
            if os.environ.get("CNVRG_WORKDIR") is not None else self.__arguments.output_model
        self.__model.save(output_file_name)
        export_labels_dictionary_from_classes_list(self.__classes)

    # ============ Helpers ============

    def __plot_metrics(self, status):
        metrics = TensorflowTrainer.METRICS[status]
        if status == 'pre-training':
            for metric in metrics:
                if metric in self.__metrics.keys():  # if the metric exists.
                    if self.__cnvrg_env:
                        self.__experiment.log_param(metric, self.__metrics[metric])
                    else:
                        print("log_param - {key} : {value}".format(
                            key=metric, value=self.__metrics[metric]))
        elif status == 'post-training':
            for metric in metrics:
                if metric in self.__metrics.keys():  # if the metric exists.
                    if not isinstance(self.__metrics[metric], Metric):  # param.
                        if self.__cnvrg_env:
                            self.__experiment.log_param(metric, self.__metrics[metric])
                        else:
                            print("log_param - {key} : {value}".format(
                                key=metric, value=self.__metrics[metric]))
                    else:  # Metric objects are logged as line charts.
                        if self.__cnvrg_env:
                            self.__experiment.log_metric(key=self.__metrics[metric].key,
                                                         Ys=self.__metrics[metric].Ys,
                                                         Xs=self.__metrics[metric].Xs,
                                                         x_axis=self.__metrics[metric].x_axis,
                                                         y_axis=self.__metrics[metric].y_axis)
                        else:
                            print(self.__metrics[metric])
        elif status == 'post-test':
            for metric in metrics:
                if metric in self.__metrics.keys():  # if the metric exists.
                    if self.__cnvrg_env:
                        self.__experiment.log_param(metric, self.__metrics[metric])
                    else:
                        print("log_param - {key} : {value}".format(
                            key=metric, value=self.__metrics[metric]))
        else:
            raise ValueError('Unrecognized status.')

    def __plot_confusion_matrix(self, labels, predictions):
        """Plots the confusion matrix."""
        confusion_mat_test = confusion_matrix(labels, predictions)  # array
        confusion_mat_test = TensorflowTrainer.__helper_plot_confusion_matrix(
            confusion_mat_test, mat_x_ticks=self.__classes, mat_y_ticks=self.__classes)
        # Guard added: the original logged unconditionally, which fails outside cnvrg.
        if self.__cnvrg_env:
            self.__experiment.log_chart("confusion matrix", data=Heatmap(z=confusion_mat_test))
        else:
            print(confusion_mat_test)

    @staticmethod
    def __helper_plot_confusion_matrix(confusion_matrix, mat_x_ticks=None, mat_y_ticks=None,
                                       digits_to_round=3):
        """
        :param confusion_matrix: the values in the matrix.
        :param mat_x_ticks, mat_y_ticks: ticks for the axes of the matrix.
        """
        output = []
        for y in range(len(confusion_matrix)):
            for x in range(len(confusion_matrix[y])):
                x_val = x if mat_x_ticks is None else mat_x_ticks[x]
                y_val = y if mat_y_ticks is None else mat_y_ticks[y]
                output.append((x_val, y_val, round(float(confusion_matrix[x][y]), digits_to_round)))
        return output
# --- SKTrainer (scikit-learn classifier trainer, variant 1) ---
# Imports below are assumed; the cnvrg chart and PandasAnalyzer import paths
# are best guesses.
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, mean_squared_error, roc_curve)
from sklearn.model_selection import KFold

from cnvrg import Experiment
from cnvrg.charts import Bar, Heatmap
from cnvrg.charts.pandas_analyzer import PandasAnalyzer  # import path assumed.


class SKTrainer:
    DIGITS_TO_ROUND = 3

    def __init__(self, model, train_set, test_set, output_model_name, testing_mode, folds=None):
        self.__model = model
        self.__x_train, self.__y_train = train_set
        self.__x_test, self.__y_test = test_set
        self.__output_model_name = output_model_name
        self.__testing_mode = testing_mode
        self.__cross_val_folds = folds
        self.__is_cross_val = (folds is not None)
        self.__features = list(self.__x_train.columns)
        self.__labels = [str(l) for l in list(set(self.__y_train).union(set(self.__y_test)))]
        self.__model.fit(self.__x_train, self.__y_train)
        # getattr with a default avoids crashing on models without feature
        # importances (the original accessed the attribute directly).
        self.__importance = getattr(self.__model, 'feature_importances_', None)
        self.__experiment = Experiment()
        self.__metrics = {'model': self.__output_model_name}
        if self.__is_cross_val:
            self.__metrics['folds'] = self.__cross_val_folds

    def run(self):
        """Runs the training & testing methods."""
        if self.__is_cross_val is True:
            self.__train_with_cross_validation()
        else:
            self.__train_without_cross_validation()

    """training & testing methods"""

    def __train_with_cross_validation(self):
        """
        Performs KFold cross-validation with sk-learn and initiates the cnvrg
        experiment with all its metrics.
        """
        train_acc, train_loss = [], []
        kf = KFold(n_splits=self.__cross_val_folds)
        for train_index, val_index in kf.split(self.__x_train):
            X_train, X_val = self.__x_train.iloc[train_index, :], self.__x_train.iloc[val_index, :]
            y_train, y_val = self.__y_train.iloc[train_index], self.__y_train.iloc[val_index]
            self.__model = self.__model.fit(X_train, y_train)
            y_hat = self.__model.predict(X_val)  # y_hat is a.k.a. y_pred
            acc = accuracy_score(y_val, y_hat)
            loss = mean_squared_error(y_val, y_hat)
            train_acc.append(acc)
            train_loss.append(loss)
        # --- Testing.
        y_pred = self.__model.predict(self.__x_test)
        test_acc = accuracy_score(self.__y_test, y_pred)
        test_loss = mean_squared_error(self.__y_test, y_pred)
        # The per-fold lists are stored as well; the plotting helpers below
        # expect these keys (the original omitted them, causing a KeyError).
        self.__metrics.update({'train_acc': train_acc,
                               'train_loss': train_loss,
                               'test_acc': test_acc,
                               'test_loss': test_loss})
        self.__plot_all(y_pred)

    def __train_without_cross_validation(self):
        """Trains on the full train set and initiates the cnvrg experiment with all its metrics."""
        y_hat = self.__model.predict(self.__x_train)  # y_hat is a.k.a. y_pred
        train_acc = accuracy_score(self.__y_train, y_hat)
        train_loss = mean_squared_error(self.__y_train, y_hat)
        y_pred = self.__model.predict(self.__x_test)
        test_acc = accuracy_score(self.__y_test, y_pred)
        test_loss = mean_squared_error(self.__y_test, y_pred)
        self.__metrics.update({'train_acc': train_acc,
                               'train_loss': train_loss,
                               'test_acc': test_acc,
                               'test_loss': test_loss})
        self.__plot_all(y_pred)

    """Plotting methods"""

    def __plot_feature_importance(self):
        if self.__importance is None:  # model exposes no feature importances.
            return
        if self.__testing_mode is False:
            self.__experiment.log_chart('Feature Importance',
                                        x_axis='Features',
                                        y_axis='Importance',
                                        data=Bar(x=self.__features, y=self.__importance))
        else:
            print(self.__importance)

    def __plot_classification_report(self, y_test_pred):
        test_report = classification_report(self.__y_test, y_test_pred, output_dict=True)  # dict
        if self.__testing_mode is False:
            testing_report_as_array = self.__helper_plot_classification_report(test_report)
            self.__experiment.log_chart("Test Set - Classification Report",
                                        data=Heatmap(z=testing_report_as_array),
                                        y_ticks=self.__labels,
                                        x_ticks=["precision", "recall", "f1-score", "support"])
        else:
            print(test_report)

    def __helper_plot_classification_report(self, classification_report_dict):
        """Converts the dictionary given by classification_report to a list of lists."""
        rows = []
        for k, v in classification_report_dict.items():
            if k in self.__labels:
                rows.append(list(v.values()))
        values = []
        for y in range(len(rows)):
            for x in range(len(rows[y])):
                values.append((x, y, round(rows[y][x], SKTrainer.DIGITS_TO_ROUND)))
        return values

    def __plot_confusion_matrix(self, y_test_pred=None):
        if self.__y_test is not None and y_test_pred is not None:
            confusion_mat_test = confusion_matrix(self.__y_test, y_test_pred)  # array
            confusion_mat_test = self.__helper_plot_confusion_matrix(confusion_mat_test)
            if self.__testing_mode is False:
                self.__experiment.log_chart("Test Set - confusion matrix",
                                            data=Heatmap(z=confusion_mat_test))
            else:
                print(confusion_mat_test)

    def __helper_plot_confusion_matrix(self, confusion_matrix):
        output = []
        for y in range(len(confusion_matrix)):
            for x in range(len(confusion_matrix[y])):
                output.append((x, y, round(float(confusion_matrix[x][y]), SKTrainer.DIGITS_TO_ROUND)))
        return output

    def __plot_roc_curve(self, y_test_pred):
        n_classes = len(self.__labels)
        y_test = self.__y_test.tolist()
        y_test_pred = y_test_pred.tolist()
        if n_classes != 2 or self.__testing_mode is True:
            return
        y_test, y_test_pred = list(y_test), list(y_test_pred)
        FPRs, TPRs, _ = roc_curve(y_test, y_test_pred)
        self.__experiment.log_metric(key='ROC curve', Ys=TPRs.tolist(), Xs=FPRs.tolist())

    def __plot_pandas_analyzer(self):
        data = pd.concat([pd.concat([self.__x_train, self.__x_test], axis=0),
                          pd.concat([self.__y_train, self.__y_test], axis=0)], axis=1)
        if self.__testing_mode is False:
            PandasAnalyzer(data, experiment=self.__experiment)

    def __plot_accuracies_and_errors(self):
        self.__plot_accuracies_and_errors_helper()
        if self.__testing_mode is True:
            print("Model: {model}\n"
                  "train_acc={train_acc}\n"
                  "train_loss={train_loss}\n"
                  "test_acc={test_acc}\n"
                  "test_loss={test_loss}".format(model=self.__metrics['model'],
                                                 train_acc=self.__metrics['train_acc'],
                                                 train_loss=self.__metrics['train_loss'],
                                                 test_acc=self.__metrics['test_acc'],
                                                 test_loss=self.__metrics['test_loss']))
            if self.__is_cross_val is True:
                print("Folds: {folds}\n".format(folds=self.__metrics['folds']))
        else:  # testing_mode is False.
            self.__experiment.log_param("model", self.__metrics['model'])
            self.__experiment.log_param("test_acc", self.__metrics['test_acc'])
            self.__experiment.log_param("test_loss", self.__metrics['test_loss'])
            if self.__is_cross_val is True:
                self.__experiment.log_param("folds", self.__metrics['folds'])
                # Cross-validation yields per-fold lists -> log as metrics.
                self.__experiment.log_metric("train_acc", self.__metrics['train_acc'])
                self.__experiment.log_metric("train_loss", self.__metrics['train_loss'])
                return
            self.__experiment.log_param("train_acc", self.__metrics['train_acc'])
            self.__experiment.log_param("train_loss", self.__metrics['train_loss'])

    def __plot_accuracies_and_errors_helper(self):
        """Rounds all the values in self.__metrics."""
        keys_to_round = ['train_acc', 'train_loss', 'test_acc', 'test_loss']
        for key in keys_to_round:
            value = self.__metrics[key]
            if isinstance(value, list):  # cross-validation produces per-fold lists.
                self.__metrics[key] = [round(v, SKTrainer.DIGITS_TO_ROUND) for v in value]
            else:
                self.__metrics[key] = round(value, SKTrainer.DIGITS_TO_ROUND)

    def __plot_all(self, y_test_pred):
        """Runs all the plotting methods."""
        self.__plot_pandas_analyzer()
        self.__plot_feature_importance()
        self.__plot_classification_report(y_test_pred=y_test_pred)
        self.__plot_confusion_matrix(y_test_pred=y_test_pred)
        self.__plot_roc_curve(y_test_pred=y_test_pred)
        self.__plot_accuracies_and_errors()
        self.__save_model()

    """technical methods"""

    def __save_model(self):
        output_file_name = os.environ.get("CNVRG_PROJECT_PATH") + "/" + self.__output_model_name \
            if os.environ.get("CNVRG_PROJECT_PATH") is not None else self.__output_model_name
        pickle.dump(self.__model, open(output_file_name, 'wb'))
        if not self.__testing_mode:
            os.system("ls -la {}".format(os.environ.get("CNVRG_PROJECT_PATH")))
# --- SKTrainer (scikit-learn classifier trainer, variant 2) ---
# Imports below are assumed; the cnvrg chart import paths are best guesses.
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, roc_curve, zero_one_loss)
from sklearn.model_selection import cross_validate

from cnvrg import Experiment
from cnvrg.charts import Bar, Heatmap, MatrixHeatmap, Scatterplot


class SKTrainer:

    def __init__(self, model, train_set, test_set, output_model_name, testing_mode, folds=None):
        self.__model = model
        self.__x_train, self.__y_train = train_set
        self.__x_test, self.__y_test = test_set
        self.__all_data_concatenated = pd.concat(
            [pd.concat([self.__x_train, self.__x_test], axis=0),
             pd.concat([self.__y_train, self.__y_test], axis=0)], axis=1)
        self.__testing_mode = testing_mode
        self.__cross_val_folds = folds
        self.__is_cross_val = (folds is not None)
        self.__features = list(self.__x_train.columns)
        self.__labels = [str(l) for l in list(set(self.__y_train).union(set(self.__y_test)))]
        self.__metrics = {'model': output_model_name,
                          'train set size': len(self.__y_train),
                          'test set size': len(self.__y_test)}
        self.__experiment = Experiment()

    def run(self):
        """Runs the training & testing methods."""
        self.__model.fit(self.__x_train.values, self.__y_train.values)
        if self.__is_cross_val:
            self.__metrics['folds'] = self.__cross_val_folds
        if self.__is_cross_val is True:
            self.__train_with_cross_validation()
        else:
            self.__train_without_cross_validation()
        self.__save_model()

    def __plot_all(self, y_test_pred):
        """
        Controls the visualization and metrics outputs.
        Comment out anything you don't want to plot.
        """
        self.__plot_correlation_matrix()
        # self.__plot_feature_vs_feature()
        self.__plot_feature_importance()
        self.__plot_classification_report(y_test_pred=y_test_pred)
        self.__plot_confusion_matrix(y_test_pred=y_test_pred)
        self.__plot_roc_curve(y_test_pred=y_test_pred)
        self.__plot_accuracies_and_errors()

    """training & testing methods"""

    def __train_with_cross_validation(self):
        """
        Performs KFold cross-validation with sk-learn and initiates the cnvrg
        experiment with all its metrics.
        """
        scores = cross_validate(estimator=self.__model,
                                X=self.__x_train.values,
                                y=self.__y_train.values,
                                cv=self.__cross_val_folds,
                                return_train_score=True,
                                scoring=['neg_mean_squared_error', 'accuracy'],
                                return_estimator=True)
        train_acc_cv = scores['train_accuracy']
        train_err_cv = (-1) * scores['train_neg_mean_squared_error']
        val_acc_cv = scores['test_accuracy']
        val_err_cv = (-1) * scores['test_neg_mean_squared_error']
        self.__model = scores['estimator'][-1]
        y_pred = self.__model.predict(self.__x_test.values)
        test_acc = accuracy_score(self.__y_test.values, y_pred)
        test_loss = zero_one_loss(self.__y_test.values, y_pred)
        self.__metrics.update({'train_acc': train_acc_cv,
                               'train_loss': train_err_cv,
                               'train_loss_type': 'MSE',
                               'validation_acc': val_acc_cv,
                               'validation_loss': val_err_cv,
                               'validation_loss_type': 'MSE',
                               'test_acc': test_acc,
                               'test_loss': test_loss,
                               'test_loss_type': 'zero_one_loss'})
        self.__plot_all(y_pred)

    def __train_without_cross_validation(self):
        """Trains on the full train set and initiates the cnvrg experiment with all its metrics."""
        y_hat = self.__model.predict(self.__x_train.values)  # y_hat is a.k.a. y_pred
        train_acc = accuracy_score(self.__y_train, y_hat)
        train_loss = zero_one_loss(self.__y_train, y_hat)
        y_pred = self.__model.predict(self.__x_test.values)
        test_acc = accuracy_score(self.__y_test, y_pred)
        test_loss = zero_one_loss(self.__y_test, y_pred)
        self.__metrics.update({'train_acc': train_acc,
                               'train_loss': train_loss,
                               'train_loss_type': 'zero_one_loss',
                               'test_acc': test_acc,
                               'test_loss': test_loss,
                               'test_loss_type': 'zero_one_loss'})
        self.__plot_all(y_pred)

    def __plot_feature_importance(self):
        try:
            importance = getattr(self.__model, "feature_importances_")
            if self.__testing_mode is False:
                self.__experiment.log_chart('Feature Importance',
                                            x_axis='Features',
                                            y_axis='Importance',
                                            data=Bar(x=self.__features, y=importance))
            else:
                print(importance)
        except AttributeError:
            pass

    def __plot_classification_report(self, y_test_pred):
        test_report = classification_report(self.__y_test, y_test_pred, output_dict=True)  # dict
        if self.__testing_mode is False:
            testing_report_as_array = self.__helper_plot_classification_report(test_report)
            self.__experiment.log_chart("Test Set - Classification Report",
                                        data=Heatmap(z=testing_report_as_array),
                                        y_ticks=self.__labels,
                                        x_ticks=["precision", "recall", "f1-score", "support"])
        else:
            print(test_report)

    def __plot_confusion_matrix(self, y_test_pred=None):
        """Plots the confusion matrix."""
        if self.__y_test is not None and y_test_pred is not None:
            confusion_mat_test = confusion_matrix(self.__y_test, y_test_pred)  # array
            confusion_mat_test = SKTrainer.__helper_plot_confusion_matrix(confusion_mat_test)
            if self.__testing_mode is False:
                self.__experiment.log_chart("Test Set - confusion matrix",
                                            data=Heatmap(z=confusion_mat_test))
            else:
                print(confusion_mat_test)

    def __plot_roc_curve(self, y_test_pred):
        if len(set(self.__y_test)) != 2:  # ROC is drawn for binary problems only.
            return
        fpr, tpr, _ = roc_curve(self.__y_test, y_test_pred)
        if self.__testing_mode is False:
            self.__experiment.log_metric(key='ROC curve', Ys=tpr.tolist(), Xs=fpr.tolist())
        else:
            print("FPRs: {fpr}\nTPRs: {tpr}".format(fpr=fpr, tpr=tpr))

    def __plot_correlation_matrix(self):
        data = self.__all_data_concatenated
        correlation = data.corr()
        self.__experiment.log_chart("correlation",
                                    [MatrixHeatmap(np.round(correlation.values, 2))],
                                    x_ticks=correlation.index.tolist(),
                                    y_ticks=correlation.index.tolist())

    def __plot_feature_vs_feature(self):
        data = self.__all_data_concatenated
        indexes = data.select_dtypes(include=["number"]).columns
        corr = data.corr()
        for idx, i in enumerate(indexes):
            for jdx, j in enumerate(indexes):
                if i == j or jdx < idx:
                    continue
                corr_val = abs(corr[i][j])
                # Plot only moderately-to-strongly correlated pairs, skipping
                # perfectly correlated ones.
                if corr_val == 1 or corr_val < 0.5:
                    continue
                droplines = data[[i, j]].notnull().all(1)
                x, y = data[droplines][[i, j]].values.transpose()
                self.__experiment.log_chart("{i}_against_{j}".format(i=i, j=j),
                                            [Scatterplot(x=x.tolist(), y=y.tolist())],
                                            title="{i} against {j}".format(i=i, j=j))

    def __plot_accuracies_and_errors(self):
        self.__plot_accuracies_and_errors_helper_rounding()
        if self.__testing_mode is True:
            self.__plot_accuracies_and_errors_helper_testing_mode()
        for p in ['model', 'test_acc', 'test_loss', 'test_loss_type',
                  'train set size', 'test set size', 'train_loss_type']:
            self.__experiment.log_param(p, self.__metrics[p])
        if self.__is_cross_val is True:
            self.__experiment.log_param("folds", self.__metrics['folds'])
            self.__experiment.log_param("validation_loss_type",
                                        self.__metrics['validation_loss_type'])
            metrics = ['train_acc', 'train_loss', 'validation_acc', 'validation_loss']
            for m in metrics:
                self.__experiment.log_metric(m, self.__metrics[m],
                                             grouping=[m] * len(self.__metrics[m]))
            return
        self.__experiment.log_param("train_acc", self.__metrics['train_acc'])
        self.__experiment.log_param("train_loss", self.__metrics['train_loss'])
        self.__experiment.log_param("train_loss_type", self.__metrics['train_loss_type'])

    def __save_model(self):
        output_model_name = self.__metrics['model']
        output_file_name = os.environ.get("CNVRG_WORKDIR") + "/" + output_model_name \
            if os.environ.get("CNVRG_WORKDIR") is not None else output_model_name
        pickle.dump(self.__model, open(output_file_name, 'wb'))

    """ --- Helpers --- """

    @staticmethod
    def __helper_plot_confusion_matrix(confusion_matrix, digits_to_round=3):
        output = []
        for y in range(len(confusion_matrix)):
            for x in range(len(confusion_matrix[y])):
                output.append((x, y, round(float(confusion_matrix[x][y]), digits_to_round)))
        return output

    def __plot_accuracies_and_errors_helper_rounding(self, digits_to_round=3):
        for key in self.__metrics.keys():
            # Skip strings.
            if isinstance(self.__metrics[key], str):
                continue
            # Lists & arrays.
            elif isinstance(self.__metrics[key], (list, np.ndarray)):
                if isinstance(self.__metrics[key], np.ndarray):
                    self.__metrics[key] = self.__metrics[key].tolist()
                for ind in range(len(self.__metrics[key])):
                    self.__metrics[key][ind] = round(self.__metrics[key][ind], digits_to_round)
            # ints & floats.
            else:
                self.__metrics[key] = round(self.__metrics[key], digits_to_round)

    def __plot_accuracies_and_errors_helper_testing_mode(self, digits_to_round=3):
        print("Model: {model}\n"
              "train_acc={train_acc}\n"
              "train_loss={train_loss}\n"
              "test_acc={test_acc}\n"
              "test_loss={test_loss}".format(model=self.__metrics['model'],
                                             train_acc=self.__metrics['train_acc'],
                                             train_loss=self.__metrics['train_loss'],
                                             test_acc=self.__metrics['test_acc'],
                                             test_loss=self.__metrics['test_loss']))
        if self.__is_cross_val is True:
            print("Folds: {folds}\n".format(folds=self.__metrics['folds']))

    def __helper_plot_classification_report(self, classification_report_dict, digits_to_round=3):
        """Converts the dictionary given by classification_report to a list of lists."""
        rows = []
        for k, v in classification_report_dict.items():
            if k in self.__labels:
                rows.append(list(v.values()))
        values = []
        for y in range(len(rows)):
            for x in range(len(rows[y])):
                values.append((x, y, round(rows[y][x], digits_to_round)))
        return values
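
if __name__ == '__main__':
    # Illustrative usage (not in the original source). Assumes a cnvrg job so
    # Experiment() succeeds; a binary dataset is chosen so the ROC curve is drawn.
    from sklearn.datasets import load_breast_cancer
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_selection import train_test_split

    cancer = load_breast_cancer(as_frame=True)
    x_train, x_test, y_train, y_test = train_test_split(
        cancer.data, cancer.target, test_size=0.2, random_state=0)
    SKTrainer(model=GradientBoostingClassifier(),
              train_set=(x_train, y_train),
              test_set=(x_test, y_test),
              output_model_name='gbc_model.sav',
              testing_mode=False,
              folds=3).run()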
# --- CSVProcessor (CSV pre-processing: missing values, scaling, one-hot) ---
# Imports below are assumed; the cnvrg chart import path is a best guess.
import time

import numpy as np
import pandas as pd

import cnvrg
from cnvrg import Experiment
from cnvrg.charts import MatrixHeatmap


class CSVProcessor:

    def __init__(self, path_to_csv, target_column=None, missing_dict=None, scale_dict=None,
                 normalize_list=None, one_hot_list=None, output_name=None, plot_vis=False):
        """
        :param path_to_csv: string
        :param target_column: string
        :param missing_dict: dict
        :param scale_dict: dict
        :param normalize_list: list
        :param one_hot_list: list
        :param output_name: string
        """
        self.__cnvrg_env = True  # turned False when testing locally.
        self.__data = pd.read_csv(path_to_csv, index_col=0)
        self.__target_column = (target_column, self.__data[target_column]) \
            if target_column is not None \
            else (self.__data.columns[-1], self.__data[self.__data.columns[-1]])
        self.__features = [f for f in list(self.__data.columns) if f != self.__target_column[0]]
        self.__data = self.__data[self.__features]  # removes the target column.
        try:
            self.__experiment = Experiment()
        except cnvrg.modules.errors.UserError:
            self.__cnvrg_env = False
        self.__normalize_list = CSVProcessor.__parse_list(normalize_list) \
            if isinstance(normalize_list, str) else normalize_list
        self.__one_hot_list = CSVProcessor.__parse_list(one_hot_list) \
            if isinstance(one_hot_list, str) else one_hot_list
        self.__output_name = output_name if output_name is not None \
            else path_to_csv.split('.csv')[0] + '_processed.csv'
        self.__plot_vis = plot_vis
        # Parsed from a list of lists instead of a dictionary literal:
        self.__scale_dict = CSVProcessor.__parse_2d_list(scale_dict) \
            if isinstance(scale_dict, str) else scale_dict
        self.__missing_dict = CSVProcessor.__parse_2d_list(missing_dict) \
            if isinstance(missing_dict, str) else missing_dict

    def run(self):
        self.__handle_missing()
        self.__one_hot_encoding_aka_dummy()
        self.__scale()
        self.__normalize()
        self.__set_target_column()
        self.__save()
        if self.__cnvrg_env:
            self.__plot_metrics()  # uses cnvrg.
            self.__plot_visualization(plot_correlation=True)  # uses cnvrg.
        self.__check_nulls_before_output()

    def __scale(self):
        # Linearly maps values from the range [r_min, r_max] to [t_min, t_max].
        scale = lambda m, r_min, r_max, t_min, t_max: \
            (((m - r_min) / (r_max - r_min)) * (t_max - t_min)) + t_min
        if self.__scale_dict is not None:
            scale_all = False
            # The original compared against set('all') (i.e. {'a', 'l'}); the
            # intended check is for the single key 'all'.
            if set(self.__scale_dict.keys()) == {'all'}:
                scale_all = True
            columns_to_scale = self.__features if scale_all is True else self.__scale_dict.keys()
            for col in columns_to_scale:
                y, x = (self.__data[col].min(), self.__data[col].max()) if scale_all \
                    else CSVProcessor.__scale_helper(self.__scale_dict[col])
                self.__data[col] = scale(self.__data[col], self.__data[col].min(),
                                         self.__data[col].max(), y, x)

    def __normalize(self):
        if self.__normalize_list is not None:
            normalize_all = False
            if set(self.__normalize_list) == {'all'}:  # same fix as in __scale.
                normalize_all = True
            columns_to_scale = self.__features if normalize_all is True else self.__normalize_list
            for col in columns_to_scale:
                min_range, max_range = self.__data[col].min(), self.__data[col].max()
                self.__data[col] -= min_range
                self.__data[col] /= (max_range - min_range)

    def __one_hot_encoding_aka_dummy(self):
        """Handles dummy (one-hot) encoding."""
        if self.__one_hot_list is not None:
            self.__data = pd.get_dummies(self.__data, columns=self.__one_hot_list)

    def __handle_missing(self):
        """
        Options:
        1) fill_X      (fill with the value X)
        2) drop        (drop rows with missing values)
        3) avg         (fill with the column mean)
        4) med         (short for median)
        5) randint_A_B (fill with a random value in the range [A, B])
        """
        if self.__missing_dict is not None:
            handle_all, task_all = False, None
            if set(self.__missing_dict.keys()) == {'all'}:  # same fix as in __scale.
                handle_all, task_all = True, self.__missing_dict['all']
            column_to_handle = self.__features if handle_all is True else self.__missing_dict.keys()
            for col in column_to_handle:
                task = task_all if task_all is not None else self.__missing_dict[col]
                if task.startswith('fill_'):
                    value = float(task[len('fill_'):]) if '.' in task[len('fill_'):] \
                        else int(task[len('fill_'):])
                    self.__data[col] = self.__data[col].fillna(value)
                elif task.startswith('drop'):
                    self.__data = self.__data[self.__data[col].notna()]
                elif task.startswith('avg'):
                    self.__data[col] = self.__data[col].fillna(self.__data[col].mean())
                elif task.startswith('med'):
                    self.__data[col] = self.__data[col].fillna(self.__data[col].median())
                elif task.startswith('randint_'):
                    a, b = task[len('randint_'):].split('_')
                    a, b = float(a) if '.' in a else int(a), float(b) if '.' in b else int(b)
                    self.__data[col] = self.__data[col].fillna(np.random.randint(a, b))
                else:
                    raise ValueError('Missing Values Handling - Undefined task.')

    def __set_target_column(self):
        self.__data[self.__target_column[0]] = self.__target_column[1]

    def __plot_metrics(self):
        self.__experiment.log_param("output_file", self.__output_name)

    def __plot_visualization(self, plot_correlation=True):
        if self.__plot_vis is False:
            return
        # Tasks:
        if plot_correlation:
            self.__plot_correlation_matrix()

    def __save(self):
        self.__data.to_csv(self.__output_name)

    def __check_nulls_before_output(self):
        # Check for empty and NaN values to warn the user.
        time.sleep(8)
        nulls_report = dict(self.__data.isnull().sum())
        features_with_null_values = [k for k, v in nulls_report.items() if v != 0]
        # if len(features_with_null_values) != 0:
        #     warnings.warn("Null values or empty cells in the data set.", UserWarning)
        return

    """ ------------------- """
    """ ----- Helpers ----- """
    """ ------------------- """

    @staticmethod
    def __parse_2d_list(as_string):
        final_dict = {}
        trimmed = as_string.replace(' ', '')
        comma_idxs = [0] + [i for i in range(1, len(trimmed))
                            if trimmed[i] == ',' and trimmed[i - 1] == ']'
                            and trimmed[i + 1] == '['] + [len(trimmed) - 1]
        # If no separating commas were found, we have a single inner array.
        sub_lists = [trimmed[comma_idxs[i - 1] + 1:comma_idxs[i]]
                     for i in range(1, len(comma_idxs))] \
            if len(comma_idxs) > 2 else [trimmed[1:-1]]
        for sub_list in sub_lists:
            parsed = CSVProcessor.__parse_list(sub_list)
            try:
                final_dict[parsed[0]] = (parsed[1], parsed[2])  # for scaling.
            except IndexError:
                final_dict[parsed[0]] = parsed[1]  # for filling empty values.
        return final_dict

    @staticmethod
    def __parse_list(list_as_string):
        if list_as_string == '[]':
            return []
        list_without_parenthesis = list_as_string.strip()[1:-1]
        parsed_list = [st.strip() for st in list_without_parenthesis.split(',')]
        # Check whether the values are column numbers.
        try:
            parsed_list = [int(st) for st in parsed_list]
        except ValueError:
            pass
        return parsed_list

    @staticmethod
    def __parse_dict(dict_as_string):
        if dict_as_string == '{}':
            return {}
        final_key = dict()
        parsed_dict = eval(dict_as_string)
        if not isinstance(parsed_dict, dict):
            raise TypeError('Given a {} instead of a dictionary.'.format(type(parsed_dict)))
        all_keys = parsed_dict.keys()
        for k in all_keys:
            true_key, true_value = k, parsed_dict[k].split(':')
            true_key = true_key.strip()
            final_key[true_key] = true_value
        return final_key

    @staticmethod
    def __scale_helper(value):
        # Parenthesized: the original's precedence made min_val the whole
        # conditional expression and max_val always value[1].
        min_val, max_val = value.split(':') if isinstance(value, str) else (value[0], value[1])
        # The values may already be numbers (see __parse_list), so only strings
        # are inspected for a decimal point.
        min_val = float(min_val) if (isinstance(min_val, str) and '.' in min_val) else int(min_val)
        max_val = float(max_val) if (isinstance(max_val, str) and '.' in max_val) else int(max_val)
        return min_val, max_val

    def __plot_correlation_matrix(self, digits_to_round=3):
        correlation = self.__data.corr()
        self.__experiment.log_chart(
            "Correlation",
            [MatrixHeatmap(np.round(correlation.values, digits_to_round))],
            x_ticks=correlation.index.tolist(),
            y_ticks=correlation.index.tolist())
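
if __name__ == '__main__':
    # Illustrative usage (not in the original source). The CSV path and column
    # names are hypothetical; the string arguments exercise the parsers above
    # (e.g. '[[age,avg],[income,fill_0]]' fills `age` with its mean and
    # `income` with 0).
    processor = CSVProcessor(path_to_csv='dataset.csv',
                             target_column='label',
                             missing_dict='[[age,avg],[income,fill_0]]',
                             normalize_list='[height,weight]',
                             one_hot_list='[city]',
                             output_name='dataset_processed.csv',
                             plot_vis=False)
    processor.run()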