Example #1
# Imports assumed by this snippet: KFold and the metrics come from
# scikit-learn; Experiment is the cnvrg.io SDK's experiment tracker.
import os
import pickle

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, mean_squared_error
from cnvrg import Experiment


def train_with_cross_validation(model, train_set, test_set, folds, project_dir,
                                output_model_name):
    train_acc, train_loss = [], []
    kf = KFold(n_splits=folds)
    X, y = train_set
    # --- Training.
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        model.fit(X_train, y_train)
        # Grow the ensemble by one estimator per fold (only meaningful for
        # ensemble models constructed with warm_start=True).
        model.n_estimators += 1
        y_hat = model.predict(X_val)  # y_hat, i.e. the validation predictions
        acc = accuracy_score(y_val, y_hat)
        loss = mean_squared_error(y_val, y_hat)

        train_acc.append(acc)
        train_loss.append(loss)
    # --- Testing.
    X_test, y_test = test_set
    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)
    test_loss = mean_squared_error(y_test, y_pred)

    exp = Experiment()
    exp.log_param("model", output_model_name)
    exp.log_param("folds", folds)
    exp.log_metric("train_acc", train_acc)
    exp.log_metric("train_loss", train_loss)
    exp.log_param("test_acc", test_acc)
    exp.log_param("test_loss", test_loss)

    # Save model (a context manager ensures the file handle is closed).
    output_file_name = os.path.join(project_dir, output_model_name) if project_dir is not None else output_model_name
    with open(output_file_name, 'wb') as f:
        pickle.dump(model, f)
Example #2
    def log_trial_result(self, iteration, trial, result):
        e = CNVRGExperiment(self._cnvrg_experiments[trial.trial_id])
        e.log(str(result))
        if not self._cnvrg_metrics:
            self._cnvrg_metrics = list(result)

        training_iteration = result['training_iteration']
        for key in self._cnvrg_metrics:
            try:
                value = float(result[key])
            except (ValueError, TypeError):
                continue
            e.log_metric(key, value, training_iteration)
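For context, a minimal sketch of the class this method could sit on. The names CNVRGExperiment, _cnvrg_experiments and _cnvrg_metrics come from the snippet itself; deriving from Ray Tune's LoggerCallback is an assumption, since the class definition is not shown:

from ray.tune.logger import LoggerCallback


class CnvrgLoggerCallback(LoggerCallback):  # hypothetical name
    def __init__(self):
        # Maps trial ids to the ids of previously created cnvrg experiments.
        self._cnvrg_experiments = {}
        # Filled lazily with the metric keys of the first reported result.
        self._cnvrg_metrics = []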
Example #3
def train_with_cross_validation(model, train_set, test_set, folds, project_dir,
                                output_model_name):
    """
	This method enables sklearn algorithms to perform KFold-cross-validation.
	The method also initates the cnvrg.io experiment with all its metrics.
	:param model: SKlearn model object (initiated).
	:param train_set: tuple. (X_train, y_train). This is going to be used as a training set.
	:param test_set: tuple. (X_test, y_test). This is going to be used as a test set.
	:param folds: number of splits in the cross validation.
	:param project_dir: the path to the directory which indicates where to save the model.
	:param output_model_name: the name of the output model saved on the disk.
	:return: nothing.
	"""
    train_acc, train_loss = [], []
    kf = KFold(n_splits=folds)
    X, y = train_set

    # --- Training.
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        model.fit(X_train, y_train)
        # Grow the ensemble by one estimator per fold (only meaningful for
        # ensemble models constructed with warm_start=True).
        model.n_estimators += 1

        y_hat = model.predict(X_val)  # y_hat, i.e. the validation predictions

        acc = accuracy_score(y_val, y_hat)
        loss = mean_squared_error(y_val, y_hat)

        train_acc.append(acc)
        train_loss.append(loss)

    # --- Testing.
    X_test, y_test = test_set
    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)
    test_loss = mean_squared_error(y_test, y_pred)

    exp = Experiment()
    exp.log_param("model", output_model_name)
    exp.log_param("folds", folds)
    exp.log_metric("train_acc", train_acc)
    exp.log_metric("train_loss", train_loss)
    exp.log_param("test_acc", test_acc)
    exp.log_param("test_loss", test_loss)

    # Save model (a context manager ensures the file handle is closed).
    output_file_name = os.path.join(project_dir, output_model_name) if project_dir is not None else output_model_name
    with open(output_file_name, 'wb') as f:
        pickle.dump(model, f)
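A hedged usage sketch for the function above (the dataset and names are invented; warm_start=True is an assumption required for the model.n_estimators += 1 trick to actually grow the ensemble, and Experiment() presumes a cnvrg.io environment):

import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)

# warm_start=True makes each fold add one tree instead of refitting from scratch.
model = RandomForestClassifier(n_estimators=10, warm_start=True, random_state=0)
train_with_cross_validation(model, (X_tr, y_tr), (X_te, y_te), folds=5,
                            project_dir=None, output_model_name='model.sav')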
Example #4
class TensorflowTrainer:
    GRAYSCALE_CHANNELS, RGB_CHANNELS = 1, 3
    VERBOSE = 1
    WORKERS = 3
    fully_connected_layers = [1024, 512, 256]

    def __init__(self, arguments, model_name, base_model):
        self.__cnvrg_env = True
        self.__arguments = cast_input_types(arguments)
        self.__shape = (arguments.image_height, arguments.image_width)
        self.__classes = parse_classes(arguments.data)
        self.__channels = TensorflowTrainer.RGB_CHANNELS if arguments.image_color == 'rgb' \
         else TensorflowTrainer.GRAYSCALE_CHANNELS
        self.__model = ModelGenerator(
            base_model=base_model,
            num_of_classes=len(self.__classes),
            fully_connected_layers=TensorflowTrainer.fully_connected_layers,
            loss_function=arguments.loss,
            dropout=arguments.dropout,
            activation_hidden_layers=arguments.hidden_layer_activation,
            activation_output_layers=arguments.output_layer_activation,
            optimizer=arguments.optimizer).get_model()
        try:
            self.__experiment = Experiment()
        except cnvrg.modules.UserError:
            self.__cnvrg_env = False
        self.__metrics = {
            'tensorflow local version': tf.__version__,
            'GPUs found':
            len(tf.config.experimental.list_physical_devices('GPU')),
            'Model': model_name,
            'Classes list': self.__classes
        }

    def run(self):
        if self.__cnvrg_env:
            self.__plot_all(status='pre-training')  ### using cnvrg.
        self.__train()
        self.__test()
        if self.__cnvrg_env:
            self.__plot_all()  ### using cnvrg.
            self.__export_model()  ### using cnvrg.

    def __plot_all(self, status='post-test'):
        if status == 'pre-training':
            self.__plot_metrics(status='pre-training')
        elif status == 'post-test' and self.__arguments.data_test is not None:
            self.__plot_metrics(status='post-test')
            self.__plot_confusion_matrix(self.__labels, self.__predictions)

    def __train(self):
        train_generator, val_generator = load_generator(
            self.__arguments.data, self.__shape, self.__arguments.test_size,
            self.__arguments.image_color, self.__arguments.batch_size)

        steps_per_epoch_training = self.__arguments.steps_per_epoch
        steps_per_epoch_validation = self.__arguments.steps_per_epoch

        start_time = time.time()
        time_callback = TimeHistory()

        print("---start training---")
        self.__model.fit(train_generator,
                         epochs=self.__arguments.epochs,
                         workers=multiprocessing.cpu_count() - 1,
                         verbose=TensorflowTrainer.VERBOSE,
                         steps_per_epoch=steps_per_epoch_training,
                         validation_data=val_generator,
                         validation_steps=steps_per_epoch_validation,
                         use_multiprocessing=True,
                         callbacks=[time_callback])
        print("---End training---")

        training_time = time.strftime("%H:%M:%S",
                                      time.gmtime(time.time() - start_time))
        self.__metrics['training_time'] = training_time

        if self.__cnvrg_env:
            self.__experiment.log_metric(
                key="Epoch Times",
                Ys=time_callback.times,
                Xs=[i for i in range(1, self.__arguments.epochs + 1)],
                x_axis="Epoch",
                y_axis="Time (Seconds)")

    def __test(self):
        if self.__arguments.data_test is None:
            return
        test_gen = load_generator(self.__arguments.data_test,
                                  self.__shape,
                                  image_color=self.__arguments.image_color,
                                  batch_size=self.__arguments.batch_size,
                                  generate_test_set=True)
        self.__predictions = np.argmax(self.__model.predict(test_gen), axis=1)
        self.__labels = test_gen.classes

        # Evaluate over the number of batches (not samples); evaluate() is
        # the non-deprecated replacement for evaluate_generator() in TF 2.
        steps_per_epoch_testing = (test_gen.n + test_gen.batch_size - 1) // test_gen.batch_size
        test_loss, test_acc = self.__model.evaluate(
            test_gen,
            workers=TensorflowTrainer.WORKERS,
            verbose=TensorflowTrainer.VERBOSE,
            steps=steps_per_epoch_testing)
        test_acc, test_loss = round(float(test_acc), 3), round(float(test_loss), 3)
        self.__metrics['test_acc'] = test_acc
        self.__metrics['test_loss'] = test_loss

    def __export_model(self):
        workdir = os.environ.get("CNVRG_WORKDIR")
        output_file_name = os.path.join(workdir, self.__arguments.output_model) if workdir is not None else self.__arguments.output_model
        self.__model.save(output_file_name)
        export_labels_dictionary_from_classes_list(self.__classes)

    """ Cnvrg metrics output """

    def __plot_metrics(self, status='pre-training'):
        """
        :param status: (String) either 'pre-training' or 'post-test'.
        """
        if status == 'pre-training':
            print('Plotting pre-training metrics:')
            for k, v in self.__metrics.items():
                if k not in ['test_acc', 'test_loss']:
                    self.__experiment.log_param(k, v)
        elif status == 'post-test':
            print('Plotting post-test metrics:')
            for k, v in self.__metrics.items():
                if k in ['test_acc', 'test_loss']:
                    self.__experiment.log_param(k, v)
        else:
            raise ValueError('Unrecognized status.')

    def __plot_confusion_matrix(self, labels, predictions):
        """ Plots the confusion matrix. """
        confusion_mat_test = confusion_matrix(labels, predictions)  # array
        confusion_mat_test = TensorflowTrainer.__helper_plot_confusion_matrix(
            confusion_mat_test,
            mat_x_ticks=self.__classes,
            mat_y_ticks=self.__classes)
        self.__experiment.log_chart("confusion matrix",
                                    data=Heatmap(z=confusion_mat_test))

    @staticmethod
    def __helper_plot_confusion_matrix(confusion_matrix,
                                       mat_x_ticks=None,
                                       mat_y_ticks=None,
                                       digits_to_round=3):
        """
		:param confusion_matrix: the values in the matrix.
		:param mat_x_ticks, mat_y_ticks: ticks for the axis of the matrix.
		"""
        output = []
        for y in range(len(confusion_matrix)):
            for x in range(len(confusion_matrix[y])):
                x_val = x if mat_x_ticks is None else mat_x_ticks[x]
                y_val = y if mat_y_ticks is None else mat_y_ticks[y]
                output.append((x_val, y_val,
                               round(float(confusion_matrix[x][y]),
                                     digits_to_round)))
        return output
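Both TensorflowTrainer variants depend on a TimeHistory callback that is not shown here. A minimal sketch of what such a Keras callback typically looks like (only the .times attribute is read by the trainers; the real helper may differ):

import time
from tensorflow.keras.callbacks import Callback


class TimeHistory(Callback):
    """Records each epoch's wall-clock duration (in seconds) in self.times."""

    def on_train_begin(self, logs=None):
        self.times = []

    def on_epoch_begin(self, epoch, logs=None):
        self._epoch_start = time.time()

    def on_epoch_end(self, epoch, logs=None):
        self.times.append(time.time() - self._epoch_start)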
Example #5
class SKTrainerRegression:
	DIGITS_TO_ROUND = 3

	REGRESSION_TYPE = ['linear', 'logistic']

	def __init__(self, model, train_set, test_set, output_model_name, testing_mode, folds=None, regression_type=0):
		self.__model = model
		self.__x_train, self.__y_train = train_set
		self.__train_set_size = len(self.__y_train)
		self.__x_test, self.__y_test = test_set
		self.__test_set_size = len(self.__y_test)
		self.__testing_mode = testing_mode
		self.__cross_val_folds = folds
		self.__is_cross_val = (folds is not None)
		self.__features = list(self.__x_train.columns)
		self.__labels = [str(l) for l in list(set(self.__y_train).union(set(self.__y_test)))]
		self.__metrics = {'model': output_model_name}
		self.__y_pred = None
		self.__experiment = Experiment()
		self.__regression_type = SKTrainerRegression.REGRESSION_TYPE[regression_type]

		self.__coef, self.__intercept = None, None

	def run(self):
		self.__model.fit(self.__x_train, self.__y_train)

		# Not every sklearn estimator exposes these attributes.
		self.__coef = getattr(self.__model, 'coef_', None)
		self.__intercept = getattr(self.__model, 'intercept_', None)

		if self.__is_cross_val:
			self.__metrics['folds'] = self.__cross_val_folds
			self.__train_with_cross_validation()
		else:
			self.__train_without_cross_validation()
		self.__save_model()

	def __plot_all(self, y_test_pred):
		self.__plot_accuracies_and_errors()
		# self.__plot_regression_function()
		self.__plot_feature_importance()
		self.__plot_correlation_matrix()
		# self.__plot_feature_vs_feature()

	def __train_with_cross_validation(self):
		"""
		This method enables sk-learn algorithms to perform KFold-cross-validation.
		The method also initiates the cnvrg experiment with all its metrics.
		"""
		# Note: the 'accuracy' scorer is only meaningful for the 'logistic'
		# (classification) case; it fails for continuous regression targets.
		scores = cross_validate(estimator=self.__model,
								X=self.__x_train,
								y=self.__y_train,
								cv=self.__cross_val_folds,
								return_train_score=True,
								scoring=['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2', 'accuracy'],
								return_estimator=True)

		train_err_cv_mse = (-1) * scores['train_neg_mean_squared_error']
		train_err_cv_mae = (-1) * scores['train_neg_mean_absolute_error']
		train_err_cv_r2 = scores['train_r2']

		val_acc_cv = scores['test_accuracy']
		val_err_cv_mse = (-1) * scores['test_neg_mean_squared_error']
		val_err_cv_mae = (-1) * scores['test_neg_mean_absolute_error']
		val_err_cv_r2 = scores['test_r2']

		self.__model = scores['estimator'][-1]  # keep the estimator fitted on the last fold
		self.__y_pred = self.__model.predict(self.__x_test)
		test_acc = accuracy_score(self.__y_test, self.__y_pred)
		test_loss = mean_squared_error(self.__y_test, self.__y_pred)
		self.__metrics.update({
			'train_loss_mae': train_err_cv_mae,
			'train_loss_mse': train_err_cv_mse,
			'train_loss_r2': train_err_cv_r2,
			'validation_acc': val_acc_cv,
			'val_loss_mae': val_err_cv_mae,
			'val_loss_mse': val_err_cv_mse,
			'val_loss_r2': val_err_cv_r2,
			'test_acc': test_acc,
			'test_loss_mse': test_loss})
		self.__plot_all(self.__y_pred)

	def __train_without_cross_validation(self):
		"""
		The method also initiates the cnvrg experiment with all its metrics.
		"""
		y_hat = self.__model.predict(self.__x_train)  # y_hat, i.e. the training-set predictions

		train_loss_MSE = mean_squared_error(self.__y_train, y_hat)
		train_loss_MAE = mean_absolute_error(self.__y_train, y_hat)
		train_loss_R2 = r2_score(self.__y_train, y_hat)
		self.__y_pred = self.__model.predict(self.__x_test)
		test_loss_MSE = mean_squared_error(self.__y_test, self.__y_pred)
		test_loss_MAE = mean_absolute_error(self.__y_test, self.__y_pred)
		test_loss_R2 = r2_score(self.__y_test, self.__y_pred)
		self.__metrics.update({
			'train_loss_mae': train_loss_MAE,
			'train_loss_mse': train_loss_MSE,
			'train_loss_r2': train_loss_R2,
			'test_loss_mse': test_loss_MSE,
			'test_loss_mae': test_loss_MAE,
			'test_loss_r2': test_loss_R2})
		self.__plot_all(self.__y_pred)

	def __plot_regression_function(self):
		if self.__regression_type == 'linear':
			a, b = self.__coef[0], self.__intercept
			x = np.linspace(-100, 100, 200)
			y = a * x + b
		elif self.__regression_type == 'logistic':
			x = np.linspace(-100, 100, 200)
			y = 1 / (1 + np.exp(-x))
		self.__experiment.log_metric(key="Regression Function", Xs=x.tolist(), Ys=y.tolist(), grouping=['regression line'] * len(x))

	def __plot_feature_importance(self):
		try:
			importance = getattr(self.__model, "feature_importances_")
			if self.__testing_mode is False:
				self.__experiment.log_chart('Feature Importance', x_axis='Features', y_axis='Importance', data=Bar(x=self.__features, y=importance))
			else:
				print(importance)
		except AttributeError:
			pass

	def __plot_accuracies_and_errors(self):
		self.__plot_accuracies_and_errors_helper()
		if self.__testing_mode is True:
			# Print whichever metrics were collected (the keys differ between
			# the cross-validation and plain-training paths).
			for k, v in self.__metrics.items():
				print("{key}={value}".format(key=k, value=v))
			if self.__is_cross_val is True:
				print("Folds: {folds}\n".format(folds=self.__metrics['folds']))
		else:  # testing mode is off.
			for k, v in self.__metrics.items():
				if isinstance(v, (list, np.ndarray)):
					self.__experiment.log_metric(k, list(v))
				else:
					self.__experiment.log_param(k, v)

	def __plot_accuracies_and_errors_helper(self):
		for k, v in self.__metrics.items():
			if isinstance(v, float):
				self.__metrics[k] = round(self.__metrics[k], SKTrainerRegression.DIGITS_TO_ROUND)

	def __save_model(self):
		output_model_name = self.__metrics['model']
		workdir = os.environ.get("CNVRG_WORKDIR")
		output_file_name = os.path.join(workdir, output_model_name) if workdir is not None else output_model_name
		with open(output_file_name, 'wb') as f:
			pickle.dump(self.__model, f)

	"""training & testing methods"""

	def __plot_correlation_matrix(self):
		data = pd.concat([pd.concat([self.__x_train, self.__x_test], axis=0), pd.concat([self.__y_train, self.__y_test], axis=0)], axis=1)
		correlation = data.corr()
		self.__experiment.log_chart("correlation", [MatrixHeatmap(np.round(correlation.values, 2))],
									x_ticks=correlation.index.tolist(), y_ticks=correlation.index.tolist())

	def __plot_feature_vs_feature(self):
		data = pd.concat([pd.concat([self.__x_train, self.__x_test], axis=0), pd.concat([self.__y_train, self.__y_test], axis=0)], axis=1)
		indexes = data.select_dtypes(include=["number"]).columns
		corr = data.corr()
		for idx, i in enumerate(indexes):
			for jdx, j in enumerate(indexes):
				if i == j: continue
				if jdx < idx: continue
				corr_val = abs(corr[i][j])
				if corr_val == 1 or corr_val < 0.5: continue  # skip perfect or weak correlations
				print("create", i, "against", j, "scatter chart")
				droplines = data[[i, j]].notnull().all(1)
				x, y = data[droplines][[i, j]].values.transpose()
				self.__experiment.log_chart("{i}_against_{j}".format(i=i, j=j),
											[Scatterplot(x=x.tolist(), y=y.tolist())],
											title="{i} against {j}".format(i=i, j=j))
Example #6
class TensorflowTrainer:
    GRAYSCALE_CHANNELS = 1
    RGB_CHANNELS = 3
    VERBOSE = 1
    WORKERS = 3
    fully_connected_layers = [1024, 512, 256]

    METRICS = {
        'pre-training': [
            'TensorFlow version',
            'GPUs found',
            'Model',
            # 'Classes list'
        ],
        'post-training': [
            'training_time',
            # 'epochs_duration',
            # 'avg_time_per_epoch',
            # 'time_per_step'
        ],
        'post-test': ['test_acc', 'test_loss']
    }

    def __init__(self, arguments, model_name, base_model):
        self.__cnvrg_env = True
        self.__arguments = arguments
        self.__shape = (arguments.image_height, arguments.image_width)
        self.__classes = parse_classes(arguments.data)
        self.__channels = TensorflowTrainer.RGB_CHANNELS if arguments.image_color == 'rgb' \
         else TensorflowTrainer.GRAYSCALE_CHANNELS
        self.__model = ModelGenerator(
            base_model=base_model,
            num_of_classes=len(self.__classes),
            fully_connected_layers=TensorflowTrainer.fully_connected_layers,
            loss_function=arguments.loss,
            dropout=arguments.dropout,
            activation_hidden_layers=arguments.hidden_layer_activation,
            activation_output_layers=arguments.output_layer_activation,
            optimizer=arguments.optimizer).get_model()
        try:
            print("Trying to launch an experiment in cnvrg environment.")
            self.__experiment = Experiment()
        except Exception:
            print("Not in cnvrg environment.")
            self.__cnvrg_env = False

        self.__metrics = {
            'TensorFlow version': tf.__version__,
            'GPUs found':
            len(tf.config.experimental.list_physical_devices('GPU')),
            'Model': model_name,
            'Classes list': self.__classes
        }

    def run(self):
        self.__plot(status='pre-training')

        self.__train()
        self.__plot(status='post-training')

        self.__test()
        self.__plot(status='post-test')

        self.__export_model()

    def __plot(self, status):
        if status == 'pre-training':
            self.__plot_metrics(status='pre-training')

        elif status == 'post-training':
            self.__plot_metrics(status='post-training')

        elif status == 'post-test' and self.__arguments.data_test is not None:
            self.__plot_metrics(status='post-test')
            self.__plot_confusion_matrix(self.__labels, self.__predictions)

    def __train(self):
        train_generator, val_generator = load_generator(
            self.__arguments.data,
            self.__shape,
            self.__arguments.test_size,  # test_size = validation_split
            self.__arguments.image_color,
            self.__arguments.batch_size)

        start_time = time.time()
        time_callback = TimeHistory()

        print("--- Starts Training ---")

        # Tolerate truncated image files instead of raising mid-training.
        from PIL import ImageFile
        ImageFile.LOAD_TRUNCATED_IMAGES = True

        self.__model.fit(train_generator,
                         epochs=self.__arguments.epochs,
                         verbose=self.__arguments.verbose,
                         steps_per_epoch=self.__arguments.steps_per_epoch,
                         validation_data=val_generator
                         if self.__arguments.test_size != 0. else None,
                         validation_steps=self.__arguments.steps_per_epoch
                         if self.__arguments.test_size != 0. else None,
                         callbacks=[time_callback])

        print("--- Ends training ---")

        training_time = time.strftime("%H:%M:%S",
                                      time.gmtime(time.time() - start_time))
        self.__metrics['training_time'] = training_time
        self.__metrics['epochs_duration'] = Metric(key='Epochs Duration',
                                                   Ys=time_callback.times,
                                                   Xs='from_1',
                                                   x_axis='epochs',
                                                   y_axis='time (seconds)')
        self.__metrics['avg_time_per_epoch'] = round(
            sum(time_callback.times) / len(time_callback.times), 3)

        if self.__arguments.steps_per_epoch is not None:
            self.__metrics['time_per_step'] = Metric(
                key='Time per Step',
                Ys=[
                    round(
                        time_callback.times[i] /
                        self.__arguments.steps_per_epoch, 3)
                    for i in range(self.__arguments.epochs)
                ],
                Xs='from_1',
                x_axis='epochs',
                y_axis='time (ms)/step')

    def __test(self):
        if self.__arguments.data_test is None:
            return
        test_gen = load_generator(self.__arguments.data_test,
                                  self.__shape,
                                  image_color=self.__arguments.image_color,
                                  batch_size=self.__arguments.batch_size,
                                  generate_test_set=True)
        self.__predictions = np.argmax(self.__model.predict(test_gen), axis=1)
        self.__labels = test_gen.classes

        # Evaluate over the number of batches (not samples); evaluate() is
        # the non-deprecated replacement for evaluate_generator() in TF 2.
        steps_per_epoch_testing = (test_gen.n + test_gen.batch_size - 1) // test_gen.batch_size
        test_loss, test_acc = self.__model.evaluate(
            test_gen,
            workers=TensorflowTrainer.WORKERS,
            verbose=TensorflowTrainer.VERBOSE,
            steps=steps_per_epoch_testing)

        test_acc, test_loss = round(float(test_acc), 3), round(float(test_loss), 3)
        self.__metrics['test_acc'] = test_acc
        self.__metrics['test_loss'] = test_loss

    def __export_model(self):
        workdir = os.environ.get("CNVRG_WORKDIR")
        output_file_name = os.path.join(workdir, self.__arguments.output_model) if workdir is not None else self.__arguments.output_model
        self.__model.save(output_file_name)
        export_labels_dictionary_from_classes_list(self.__classes)

    # ============ Helpers ============

    def __plot_metrics(self, status):
        metrics = TensorflowTrainer.METRICS[status]

        if status == 'pre-training':
            for metric in metrics:
                if self.__cnvrg_env:
                    if metric in self.__metrics.keys():  # if metric exists
                        self.__experiment.log_param(metric,
                                                    self.__metrics[metric])
                else:
                    print("log_param -  {key} : {value}".format(
                        key=metric, value=self.__metrics[metric]))

        elif status == 'post-training':
            for metric in metrics:
                if metric in self.__metrics.keys():  # if metric exists
                    if not isinstance(self.__metrics[metric], Metric):  # param
                        if self.__cnvrg_env:
                            self.__experiment.log_param(
                                metric, self.__metrics[metric])
                        else:
                            print("log_param -  {key} : {value}".format(
                                key=metric, value=self.__metrics[metric]))
                    else:  # metrics should be called here.
                        if self.__cnvrg_env:
                            self.__experiment.log_metric(
                                key=self.__metrics[metric].key,
                                Ys=self.__metrics[metric].Ys,
                                Xs=self.__metrics[metric].Xs,
                                x_axis=self.__metrics[metric].x_axis,
                                y_axis=self.__metrics[metric].y_axis)
                        else:
                            print(self.__metrics[metric])

        elif status == 'post-test':
            for metric in metrics:
                if metric in self.__metrics.keys():  # if metric exists

                    if self.__cnvrg_env:
                        self.__experiment.log_param(metric,
                                                    self.__metrics[metric])
                    else:
                        print("log_param -  {key} : {value}".format(
                            key=metric, value=self.__metrics[metric]))

        else:
            raise ValueError('Unrecognized status.')

    def __plot_confusion_matrix(self, labels, predictions):
        """ Plots the confusion matrix. """
        confusion_mat_test = confusion_matrix(labels, predictions)  # array
        confusion_mat_test = TensorflowTrainer.__helper_plot_confusion_matrix(
            confusion_mat_test,
            mat_x_ticks=self.__classes,
            mat_y_ticks=self.__classes)
        self.__experiment.log_chart("confusion matrix",
                                    data=Heatmap(z=confusion_mat_test))

    @staticmethod
    def __helper_plot_confusion_matrix(confusion_matrix,
                                       mat_x_ticks=None,
                                       mat_y_ticks=None,
                                       digits_to_round=3):
        """
		:param confusion_matrix: the values in the matrix.
		:param mat_x_ticks, mat_y_ticks: ticks for the axis of the matrix.
		"""
        output = []
        for y in range(len(confusion_matrix)):
            for x in range(len(confusion_matrix[y])):
                x_val = x if mat_x_ticks is None else mat_x_ticks[x]
                y_val = y if mat_y_ticks is None else mat_y_ticks[y]
                output.append((x_val, y_val,
                               round(float(confusion_matrix[x][y]),
                                     digits_to_round)))
        return output
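__plot_metrics above reads Metric objects field by field, but the container itself is not shown. A sketch consistent with that usage (hypothetical; the real class may differ):

from collections import namedtuple

# Fields match what __plot_metrics accesses: key, Ys, Xs, x_axis, y_axis.
Metric = namedtuple('Metric', ['key', 'Ys', 'Xs', 'x_axis', 'y_axis'])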
Example #7
class SKTrainer:
	DIGITS_TO_ROUND = 3

	def __init__(self, model, train_set, test_set, output_model_name, testing_mode, folds=None):
		self.__model = model
		self.__x_train, self.__y_train = train_set
		self.__x_test, self.__y_test = test_set
		self.__output_model_name = output_model_name
		self.__testing_mode = testing_mode
		self.__cross_val_folds = folds
		self.__is_cross_val = (folds is not None)
		self.__features = list(self.__x_train.columns)
		self.__labels = [str(l) for l in list(set(self.__y_train).union(set(self.__y_test)))]

		self.__model.fit(self.__x_train, self.__y_train)
		# Not every sklearn model exposes feature_importances_; default to None.
		self.__importance = getattr(self.__model, 'feature_importances_', None)

		self.__experiment = Experiment()

		self.__metrics = {'model': self.__output_model_name}
		if self.__is_cross_val:
			self.__metrics['folds'] = self.__cross_val_folds

	def run(self):
		""" runs the training & testing methods. """
		if self.__is_cross_val is True:
			self.__train_with_cross_validation()
		else:
			self.__train_without_cross_validation()

	"""training & testing methods"""

	def __train_with_cross_validation(self):
		"""
		This method enables sk-learn algorithms to perform KFold-cross-validation.
		The method also initiates the cnvrg experiment with all its metrics.
		"""
		train_acc, train_loss = [], []
		kf = KFold(n_splits=self.__cross_val_folds)

		for train_index, val_index in kf.split(self.__x_train):
			X_train, X_val = self.__x_train.iloc[train_index, :], self.__x_train.iloc[val_index, :]
			y_train, y_val = self.__y_train.iloc[train_index], self.__y_train.iloc[val_index]
			self.__model = self.__model.fit(X_train, y_train)

			y_hat = self.__model.predict(X_val)  # y_hat, i.e. the validation predictions
			acc = accuracy_score(y_val, y_hat)
			loss = mean_squared_error(y_val, y_hat)

			train_acc.append(acc)
			train_loss.append(loss)

		# --- Testing.
		y_pred = self.__model.predict(self.__x_test)
		test_acc = accuracy_score(self.__y_test, y_pred)
		test_loss = mean_squared_error(self.__y_test, y_pred)
		# Keep the per-fold training metrics too, since the plotting and
		# rounding helpers below look up 'train_acc' and 'train_loss'.
		self.__metrics.update({
			'train_acc': train_acc,
			'train_loss': train_loss,
			'test_acc': test_acc,
			'test_loss': test_loss
		})
		self.__plot_all(y_pred)

	def __train_without_cross_validation(self):
		"""
		The method also initiates the cnvrg experiment with all its metrics.
		"""
		y_hat = self.__model.predict(self.__x_train)  # y_hat, i.e. the training-set predictions

		train_acc = accuracy_score(self.__y_train, y_hat)
		train_loss = mean_squared_error(self.__y_train, y_hat)

		y_pred = self.__model.predict(self.__x_test)
		test_acc = accuracy_score(self.__y_test, y_pred)
		test_loss = mean_squared_error(self.__y_test, y_pred)
		self.__metrics.update({
			'train_acc': train_acc,
			'train_loss': train_loss,
			'test_acc': test_acc,
			'test_loss': test_loss
		})
		self.__plot_all(y_pred)

	"""Plotting methods"""

	def __plot_feature_importance(self):
		if self.__importance is None:
			return
		if self.__testing_mode is False:
			self.__experiment.log_chart('Feature Importance', x_axis='Features', y_axis='Importance', data=Bar(x=self.__features, y=self.__importance))
		else:
			print(self.__importance)

	def __plot_classification_report(self, y_test_pred):
		test_report = classification_report(self.__y_test, y_test_pred, output_dict=True)  # dict
		if self.__testing_mode is False:
			testing_report_as_array = self.__helper_plot_classification_report(test_report)
			self.__experiment.log_chart("Test Set - Classification Report", data=Heatmap(z=testing_report_as_array), y_ticks=self.__labels, x_ticks=["precision", "recall", "f1-score", "support"])
		else:
			print(test_report)

	def __helper_plot_classification_report(self, classification_report_dict):
		""" Converts dictionary given by classification_report to list of lists. """
		rows = []
		for k, v in classification_report_dict.items():
			if k in self.__labels:
				rows.append(list(v.values()))
		values = []
		for y in range(len(rows)):
			for x in range(len(rows[y])):
				values.append((x, y, round(rows[y][x], SKTrainer.DIGITS_TO_ROUND)))
		return values

	def __plot_confusion_matrix(self, y_test_pred=None):
		if self.__y_test is not None and y_test_pred is not None:
			confusion_mat_test = confusion_matrix(self.__y_test, y_test_pred)  # array
			confusion_mat_test = self.__helper_plot_confusion_matrix(confusion_mat_test)
			if self.__testing_mode is False:
				self.__experiment.log_chart("Test Set - confusion matrix", data=Heatmap(z=confusion_mat_test))
			else:
				print(confusion_mat_test)

	def __helper_plot_confusion_matrix(self, confusion_matrix):
		output = []
		for y in range(len(confusion_matrix)):
			for x in range(len(confusion_matrix[y])):
				output.append((x, y, round(float(confusion_matrix[x][y]), SKTrainer.DIGITS_TO_ROUND)))
		return output

	def __plot_roc_curve(self, y_test_pred):
		n_classes = len(self.__labels)
		if n_classes != 2 or self.__testing_mode is True:
			return
		# Note: y_test_pred holds hard class predictions; probability scores
		# (e.g. predict_proba()[:, 1]) would yield a more informative curve.
		FPRs, TPRs, _ = roc_curve(self.__y_test, y_test_pred)
		self.__experiment.log_metric(key='ROC curve', Ys=TPRs.tolist(), Xs=FPRs.tolist())

	def __plot_pandas_analyzer(self):
		data = pd.concat([pd.concat([self.__x_train, self.__x_test], axis=0), pd.concat([self.__y_train, self.__y_test], axis=0)], axis=1)
		if self.__testing_mode is False:
			PandasAnalyzer(data, experiment=self.__experiment)

	def __plot_accuracies_and_errors(self):
		self.__plot_accuracies_and_errors_helper()

		if self.__testing_mode is True:
			print("Model: {model}\n"
				  "train_acc={train_acc}\n"
				  "train_loss={train_loss}\n"
				  "test_acc={test_acc}\n"
				  "test_loss={test_loss}".format(
				model=self.__metrics['model'], train_acc=self.__metrics['train_acc'], train_loss=self.__metrics['train_loss'],
				test_acc=self.__metrics['test_acc'], test_loss=self.__metrics['test_loss']))
			if self.__is_cross_val is True:
				print("Folds: {folds}\n".format(folds=self.__metrics['folds']))

		else:  # testing_mode is False
			self.__experiment.log_param("model", self.__metrics['model'])
			self.__experiment.log_param("test_acc", self.__metrics['test_acc'])
			self.__experiment.log_param("test_loss", self.__metrics['test_loss'])
			if self.__is_cross_val is True:
				self.__experiment.log_param("folds", self.__metrics['folds'])
				self.__experiment.log_metric("train_acc", self.__metrics['train_acc'])
				self.__experiment.log_metric("train_loss", self.__metrics['train_loss'])
				return
			self.__experiment.log_param("train_acc", self.__metrics['train_acc'])
			self.__experiment.log_param("train_loss", self.__metrics['train_loss'])

	def __plot_accuracies_and_errors_helper(self):
		"""Rounds all the values in self.__metrics (scalars or per-fold lists)."""
		keys_to_round = ['train_acc', 'train_loss', 'test_acc', 'test_loss']
		for key in keys_to_round:
			value = self.__metrics[key]
			if isinstance(value, list):
				self.__metrics[key] = [round(v, SKTrainer.DIGITS_TO_ROUND) for v in value]
			else:
				self.__metrics[key] = round(value, SKTrainer.DIGITS_TO_ROUND)

	def __plot_all(self, y_test_pred):
		"""
		Runs all the plotting methods.
		"""
		self.__plot_pandas_analyzer()
		self.__plot_feature_importance()
		self.__plot_classification_report(y_test_pred=y_test_pred)
		self.__plot_confusion_matrix(y_test_pred=y_test_pred)
		self.__plot_roc_curve(y_test_pred=y_test_pred)
		self.__plot_accuracies_and_errors()
		self.__save_model()

	"""technical methods"""

	def __save_model(self):
		project_path = os.environ.get("CNVRG_PROJECT_PATH")
		output_file_name = os.path.join(project_path, self.__output_model_name) if project_path is not None else self.__output_model_name
		with open(output_file_name, 'wb') as f:
			pickle.dump(self.__model, f)
		if not self.__testing_mode:
			os.system("ls -la {}".format(os.environ.get("CNVRG_PROJECT_PATH")))
Example #8
class SKTrainer:
	def __init__(self, model, train_set, test_set, output_model_name, testing_mode, folds=None):
		self.__model = model
		self.__x_train, self.__y_train = train_set
		self.__x_test, self.__y_test = test_set
		self.__all_data_concatenated = pd.concat([pd.concat([self.__x_train, self.__x_test], axis=0),
												  pd.concat([self.__y_train, self.__y_test], axis=0)], axis=1)
		self.__testing_mode = testing_mode
		self.__cross_val_folds = folds
		self.__is_cross_val = (folds is not None)
		self.__features = list(self.__x_train.columns)
		self.__labels = [str(l) for l in list(set(self.__y_train).union(set(self.__y_test)))]
		self.__metrics = {'model': output_model_name, 'train set size': len(self.__y_train), 'test set size': len(self.__y_test)}
		self.__experiment = Experiment()

	def run(self):
		""" runs the training & testing methods. """
		self.__model.fit(self.__x_train.values, self.__y_train.values)

		if self.__is_cross_val:
			self.__metrics['folds'] = self.__cross_val_folds
			self.__train_with_cross_validation()
		else:
			self.__train_without_cross_validation()

		self.__save_model()

	def __plot_all(self, y_test_pred):
		"""
		This method controls the visualization and metrics outputs.
		Comment out whatever you don't want to plot.
		"""
		self.__plot_correlation_matrix()
		# self.__plot_feature_vs_feature()
		self.__plot_feature_importance()
		self.__plot_classification_report(y_test_pred=y_test_pred)
		self.__plot_confusion_matrix(y_test_pred=y_test_pred)
		self.__plot_roc_curve(y_test_pred=y_test_pred)
		self.__plot_accuracies_and_errors()

	"""training & testing methods"""

	def __train_with_cross_validation(self):
		"""
		This method enables sk-learn algorithms to perform KFold-cross-validation.
		The method also initiates the cnvrg experiment with all its metrics.
		"""
		scores = cross_validate(estimator=self.__model,
								X=self.__x_train.values,
								y=self.__y_train.values,
								cv=self.__cross_val_folds,
								return_train_score=True,
								scoring=['neg_mean_squared_error', 'accuracy'],
								return_estimator=True)

		train_acc_cv = scores['train_accuracy']
		train_err_cv = (-1) * scores['train_neg_mean_squared_error']
		val_acc_cv = scores['test_accuracy']
		val_err_cv = (-1) * scores['test_neg_mean_squared_error']
		self.__model = scores['estimator'][-1]

		y_pred = self.__model.predict(self.__x_test.values)
		test_acc = accuracy_score(self.__y_test.values, y_pred)
		test_loss = zero_one_loss(self.__y_test.values, y_pred)
		self.__metrics.update({
			'train_acc': train_acc_cv,
			'train_loss': train_err_cv,
			'train_loss_type': 'MSE',
			'validation_acc': val_acc_cv,
			'validation_loss': val_err_cv,
			'validation_loss_type': 'MSE',
			'test_acc': test_acc,
			'test_loss': test_loss,
			'test_loss_type': 'zero_one_loss'
		})
		self.__plot_all(y_pred)

	def __train_without_cross_validation(self):
		"""
		The method also initiates the cnvrg experiment with all its metrics.
		"""
		y_hat = self.__model.predict(self.__x_train.values)  # y_hat, i.e. the training-set predictions

		train_acc = accuracy_score(self.__y_train, y_hat)
		train_loss = zero_one_loss(self.__y_train, y_hat)

		y_pred = self.__model.predict(self.__x_test.values)
		test_acc = accuracy_score(self.__y_test, y_pred)
		test_loss = zero_one_loss(self.__y_test, y_pred)
		self.__metrics.update({
			'train_acc': train_acc,
			'train_loss': train_loss,
			'train_loss_type': 'zero_one_loss',
			'test_acc': test_acc,
			'test_loss': test_loss,
			'test_loss_type': 'zero_one_loss'
		})
		self.__plot_all(y_pred)

	def __plot_feature_importance(self):
		try:
			importance = getattr(self.__model, "feature_importances_")
			if self.__testing_mode is False:
				self.__experiment.log_chart('Feature Importance', x_axis='Features', y_axis='Importance', data=Bar(x=self.__features, y=importance))
			else:
				print(importance)
		except AttributeError:
			pass

	def __plot_classification_report(self, y_test_pred):
		test_report = classification_report(self.__y_test, y_test_pred, output_dict=True)  # dict
		if self.__testing_mode is False:
			testing_report_as_array = self.__helper_plot_classification_report(test_report)
			self.__experiment.log_chart("Test Set - Classification Report", data=Heatmap(z=testing_report_as_array), y_ticks=self.__labels, x_ticks=["precision", "recall", "f1-score", "support"])
		else:
			print(test_report)

	def __plot_confusion_matrix(self, y_test_pred=None):
		""" Plots the confusion matrix. """
		if self.__y_test is not None and y_test_pred is not None:
			confusion_mat_test = confusion_matrix(self.__y_test, y_test_pred)  # array
			confusion_mat_test = SKTrainer.__helper_plot_confusion_matrix(confusion_mat_test)
			if self.__testing_mode is False:
				self.__experiment.log_chart("Test Set - confusion matrix", data=Heatmap(z=confusion_mat_test))
			else:
				print(confusion_mat_test)

	def __plot_roc_curve(self, y_test_pred):
		if len(set(self.__y_test)) != 2: return
		fpr, tpr, _ = roc_curve(self.__y_test, y_test_pred)
		if self.__testing_mode is False:
			self.__experiment.log_metric(key='ROC curve', Ys=tpr.tolist(), Xs=fpr.tolist())
		else: print("FPRs: {fpr}\nTPRs: {tpr}".format(fpr=fpr, tpr=tpr))

	def __plot_correlation_matrix(self):
		data = self.__all_data_concatenated
		correlation = data.corr()
		self.__experiment.log_chart("correlation", [MatrixHeatmap(np.round(correlation.values, 2))],
									x_ticks=correlation.index.tolist(), y_ticks=correlation.index.tolist())

	def __plot_feature_vs_feature(self):
		data = self.__all_data_concatenated
		indexes = data.select_dtypes(include=["number"]).columns
		corr = data.corr()
		for idx, i in enumerate(indexes):
			for jdx, j in enumerate(indexes):
				if i == j: continue
				if jdx < idx: continue
				corr_val = abs(corr[i][j])
				if corr_val == 1 or corr_val < 0.5: continue  # skip perfect or weak correlations
				droplines = data[[i, j]].notnull().all(1)
				x, y = data[droplines][[i, j]].values.transpose()
				self.__experiment.log_chart("{i}_against_{j}".format(i=i, j=j),
											[Scatterplot(x=x.tolist(), y=y.tolist())],
											title="{i} against {j}".format(i=i, j=j))

	def __plot_accuracies_and_errors(self):
		self.__plot_accuracies_and_errors_helper_rounding()
		if self.__testing_mode is True: self.__plot_accuracies_and_errors_helper_testing_mode()

		for p in ['model', 'test_acc', 'test_loss', 'test_loss_type', 'train set size', 'test set size', 'train_loss_type']:
			self.__experiment.log_param(p, self.__metrics[p])

		if self.__is_cross_val is True:
			self.__experiment.log_param("folds", self.__metrics['folds'])
			self.__experiment.log_param("validation_loss_type", self.__metrics['validation_loss_type'])
			metrics = ['train_acc', 'train_loss', 'validation_acc', 'validation_loss']
			for m in metrics: self.__experiment.log_metric(m, self.__metrics[m], grouping=[m] * len(self.__metrics[m]))
			return

		self.__experiment.log_param("train_acc", self.__metrics['train_acc'])
		self.__experiment.log_param("train_loss", self.__metrics['train_loss'])
		self.__experiment.log_param("train_loss_type", self.__metrics['train_loss_type'])

	def __save_model(self):
		output_model_name = self.__metrics['model']
		workdir = os.environ.get("CNVRG_WORKDIR")
		output_file_name = os.path.join(workdir, output_model_name) if workdir is not None else output_model_name
		with open(output_file_name, 'wb') as f:
			pickle.dump(self.__model, f)

	""" --- Helpers --- """

	@staticmethod
	def __helper_plot_confusion_matrix(confusion_matrix, digits_to_round=3):
		output = []
		for y in range(len(confusion_matrix)):
			for x in range(len(confusion_matrix[y])):
				output.append((x, y, round(float(confusion_matrix[x][y]), digits_to_round)))
		return output

	def __plot_accuracies_and_errors_helper_rounding(self, digits_to_round=3):
		for key in self.__metrics.keys():
			# Skip strings.
			if isinstance(self.__metrics[key], str):
				continue
			# Lists & arrays.
			elif isinstance(self.__metrics[key], (list, np.ndarray)):
				if isinstance(self.__metrics[key], np.ndarray):
					self.__metrics[key] = self.__metrics[key].tolist()
				for ind in range(len(self.__metrics[key])):
					self.__metrics[key][ind] = round(self.__metrics[key][ind], digits_to_round)
			# ints & floats.
			else:
				self.__metrics[key] = round(self.__metrics[key], digits_to_round)

	def __plot_accuracies_and_errors_helper_testing_mode(self):
		print("Model: {model}\n"
			  "train_acc={train_acc}\n"
			  "train_loss={train_loss}\n"
			  "test_acc={test_acc}\n"
			  "test_loss={test_loss}".format(
			model=self.__metrics['model'], train_acc=self.__metrics['train_acc'], train_loss=self.__metrics['train_loss'],
			test_acc=self.__metrics['test_acc'], test_loss=self.__metrics['test_loss']))
		if self.__is_cross_val is True:
			print("Folds: {folds}\n".format(folds=self.__metrics['folds']))

	def __helper_plot_classification_report(self, classification_report_dict, digits_to_round=3):
		""" Converts dictionary given by classification_report to list of lists. """
		rows = []
		for k, v in classification_report_dict.items():
			if k in self.__labels:
				rows.append(list(v.values()))
		values = []
		for y in range(len(rows)):
			for x in range(len(rows[y])):
				values.append((x, y, round(rows[y][x], digits_to_round)))
		return values
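Finally, a hedged usage sketch for the Example #8 trainer (data and names invented; Experiment() in the constructor assumes a cnvrg.io environment):

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)

trainer = SKTrainer(model=RandomForestClassifier(random_state=0),
                    train_set=(X_tr, y_tr),
                    test_set=(X_te, y_te),
                    output_model_name='classifier.sav',
                    testing_mode=False,
                    folds=5)
trainer.run()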