예제 #1
0
class TensorflowTrainer:
    """Train, evaluate and export an image-classification model.

    The model comes from ``ModelGenerator``; data is read through
    ``load_generator``. When a cnvrg ``Experiment`` can be created, run
    parameters, epoch timings and a confusion matrix are reported to it and
    the trained model is exported at the end of ``run``.
    """

    GRAYSCALE_CHANNELS, RGB_CHANNELS = 1, 3
    VERBOSE = 1
    WORKERS = 3
    fully_connected_layers = [1024, 512, 256]

    def __init__(self, arguments, model_name, base_model):
        """
        :param arguments: run arguments object. This class reads its
            image_height, image_width, image_color, data, data_test,
            test_size, batch_size, epochs, steps_per_epoch, loss, dropout,
            hidden_layer_activation, output_layer_activation, optimizer and
            output_model attributes.
        :param model_name: value reported under the 'Model' parameter.
        :param base_model: forwarded to ModelGenerator as ``base_model``.
        """
        self.__cnvrg_env = True
        self.__arguments = cast_input_types(arguments)
        self.__shape = (arguments.image_height, arguments.image_width)
        self.__classes = parse_classes(arguments.data)
        self.__channels = TensorflowTrainer.RGB_CHANNELS if arguments.image_color == 'rgb' \
            else TensorflowTrainer.GRAYSCALE_CHANNELS
        self.__model = ModelGenerator(
            base_model=base_model,
            num_of_classes=len(self.__classes),
            fully_connected_layers=TensorflowTrainer.fully_connected_layers,
            loss_function=arguments.loss,
            dropout=arguments.dropout,
            activation_hidden_layers=arguments.hidden_layer_activation,
            activation_output_layers=arguments.output_layer_activation,
            optimizer=arguments.optimizer).get_model()
        try:
            self.__experiment = Experiment()
        except cnvrg.modules.UserError:
            # No experiment available: every cnvrg-only step in run() is
            # skipped from now on.
            self.__cnvrg_env = False
        self.__metrics = {
            'tensorflow local version': tf.__version__,
            'GPUs found':
            len(tf.config.experimental.list_physical_devices('GPU')),
            'Model': model_name,
            'Classes list': self.__classes
        }

    def run(self):
        """Train, test, and (inside cnvrg only) report and export the model."""
        if self.__cnvrg_env:
            self.__plot_all(status='pre-training')  # using cnvrg.
        self.__train()
        self.__test()
        if self.__cnvrg_env:
            self.__plot_all()  # using cnvrg.
            self.__export_model()  # using cnvrg.

    def __plot_all(self, status='post-test'):
        """
        Report everything that belongs to the given stage.

        :param status: (String) 'pre-training' or 'post-test'. The post-test
            stage is silently skipped when no test data was supplied; any
            other value does nothing.
        """
        if status == 'pre-training':
            self.__plot_metrics(status='pre-training')
        elif status == 'post-test' and self.__arguments.data_test is not None:
            self.__plot_metrics(status='post-test')
            self.__plot_confusion_matrix(self.__labels, self.__predictions)

    def __train(self):
        """Fit the model and record the total and per-epoch training time."""
        train_generator, val_generator = load_generator(
            self.__arguments.data, self.__shape, self.__arguments.test_size,
            self.__arguments.image_color, self.__arguments.batch_size)

        # The same step count is used for training and validation.
        steps_per_epoch_training = self.__arguments.steps_per_epoch
        steps_per_epoch_validation = self.__arguments.steps_per_epoch

        start_time = time.time()
        time_callback = TimeHistory()

        print("---start training---")
        self.__model.fit(train_generator,
                         epochs=self.__arguments.epochs,
                         workers=multiprocessing.cpu_count() - 1,
                         verbose=TensorflowTrainer.VERBOSE,
                         steps_per_epoch=steps_per_epoch_training,
                         validation_data=val_generator,
                         validation_steps=steps_per_epoch_validation,
                         use_multiprocessing=True,
                         callbacks=[time_callback])
        print("---End training---")

        training_time = time.strftime("%H:%M:%S",
                                      time.gmtime(time.time() - start_time))
        self.__metrics['training_time'] = training_time

        if self.__cnvrg_env:
            self.__experiment.log_metric(
                key="Epoch Times",
                Ys=time_callback.times,
                Xs=list(range(1, self.__arguments.epochs + 1)),
                x_axis="Epoch",
                y_axis="Time (Seconds)")

    def __test(self):
        """
        Predict and evaluate on the test set, when one was supplied.

        Stores the predicted class indices, the true labels and the rounded
        'test_acc' / 'test_loss' metrics. Does nothing when
        ``arguments.data_test`` is None.
        """
        if self.__arguments.data_test is None:
            return
        test_gen = load_generator(self.__arguments.data_test,
                                  self.__shape,
                                  image_color=self.__arguments.image_color,
                                  batch_size=self.__arguments.batch_size,
                                  generate_test_set=True)
        self.__predictions = np.argmax(self.__model.predict(test_gen), axis=1)
        self.__labels = test_gen.classes

        # `steps` counts batches, not samples: one pass over the test set is
        # len(test_gen) batches, whereas test_gen.n is the sample count.
        try:
            steps_per_epoch_testing = len(test_gen)
        except TypeError:
            # Generator without a length: keep the previous sample-count value.
            steps_per_epoch_testing = test_gen.n
        test_loss, test_acc = self.__model.evaluate_generator(
            test_gen,
            workers=TensorflowTrainer.WORKERS,
            verbose=TensorflowTrainer.VERBOSE,
            steps=steps_per_epoch_testing)
        self.__metrics['test_acc'] = round(float(test_acc), 3)
        self.__metrics['test_loss'] = round(float(test_loss), 3)

    def __export_model(self):
        """Save the model (under CNVRG_WORKDIR when set) and export the labels."""
        workdir = os.environ.get("CNVRG_WORKDIR")
        if workdir is not None:
            output_file_name = workdir + "/" + self.__arguments.output_model
        else:
            output_file_name = self.__arguments.output_model
        self.__model.save(output_file_name)
        export_labels_dictionary_from_classes_list(self.__classes)

    # ---- Cnvrg metrics output ----

    def __plot_metrics(self, status='pre-training'):
        """
        Log the collected metrics to the cnvrg experiment as parameters.

        :param status: (String) 'pre-training' logs every metric except the
            test results; 'post-test' logs only 'test_acc' and 'test_loss'.
        :raises ValueError: for any other status.
        """
        test_keys = ('test_acc', 'test_loss')
        if status == 'pre-training':
            print('Plotting pre-training metrics:')
            selected = [k for k in self.__metrics if k not in test_keys]
        elif status == 'post-test':
            print('Plotting post-test metrics:')
            selected = [k for k in self.__metrics if k in test_keys]
        else:
            raise ValueError('Unrecognized status.')
        for key in selected:
            self.__experiment.log_param(key, self.__metrics[key])

    def __plot_confusion_matrix(self, labels, predictions):
        """ Plots the confusion matrix of labels vs. predictions as a heatmap. """
        confusion_mat_test = confusion_matrix(labels, predictions)  # array
        confusion_mat_test = TensorflowTrainer.__helper_plot_confusion_matrix(
            confusion_mat_test,
            mat_x_ticks=self.__classes,
            mat_y_ticks=self.__classes)
        self.__experiment.log_chart("confusion matrix",
                                    data=Heatmap(z=confusion_mat_test))

    @staticmethod
    def __helper_plot_confusion_matrix(confusion_matrix,
                                       mat_x_ticks=None,
                                       mat_y_ticks=None,
                                       digits_to_round=3):
        """
        Flatten a square matrix into (x, y, value) heatmap cells.

        :param confusion_matrix: the values in the matrix.
        :param mat_x_ticks, mat_y_ticks: ticks for the axis of the matrix;
            the numeric index is used when a tick list is None.
        :param digits_to_round: number of decimals kept for each value.
        :return: list of (x_tick, y_tick, value) tuples.
        """
        output = []
        for y in range(len(confusion_matrix)):
            for x in range(len(confusion_matrix[y])):
                x_val = x if mat_x_ticks is None else mat_x_ticks[x]
                y_val = y if mat_y_ticks is None else mat_y_ticks[y]
                # The value is read as [x][y], so the first index of the
                # matrix ends up on the x axis of the heatmap.
                cell = round(float(confusion_matrix[x][y]), digits_to_round)
                output.append((x_val, y_val, cell))
        return output
예제 #2
0
class SKTrainerRegression:
	"""Fit a regression estimator, compute its losses and report them to cnvrg.

	The estimator only needs fit / predict; coef_, intercept_ and
	feature_importances_ are used when the estimator exposes them.
	"""
	DIGITS_TO_ROUND = 3

	REGRESSION_TYPE = ['linear', 'logistic']

	def __init__(self, model, train_set, test_set, output_model_name, testing_mode, folds=None, regression_type=0):
		"""
		:param model: estimator exposing fit / predict.
		:param train_set: (x_train, y_train) pair; x_train must expose `.columns`.
		:param test_set: (x_test, y_test) pair.
		:param output_model_name: file name the fitted model is pickled to;
			also reported as the 'model' metric.
		:param testing_mode: when True, the metrics and feature importances
			are printed instead of being logged to the experiment.
		:param folds: cross-validation `cv` value, or None to skip cross-validation.
		:param regression_type: index into REGRESSION_TYPE.
		"""
		self.__model = model
		self.__x_train, self.__y_train = train_set
		self.__train_set_size = len(self.__y_train)
		self.__x_test, self.__y_test = test_set
		self.__test_set_size = len(self.__y_test)
		self.__testing_mode = testing_mode
		self.__cross_val_folds = folds
		self.__is_cross_val = (folds is not None)
		self.__features = list(self.__x_train.columns)
		self.__labels = [str(l) for l in list(set(self.__y_train).union(set(self.__y_test)))]
		self.__metrics = {'model': output_model_name}
		self.__y_pred = None
		self.__experiment = Experiment()
		self.__regression_type = SKTrainerRegression.REGRESSION_TYPE[regression_type]

		self.__coef, self.__intercept = None, None

	def run(self):
		"""Fit the model, evaluate it (with or without cross-validation) and pickle it."""
		self.__model.fit(self.__x_train, self.__y_train)

		# Not every estimator exposes coef_ / intercept_; those stay None.
		self.__coef = getattr(self.__model, 'coef_', None)
		self.__intercept = getattr(self.__model, 'intercept_', None)

		if self.__is_cross_val:
			self.__metrics['folds'] = self.__cross_val_folds
			self.__train_with_cross_validation()
		else:
			self.__train_without_cross_validation()
		self.__save_model()

	def __plot_all(self, y_test_pred):
		"""
		Runs the enabled reporting steps.

		:param y_test_pred: test-set predictions (currently unused by the enabled steps).
		"""
		self.__plot_accuracies_and_errors()
		# self.__plot_regression_function()
		self.__plot_feature_importance()
		self.__plot_correlation_matrix()
		# self.__plot_feature_vs_feature()

	def __train_with_cross_validation(self):
		"""
		This method enables sk-learn algorithms to perform KFold-cross-validation.
		The method also initiates the cnvrg experiment with all its metrics.
		"""
		# NOTE(review): 'accuracy' scoring and accuracy_score below presumably
		# only suit the 'logistic' type - confirm for continuous targets.
		scores = cross_validate(estimator=self.__model,
								X=self.__x_train,
								y=self.__y_train,
								cv=self.__cross_val_folds,
								return_train_score=True,
								scoring=['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2', 'accuracy'],
								return_estimator=True)

		# The neg_* scorers are negated back into positive losses.
		train_err_cv_mse = (-1) * scores['train_neg_mean_squared_error']
		train_err_cv_mae = (-1) * scores['train_neg_mean_absolute_error']
		train_err_cv_r2 = scores['train_r2']

		val_acc_cv = scores['test_accuracy']
		val_err_cv_mse = (-1) * scores['test_neg_mean_squared_error']
		val_err_cv_mae = (-1) * scores['test_neg_mean_absolute_error']
		val_err_cv_r2 = scores['test_r2']

		# Keep the estimator fitted on the last fold for testing and saving.
		self.__model = scores['estimator'][-1]
		self.__y_pred = self.__model.predict(self.__x_test)
		test_acc = accuracy_score(self.__y_test, self.__y_pred)
		test_loss = mean_squared_error(self.__y_test, self.__y_pred)
		self.__metrics.update({
			'train_loss_mae': train_err_cv_mae,
			'train_loss_mse': train_err_cv_mse,
			'train_loss_r2': train_err_cv_r2,
			'validation_acc': val_acc_cv,
			'val_loss_mae': val_err_cv_mae,
			'val_loss_mse': val_err_cv_mse,
			'val_loss_r2': val_err_cv_r2,
			'test_acc': test_acc,
			'test_loss_mse': test_loss})
		self.__plot_all(self.__y_pred)

	def __train_without_cross_validation(self):
		"""
		Computes MSE / MAE / R2 on the train and test sets for the already
		fitted model, then initiates the cnvrg experiment with all its metrics.
		"""
		y_hat = self.__model.predict(self.__x_train)  # y_hat is a.k.a y_pred

		train_loss_MSE = mean_squared_error(self.__y_train, y_hat)
		train_loss_MAE = mean_absolute_error(self.__y_train, y_hat)
		train_loss_R2 = r2_score(self.__y_train, y_hat)
		self.__y_pred = self.__model.predict(self.__x_test)
		test_loss_MSE = mean_squared_error(self.__y_test, self.__y_pred)
		test_loss_MAE = mean_absolute_error(self.__y_test, self.__y_pred)
		test_loss_R2 = r2_score(self.__y_test, self.__y_pred)
		self.__metrics.update({
			'train_loss_mae': train_loss_MAE,
			'train_loss_mse': train_loss_MSE,
			'train_loss_r2': train_loss_R2,
			'test_loss_mse': test_loss_MSE,
			'test_loss_mae': test_loss_MAE,
			'test_loss_r2': test_loss_R2})
		self.__plot_all(self.__y_pred)

	def __plot_regression_function(self):
		"""Logs the fitted line (linear) or the standard sigmoid (logistic) over [-100, 100]."""
		x = np.linspace(-100, 100, 200)
		if self.__regression_type == 'linear':
			# Only the first coefficient is plotted.
			a, b = self.__coef[0], self.__intercept
			y = a * x + b
		elif self.__regression_type == 'logistic':
			y = 1 / (1 + np.exp(-x))
		self.__experiment.log_metric(key="Regression Function", Xs=x.tolist(), Ys=y.tolist(), grouping=['regression line'] * len(x))

	def __plot_feature_importance(self):
		"""Logs (or prints, in testing mode) feature importances when the model has them."""
		importance = getattr(self.__model, "feature_importances_", None)
		if importance is None:
			# Estimator without feature importances: nothing to report.
			return
		if self.__testing_mode is False:
			self.__experiment.log_chart('Feature Importance', x_axis='Features', y_axis='Importance', data=Bar(x=self.__features, y=importance))
		else:
			print(importance)

	def __plot_accuracies_and_errors(self):
		"""
		Rounds the collected metrics, then prints them (testing mode) or logs
		them: list values through log_metric, everything else through log_param.
		"""
		self.__plot_accuracies_and_errors_helper()
		if self.__testing_mode is True:
			# Print whatever was collected; the metric names differ between
			# the cross-validation and the plain training paths.
			lines = ["Model: {model}".format(model=self.__metrics['model'])]
			lines.extend("{key}={value}".format(key=k, value=v)
						 for k, v in self.__metrics.items() if k not in ('model', 'folds'))
			print("\n".join(lines))
			if self.__is_cross_val is True:
				print("Folds: {folds}\n".format(folds=self.__metrics['folds']))
		else: # testing mode is off.
			for k, v in self.__metrics.items():
				if isinstance(v, list):
					self.__experiment.log_metric(k, v)
				else:
					self.__experiment.log_param(k, v)

	def __plot_accuracies_and_errors_helper(self):
		"""Rounds float metrics and turns per-fold score arrays into rounded lists."""
		digits = SKTrainerRegression.DIGITS_TO_ROUND
		for k, v in list(self.__metrics.items()):
			if isinstance(v, np.ndarray):
				# Per-fold scores arrive as arrays; lists are what gets logged as a series.
				self.__metrics[k] = np.round(v, digits).tolist()
			elif isinstance(v, float):
				self.__metrics[k] = round(v, digits)

	def __save_model(self):
		"""Pickles the model to the 'model' metric's file name, under CNVRG_WORKDIR when set."""
		output_model_name = self.__metrics['model']
		workdir = os.environ.get("CNVRG_WORKDIR")
		output_file_name = output_model_name if workdir is None else workdir + "/" + output_model_name
		with open(output_file_name, 'wb') as model_file:
			pickle.dump(self.__model, model_file)

	# ---- plots over the whole data set ----

	def __concat_all_data(self):
		"""Returns train and test rows stacked, with the target appended as a column."""
		features = pd.concat([self.__x_train, self.__x_test], axis=0)
		targets = pd.concat([self.__y_train, self.__y_test], axis=0)
		return pd.concat([features, targets], axis=1)

	def __plot_correlation_matrix(self):
		"""Logs the correlation matrix of all columns, rounded to 2 decimals."""
		data = self.__concat_all_data()
		correlation = data.corr()
		self.__experiment.log_chart("correlation", [MatrixHeatmap(np.round(correlation.values, 2))],
									x_ticks=correlation.index.tolist(), y_ticks=correlation.index.tolist())

	def __plot_feature_vs_feature(self):
		"""Logs a scatter chart for every numeric column pair with 0.5 <= |corr| < 1."""
		data = self.__concat_all_data()
		indexes = data.select_dtypes(include=["number"]).columns
		corr = data.corr()
		for idx, i in enumerate(indexes):
			for jdx, j in enumerate(indexes):
				# Visit each unordered pair once and skip the diagonal.
				if i == j or jdx < idx:
					continue
				corr_val = abs(corr[i][j])
				if 1 == corr_val or corr_val < 0.5:
					continue
				print("create", i, "against", j, "scatter chart")
				# Keep only the rows where both columns are present.
				droplines = data[[i, j]].notnull().all(1)
				x, y = data[droplines][[i, j]].values.transpose()
				self.__experiment.log_chart("{i}_against_{j}".format(i=i, j=j),
											[Scatterplot(x=x.tolist(), y=y.tolist())],
											title="{i} against {j}".format(i=i, j=j))
예제 #3
0
class TensorflowTrainer:
    """Train, evaluate and export an image-classification model.

    The model comes from ``ModelGenerator``; data is read through
    ``load_generator``. Results are logged to a cnvrg ``Experiment`` when one
    can be created, and printed to stdout otherwise.
    """

    GRAYSCALE_CHANNELS = 1
    RGB_CHANNELS = 3
    VERBOSE = 1
    WORKERS = 3
    fully_connected_layers = [1024, 512, 256]

    # Names of the metrics reported at each stage; commented-out names are
    # collected but not reported.
    METRICS = {
        'pre-training': [
            'TensorFlow version',
            'GPUs found',
            'Model',
            # 'Classes list'
        ],
        'post-training': [
            'training_time',
            # 'epochs_duration',
            # 'avg_time_per_epoch',
            # 'time_per_step'
        ],
        'post-test': ['test_acc', 'test_loss']
    }

    def __init__(self, arguments, model_name, base_model):
        """
        :param arguments: run arguments object. This class reads its
            image_height, image_width, image_color, data, data_test,
            test_size, batch_size, epochs, steps_per_epoch, verbose, loss,
            dropout, hidden_layer_activation, output_layer_activation,
            optimizer and output_model attributes.
        :param model_name: value reported under the 'Model' parameter.
        :param base_model: forwarded to ModelGenerator as ``base_model``.
        """
        self.__cnvrg_env = True
        self.__arguments = arguments
        self.__shape = (arguments.image_height, arguments.image_width)
        self.__classes = parse_classes(arguments.data)
        self.__channels = TensorflowTrainer.RGB_CHANNELS if arguments.image_color == 'rgb' \
            else TensorflowTrainer.GRAYSCALE_CHANNELS
        self.__model = ModelGenerator(
            base_model=base_model,
            num_of_classes=len(self.__classes),
            fully_connected_layers=TensorflowTrainer.fully_connected_layers,
            loss_function=arguments.loss,
            dropout=arguments.dropout,
            activation_hidden_layers=arguments.hidden_layer_activation,
            activation_output_layers=arguments.output_layer_activation,
            optimizer=arguments.optimizer).get_model()
        try:
            print("Trying to launch an experiment in cnvrg environment.")
            self.__experiment = Experiment()
        except Exception:
            # Deliberate best-effort: any failure here means "run without
            # cnvrg" and all reporting falls back to stdout.
            print("Not in cnvrg environment.")
            self.__cnvrg_env = False

        self.__metrics = {
            'TensorFlow version': tf.__version__,
            'GPUs found':
            len(tf.config.experimental.list_physical_devices('GPU')),
            'Model': model_name,
            'Classes list': self.__classes
        }

    def run(self):
        """Run the train / test / export pipeline, reporting after each stage."""
        self.__plot(status='pre-training')

        self.__train()
        self.__plot(status='post-training')

        self.__test()
        self.__plot(status='post-test')

        self.__export_model()

    def __plot(self, status):
        """
        Report everything that belongs to the given stage.

        :param status: 'pre-training', 'post-training' or 'post-test'. The
            post-test stage is skipped when no test data was supplied; any
            other value does nothing.
        """
        if status in ('pre-training', 'post-training'):
            self.__plot_metrics(status=status)
        elif status == 'post-test' and self.__arguments.data_test is not None:
            self.__plot_metrics(status='post-test')
            self.__plot_confusion_matrix(self.__labels, self.__predictions)

    def __train(self):
        """Fit the model and collect the timing metrics."""
        train_generator, val_generator = load_generator(
            self.__arguments.data,
            self.__shape,
            self.__arguments.test_size,  # test_size = validation_split
            self.__arguments.image_color,
            self.__arguments.batch_size)

        start_time = time.time()
        time_callback = TimeHistory()

        print("--- Starts Training ---")

        # Let PIL load truncated image files instead of raising on them.
        from PIL import ImageFile
        ImageFile.LOAD_TRUNCATED_IMAGES = True

        # Validation is switched off entirely when no validation split exists.
        use_validation = self.__arguments.test_size != 0.
        self.__model.fit(train_generator,
                         epochs=self.__arguments.epochs,
                         verbose=self.__arguments.verbose,
                         steps_per_epoch=self.__arguments.steps_per_epoch,
                         validation_data=val_generator
                         if use_validation else None,
                         validation_steps=self.__arguments.steps_per_epoch
                         if use_validation else None,
                         callbacks=[time_callback])

        print("--- Ends training ---")

        training_time = time.strftime("%H:%M:%S",
                                      time.gmtime(time.time() - start_time))
        epoch_times = time_callback.times
        self.__metrics['training_time'] = training_time
        self.__metrics['epochs_duration'] = Metric(key='Epochs Duration',
                                                   Ys=epoch_times,
                                                   Xs='from_1',
                                                   x_axis='epochs',
                                                   y_axis='time (seconds)')
        # Guard against a run that recorded no epochs (division by zero).
        if len(epoch_times):
            self.__metrics['avg_time_per_epoch'] = round(
                sum(epoch_times) / len(epoch_times), 3)

        steps = self.__arguments.steps_per_epoch
        if steps is not None:
            # Iterate the recorded times rather than range(epochs) so fewer
            # recorded epochs than requested cannot raise IndexError.
            self.__metrics['time_per_step'] = Metric(
                key='Time per Step',
                Ys=[round(epoch_time / steps, 3) for epoch_time in epoch_times],
                Xs='from_1',
                x_axis='epochs',
                y_axis='time (ms)/step')

    def __test(self):
        """
        Predict and evaluate on the test set, when one was supplied.

        Stores the predicted class indices, the true labels and the rounded
        'test_acc' / 'test_loss' metrics. Does nothing when
        ``arguments.data_test`` is None.
        """
        if self.__arguments.data_test is None:
            return
        test_gen = load_generator(self.__arguments.data_test,
                                  self.__shape,
                                  image_color=self.__arguments.image_color,
                                  batch_size=self.__arguments.batch_size,
                                  generate_test_set=True)
        self.__predictions = np.argmax(self.__model.predict(test_gen), axis=1)
        self.__labels = test_gen.classes

        # `steps` counts batches, not samples: one pass over the test set is
        # len(test_gen) batches, whereas test_gen.n is the sample count.
        try:
            steps_per_epoch_testing = len(test_gen)
        except TypeError:
            # Generator without a length: keep the previous sample-count value.
            steps_per_epoch_testing = test_gen.n
        test_loss, test_acc = self.__model.evaluate_generator(
            test_gen,
            workers=TensorflowTrainer.WORKERS,
            verbose=TensorflowTrainer.VERBOSE,
            steps=steps_per_epoch_testing)

        self.__metrics['test_acc'] = round(float(test_acc), 3)
        self.__metrics['test_loss'] = round(float(test_loss), 3)

    def __export_model(self):
        """Save the model (under CNVRG_WORKDIR when set) and export the labels."""
        workdir = os.environ.get("CNVRG_WORKDIR")
        if workdir is not None:
            output_file_name = workdir + "/" + self.__arguments.output_model
        else:
            output_file_name = self.__arguments.output_model
        self.__model.save(output_file_name)
        export_labels_dictionary_from_classes_list(self.__classes)

    # ============ Helpers ============

    def __log_param(self, key, value):
        """Log one parameter to cnvrg, or print it when running outside cnvrg."""
        if self.__cnvrg_env:
            self.__experiment.log_param(key, value)
        else:
            print("log_param -  {key} : {value}".format(key=key, value=value))

    def __log_series(self, metric):
        """Log one ``Metric`` series to cnvrg, or print it when running outside cnvrg."""
        if self.__cnvrg_env:
            self.__experiment.log_metric(key=metric.key,
                                         Ys=metric.Ys,
                                         Xs=metric.Xs,
                                         x_axis=metric.x_axis,
                                         y_axis=metric.y_axis)
        else:
            print(metric)

    def __plot_metrics(self, status):
        """
        Report the metrics listed in METRICS for the given stage.

        Metrics that have not been collected are skipped. In the
        post-training stage ``Metric`` values are logged as series; every
        other value is logged as a parameter.

        :param status: a key of TensorflowTrainer.METRICS.
        :raises ValueError: if status is not a known stage.
        """
        # Validate first: indexing METRICS directly would raise KeyError and
        # never reach the intended ValueError.
        if status not in TensorflowTrainer.METRICS:
            raise ValueError('Unrecognized status.')

        for name in TensorflowTrainer.METRICS[status]:
            if name not in self.__metrics:
                continue
            value = self.__metrics[name]
            if status == 'post-training' and isinstance(value, Metric):
                self.__log_series(value)
            else:
                self.__log_param(name, value)

    def __plot_confusion_matrix(self, labels, predictions):
        """ Plots the confusion matrix (printed when running outside cnvrg). """
        confusion_mat_test = confusion_matrix(labels, predictions)  # array
        confusion_mat_test = TensorflowTrainer.__helper_plot_confusion_matrix(
            confusion_mat_test,
            mat_x_ticks=self.__classes,
            mat_y_ticks=self.__classes)
        if self.__cnvrg_env:
            self.__experiment.log_chart("confusion matrix",
                                        data=Heatmap(z=confusion_mat_test))
        else:
            # No experiment object exists outside cnvrg.
            print(confusion_mat_test)

    @staticmethod
    def __helper_plot_confusion_matrix(confusion_matrix,
                                       mat_x_ticks=None,
                                       mat_y_ticks=None,
                                       digits_to_round=3):
        """
        Flatten a square matrix into (x, y, value) heatmap cells.

        :param confusion_matrix: the values in the matrix.
        :param mat_x_ticks, mat_y_ticks: ticks for the axis of the matrix;
            the numeric index is used when a tick list is None.
        :param digits_to_round: number of decimals kept for each value.
        :return: list of (x_tick, y_tick, value) tuples.
        """
        output = []
        for y in range(len(confusion_matrix)):
            for x in range(len(confusion_matrix[y])):
                x_val = x if mat_x_ticks is None else mat_x_ticks[x]
                y_val = y if mat_y_ticks is None else mat_y_ticks[y]
                # The value is read as [x][y], so the first index of the
                # matrix ends up on the x axis of the heatmap.
                cell = round(float(confusion_matrix[x][y]), digits_to_round)
                output.append((x_val, y_val, cell))
        return output
예제 #4
0
class SKTrainer:
	"""Fit a classifier, evaluate it on the test set and report the results to cnvrg.

	In testing mode the results are printed instead of being logged to the
	experiment.
	"""
	DIGITS_TO_ROUND = 3

	def __init__(self, model, train_set, test_set, output_model_name, testing_mode, folds=None):
		"""
		Stores the inputs, fits the model on the full train set and opens the experiment.

		:param model: estimator exposing fit / predict.
		:param train_set: (x_train, y_train) pair; x_train must expose `.columns`.
		:param test_set: (x_test, y_test) pair.
		:param output_model_name: file name the fitted model is pickled to.
		:param testing_mode: when True, results are printed rather than logged.
		:param folds: number of KFold splits, or None to skip cross-validation.
		"""
		self.__model = model
		self.__x_train, self.__y_train = train_set
		self.__x_test, self.__y_test = test_set
		self.__output_model_name = output_model_name
		self.__testing_mode = testing_mode
		self.__cross_val_folds = folds
		self.__is_cross_val = (folds is not None)
		self.__features = list(self.__x_train.columns)
		self.__labels = [str(l) for l in list(set(self.__y_train).union(set(self.__y_test)))]

		self.__model.fit(self.__x_train, self.__y_train)
		# Not every estimator exposes feature_importances_; None disables that plot.
		self.__importance = getattr(self.__model, 'feature_importances_', None)

		self.__experiment = Experiment()

		self.__metrics = {'model': self.__output_model_name}
		if self.__is_cross_val:
			self.__metrics['folds'] = self.__cross_val_folds

	def run(self):
		""" runs the training & testing methods. """
		if self.__is_cross_val is True:
			self.__train_with_cross_validation()
		else:
			self.__train_without_cross_validation()

	# ---- training & testing methods ----

	def __train_with_cross_validation(self):
		"""
		This method enables sk-learn algorithms to perform KFold-cross-validation.
		The method also initiates the cnvrg experiment with all its metrics.
		"""
		train_acc, train_loss = [], []
		kf = KFold(n_splits=self.__cross_val_folds)

		for train_index, val_index in kf.split(self.__x_train):
			X_train, X_val = self.__x_train.iloc[train_index, :], self.__x_train.iloc[val_index, :]
			y_train, y_val = self.__y_train.iloc[train_index], self.__y_train.iloc[val_index]
			self.__model = self.__model.fit(X_train, y_train)

			# Each fold is scored on its held-out part.
			y_hat = self.__model.predict(X_val)  # y_hat is a.k.a y_pred
			train_acc.append(accuracy_score(y_val, y_hat))
			train_loss.append(mean_squared_error(y_val, y_hat))

		# --- Testing (with the model fitted on the last fold).
		y_pred = self.__model.predict(self.__x_test)
		test_acc = accuracy_score(self.__y_test, y_pred)
		test_loss = mean_squared_error(self.__y_test, y_pred)
		self.__metrics.update({
			# The per-fold lists must be stored: the reporting step reads them.
			'train_acc': train_acc,
			'train_loss': train_loss,
			'test_acc': test_acc,
			'test_loss': test_loss
		})
		self.__plot_all(y_pred)

	def __train_without_cross_validation(self):
		"""
		Scores the model fitted in __init__ on the train and test sets.
		The method also initiates the cnvrg experiment with all its metrics.
		"""
		y_hat = self.__model.predict(self.__x_train)  # y_hat is a.k.a y_pred

		train_acc = accuracy_score(self.__y_train, y_hat)
		train_loss = mean_squared_error(self.__y_train, y_hat)

		y_pred = self.__model.predict(self.__x_test)
		test_acc = accuracy_score(self.__y_test, y_pred)
		test_loss = mean_squared_error(self.__y_test, y_pred)
		self.__metrics.update({
			'train_acc': train_acc,
			'train_loss': train_loss,
			'test_acc': test_acc,
			'test_loss': test_loss
		})
		self.__plot_all(y_pred)

	# ---- Plotting methods ----

	def __plot_feature_importance(self):
		"""Logs (or prints, in testing mode) feature importances when the model has them."""
		if self.__importance is None:
			return
		if self.__testing_mode is False:
			self.__experiment.log_chart('Feature Importance', x_axis='Features', y_axis='Importance', data=Bar(x=self.__features, y=self.__importance))
		else:
			print(self.__importance)

	def __plot_classification_report(self, y_test_pred):
		"""Logs the test-set classification report as a heatmap, or prints it in testing mode."""
		test_report = classification_report(self.__y_test, y_test_pred, output_dict=True)  # dict
		if self.__testing_mode is False:
			testing_report_as_array = self.__helper_plot_classification_report(test_report)
			self.__experiment.log_chart("Test Set - Classification Report",
										data=Heatmap(z=testing_report_as_array),
										y_ticks=self.__labels,
										x_ticks=["precision", "recall", "f1-score", "support"])
		else:
			print(test_report)

	def __helper_plot_classification_report(self, classification_report_dict):
		""" Converts dictionary given by classification_report to a list of (x, y, value) cells. """
		# Only the per-label rows are kept; other entries of the report are ignored.
		rows = [list(v.values()) for k, v in classification_report_dict.items() if k in self.__labels]
		values = []
		for y, row in enumerate(rows):
			for x, cell in enumerate(row):
				values.append((x, y, round(cell, SKTrainer.DIGITS_TO_ROUND)))
		return values

	def __plot_confusion_matrix(self, y_test_pred=None):
		"""Logs the test-set confusion matrix as a heatmap, or prints it in testing mode."""
		if self.__y_test is not None and y_test_pred is not None:
			confusion_mat_test = confusion_matrix(self.__y_test, y_test_pred)  # array
			confusion_mat_test = self.__helper_plot_confusion_matrix(confusion_mat_test)
			if self.__testing_mode is False:
				self.__experiment.log_chart("Test Set - confusion matrix", data=Heatmap(z=confusion_mat_test))
			else:
				print(confusion_mat_test)

	def __helper_plot_confusion_matrix(self, confusion_matrix):
		"""Flattens a square matrix into (x, y, value) cells; the value is read as [x][y]."""
		output = []
		for y in range(len(confusion_matrix)):
			for x in range(len(confusion_matrix[y])):
				output.append((x, y, round(float(confusion_matrix[x][y]), SKTrainer.DIGITS_TO_ROUND)))
		return output

	def __plot_roc_curve(self, y_test_pred):
		"""Logs the ROC curve; only for two labels and outside testing mode."""
		if len(self.__labels) != 2 or self.__testing_mode is True:
			return
		y_test = self.__y_test.tolist()
		y_test_pred = y_test_pred.tolist()
		FPRs, TPRs, _ = roc_curve(y_test, y_test_pred)
		self.__experiment.log_metric(key='ROC curve', Ys=TPRs.tolist(), Xs=FPRs.tolist())

	def __plot_pandas_analyzer(self):
		"""Hands the full data set (train + test, features + target) to PandasAnalyzer."""
		if self.__testing_mode is False:
			features = pd.concat([self.__x_train, self.__x_test], axis=0)
			targets = pd.concat([self.__y_train, self.__y_test], axis=0)
			data = pd.concat([features, targets], axis=1)
			PandasAnalyzer(data, experiment=self.__experiment)

	def __plot_accuracies_and_errors(self):
		"""
		Rounds the accuracy / loss metrics, then prints them (testing mode) or
		logs them. With cross-validation the train metrics are per-fold lists
		and go through log_metric; otherwise they are logged as parameters.
		"""
		self.__plot_accuracies_and_errors_helper()

		if self.__testing_mode is True:
			print("Model: {model}\n"
				  "train_acc={train_acc}\n"
				  "train_loss={train_loss}\n"
				  "test_acc={test_acc}\n"
				  "test_loss={test_loss}".format(
				model=self.__metrics['model'], train_acc=self.__metrics['train_acc'], train_loss=self.__metrics['train_loss'],
				test_acc=self.__metrics['test_acc'], test_loss=self.__metrics['test_loss']))
			if self.__is_cross_val is True:
				print("Folds: {folds}\n".format(folds=self.__metrics['folds']))

		else:  # testing_mode is False
			self.__experiment.log_param("model", self.__metrics['model'])
			self.__experiment.log_param("test_acc", self.__metrics['test_acc'])
			self.__experiment.log_param("test_loss", self.__metrics['test_loss'])
			if self.__is_cross_val is True:
				self.__experiment.log_param("folds", self.__metrics['folds'])
				self.__experiment.log_metric("train_acc", self.__metrics['train_acc'])
				self.__experiment.log_metric("train_loss", self.__metrics['train_loss'])
				return
			self.__experiment.log_param("train_acc", self.__metrics['train_acc'])
			self.__experiment.log_param("train_loss", self.__metrics['train_loss'])

	def __plot_accuracies_and_errors_helper(self):
		"""Rounds the accuracy / loss values in self.__metrics (element-wise for per-fold lists)."""
		digits = SKTrainer.DIGITS_TO_ROUND
		for key in ('train_acc', 'train_loss', 'test_acc', 'test_loss'):
			value = self.__metrics[key]
			if isinstance(value, list):
				self.__metrics[key] = [round(item, digits) for item in value]
			else:
				self.__metrics[key] = round(value, digits)

	def __plot_all(self, y_test_pred):
		"""
		Runs all the plotting methods, then saves the model.
		"""
		self.__plot_pandas_analyzer()
		self.__plot_feature_importance()
		self.__plot_classification_report(y_test_pred=y_test_pred)
		self.__plot_confusion_matrix(y_test_pred=y_test_pred)
		self.__plot_roc_curve(y_test_pred=y_test_pred)
		self.__plot_accuracies_and_errors()
		self.__save_model()

	# ---- technical methods ----

	def __save_model(self):
		"""Pickles the model (under CNVRG_PROJECT_PATH when set); lists that directory outside testing mode."""
		project_path = os.environ.get("CNVRG_PROJECT_PATH")
		if project_path is not None:
			output_file_name = project_path + "/" + self.__output_model_name
		else:
			output_file_name = self.__output_model_name
		with open(output_file_name, 'wb') as model_file:
			pickle.dump(self.__model, model_file)
		if not self.__testing_mode:
			os.system("ls -la {}".format(project_path))
예제 #5
0
class SKTrainer:
	"""
	Fits a scikit-learn style classifier, evaluates it and reports the results.

	run() fits the model, scores it (with or without cross-validation), emits
	the charts/metrics and pickles the fitted model. When testing_mode is True
	most reports are printed instead of being logged to the Experiment object.
	"""

	def __init__(self, model, train_set, test_set, output_model_name, testing_mode, folds=None):
		"""
		:param model: estimator exposing fit() and predict().
		:param train_set: pair (x_train, y_train) of pandas objects.
		:param test_set: pair (x_test, y_test) of pandas objects.
		:param output_model_name: file name the fitted model is pickled to.
		:param testing_mode: when True, most reports are printed rather than logged.
		:param folds: value passed as `cv` to cross_validate; None disables cross-validation.
		"""
		self.__model = model
		self.__x_train, self.__y_train = train_set
		self.__x_test, self.__y_test = test_set

		# Features and targets of both splits, side by side; used for the correlation plots.
		all_features = pd.concat([self.__x_train, self.__x_test], axis=0)
		all_targets = pd.concat([self.__y_train, self.__y_test], axis=0)
		self.__all_data_concatenated = pd.concat([all_features, all_targets], axis=1)

		self.__testing_mode = testing_mode
		self.__cross_val_folds = folds
		self.__is_cross_val = folds is not None
		self.__features = list(self.__x_train.columns)
		self.__labels = [str(label) for label in set(self.__y_train).union(set(self.__y_test))]
		self.__metrics = {
			'model': output_model_name,
			'train set size': len(self.__y_train),
			'test set size': len(self.__y_test),
		}
		self.__experiment = Experiment()

	def run(self):
		""" Runs the training & testing methods, then saves the model. """
		self.__model.fit(self.__x_train.values, self.__y_train.values)

		if self.__is_cross_val:
			self.__metrics['folds'] = self.__cross_val_folds
			self.__train_with_cross_validation()
		else:
			self.__train_without_cross_validation()

		self.__save_model()

	def __plot_all(self, y_test_pred):
		"""
		Controls the visualization and metrics outputs.
		Comment out any call you do not want to plot.

		:param y_test_pred: predictions for the test set.
		"""
		self.__plot_correlation_matrix()
		# The feature-vs-feature scatter plots are disabled by default:
		# self.__plot_feature_vs_feature()
		self.__plot_feature_importance()
		self.__plot_classification_report(y_test_pred=y_test_pred)
		self.__plot_confusion_matrix(y_test_pred=y_test_pred)
		self.__plot_roc_curve(y_test_pred=y_test_pred)
		self.__plot_accuracies_and_errors()

	# --- training & testing methods ---

	def __train_with_cross_validation(self):
		"""
		Runs cross-validation on the train set, scores the last fold's estimator
		on the test set, stores all metrics and triggers the plots.
		"""
		scores = cross_validate(
			estimator=self.__model,
			X=self.__x_train.values,
			y=self.__y_train.values,
			cv=self.__cross_val_folds,
			return_train_score=True,
			scoring=['neg_mean_squared_error', 'accuracy'],
			return_estimator=True,
		)

		# The estimator fitted on the last fold replaces the model fitted in run().
		self.__model = scores['estimator'][-1]

		y_pred = self.__model.predict(self.__x_test.values)
		self.__metrics.update({
			'train_acc': scores['train_accuracy'],
			# The scorer returns negated MSE, so flip the sign back.
			'train_loss': (-1) * scores['train_neg_mean_squared_error'],
			'train_loss_type': 'MSE',
			'validation_acc': scores['test_accuracy'],
			'validation_loss': (-1) * scores['test_neg_mean_squared_error'],
			'validation_loss_type': 'MSE',
			'test_acc': accuracy_score(self.__y_test.values, y_pred),
			'test_loss': zero_one_loss(self.__y_test.values, y_pred),
			'test_loss_type': 'zero_one_loss'
		})
		self.__plot_all(y_pred)

	def __train_without_cross_validation(self):
		"""
		Scores the already fitted model on the train and test sets, stores the
		metrics and triggers the plots.
		"""
		y_hat = self.__model.predict(self.__x_train.values)  # train-set predictions
		y_pred = self.__model.predict(self.__x_test.values)  # test-set predictions

		self.__metrics.update({
			'train_acc': accuracy_score(self.__y_train, y_hat),
			'train_loss': zero_one_loss(self.__y_train, y_hat),
			'train_loss_type': 'zero_one_loss',
			'test_acc': accuracy_score(self.__y_test, y_pred),
			'test_loss': zero_one_loss(self.__y_test, y_pred),
			'test_loss_type': 'zero_one_loss'
		})
		self.__plot_all(y_pred)

	# --- plotting methods ---

	def __plot_feature_importance(self):
		""" Reports the model's feature importances; does nothing if the model has none. """
		# Only the attribute lookup is guarded, so a failure inside the
		# reporting call itself is no longer silently swallowed.
		try:
			importance = getattr(self.__model, "feature_importances_")
		except AttributeError:
			return

		if self.__testing_mode is False:
			self.__experiment.log_chart('Feature Importance', x_axis='Features', y_axis='Importance', data=Bar(x=self.__features, y=importance))
		else:
			print(importance)

	def __plot_classification_report(self, y_test_pred):
		""" Reports the classification report of the test set as a heatmap (or prints it). """
		test_report = classification_report(self.__y_test, y_test_pred, output_dict=True)  # dict
		if self.__testing_mode is False:
			testing_report_as_array = self.__helper_plot_classification_report(test_report)
			self.__experiment.log_chart("Test Set - Classification Report", data=Heatmap(z=testing_report_as_array), y_ticks=self.__labels, x_ticks=["precision", "recall", "f1-score", "support"])
		else:
			print(test_report)

	def __plot_confusion_matrix(self, y_test_pred=None):
		""" Plots the confusion matrix of the test set; skipped when inputs are missing. """
		if self.__y_test is None or y_test_pred is None:
			return

		confusion_mat_test = confusion_matrix(self.__y_test, y_test_pred)  # array
		confusion_mat_test = SKTrainer.__helper_plot_confusion_matrix(confusion_mat_test)
		if self.__testing_mode is False:
			self.__experiment.log_chart("Test Set - confusion matrix", data=Heatmap(z=confusion_mat_test))
		else:
			print(confusion_mat_test)

	def __plot_roc_curve(self, y_test_pred):
		""" Reports the ROC curve; only done when the test set holds exactly two classes. """
		if len(set(self.__y_test)) != 2:
			return

		fpr, tpr, _ = roc_curve(self.__y_test, y_test_pred)
		if self.__testing_mode is False:
			self.__experiment.log_metric(key='ROC curve', Ys=tpr.tolist(), Xs=fpr.tolist())
		else:
			print("FPRs: {fpr}\nTPRs: {tpr}".format(fpr=fpr, tpr=tpr))

	def __plot_correlation_matrix(self):
		""" Logs the correlation matrix of all features and the target. """
		correlation = self.__all_data_concatenated.corr()
		ticks = correlation.index.tolist()
		self.__experiment.log_chart("correlation", [MatrixHeatmap(np.round(correlation.values, 2))], x_ticks=ticks, y_ticks=ticks)

	def __plot_feature_vs_feature(self):
		"""
		Logs a scatter plot for every pair of numeric columns whose absolute
		correlation lies in [0.5, 1).
		"""
		data = self.__all_data_concatenated
		indexes = data.select_dtypes(include=["number"]).columns
		corr = data.corr()
		for idx, i in enumerate(indexes):
			# Only look at columns after i, so each pair is visited once.
			for j in indexes[idx + 1:]:
				if i == j:
					continue
				corr_val = abs(corr[i][j])
				if 1 == corr_val or corr_val < 0.5:
					continue
				# Keep only the rows where both columns hold a value.
				droplines = data[[i, j]].notnull().all(1)
				x, y = data[droplines][[i, j]].values.transpose()
				self.__experiment.log_chart(
					"{i}_against_{j}".format(i=i, j=j),
					[Scatterplot(x=x.tolist(), y=y.tolist())],
					title="{i} against {j}".format(i=i, j=j))

	def __plot_accuracies_and_errors(self):
		""" Rounds the metrics, prints them in testing mode and logs them to the experiment. """
		self.__plot_accuracies_and_errors_helper_rounding()
		# NOTE(review): in testing mode the metrics are printed AND still logged
		# below, unlike the other plot methods - confirm this is intended.
		if self.__testing_mode is True:
			self.__plot_accuracies_and_errors_helper_testing_mode()

		for p in ['model', 'test_acc', 'test_loss', 'test_loss_type', 'train set size', 'test set size', 'train_loss_type']:
			self.__experiment.log_param(p, self.__metrics[p])

		if self.__is_cross_val is True:
			self.__experiment.log_param("folds", self.__metrics['folds'])
			self.__experiment.log_param("validation_loss_type", self.__metrics['validation_loss_type'])
			# Per-fold values are logged as metric series, one group per metric.
			for m in ['train_acc', 'train_loss', 'validation_acc', 'validation_loss']:
				self.__experiment.log_metric(m, self.__metrics[m], grouping=[m] * len(self.__metrics[m]))
			return

		# 'train_loss_type' was already logged by the loop above.
		self.__experiment.log_param("train_acc", self.__metrics['train_acc'])
		self.__experiment.log_param("train_loss", self.__metrics['train_loss'])

	def __save_model(self):
		"""
		Pickles the model to the file named by the 'model' metric, placed under
		the CNVRG_WORKDIR directory when that environment variable is set.
		"""
		output_model_name = self.__metrics['model']
		workdir = os.environ.get("CNVRG_WORKDIR")
		output_file_name = output_model_name if workdir is None else workdir + "/" + output_model_name
		# The context manager closes the file even if pickling fails.
		with open(output_file_name, 'wb') as model_file:
			pickle.dump(self.__model, model_file)

	# --- Helpers ---

	@staticmethod
	def __helper_plot_confusion_matrix(confusion_matrix, digits_to_round=3):
		"""
		Flattens a 2-D matrix into (x, y, value) tuples, row by row.

		:param confusion_matrix: 2-D sequence/array; y is the row index, x the column index.
		:param digits_to_round: number of decimals kept for each value.
		:return: list of (column, row, rounded float) tuples.
		"""
		return [
			(x, y, round(float(value), digits_to_round))
			for y, row in enumerate(confusion_matrix)
			for x, value in enumerate(row)
		]

	def __plot_accuracies_and_errors_helper_rounding(self, digits_to_round=3):
		"""
		Rounds every non-string value of self.__metrics; arrays are converted
		to lists and rounded element by element.
		"""
		for key, value in list(self.__metrics.items()):
			if isinstance(value, str):
				continue
			if isinstance(value, np.ndarray):
				value = value.tolist()
			if isinstance(value, list):
				self.__metrics[key] = [round(item, digits_to_round) for item in value]
			else:
				self.__metrics[key] = round(value, digits_to_round)

	def __plot_accuracies_and_errors_helper_testing_mode(self, digits_to_round=3):
		""" Prints the model name and the train/test metrics (and the folds when cross-validating). """
		print("Model: {model}\n"
			"train_acc={train_acc}\n"
			"train_loss={train_loss}\n"
			"test_acc={test_acc}\n"
			"test_loss={test_loss}".format(
				model=self.__metrics['model'], train_acc=self.__metrics['train_acc'], train_loss=self.__metrics['train_loss'],
				test_acc=self.__metrics['test_acc'], test_loss=self.__metrics['test_loss']))
		if self.__is_cross_val is True:
			print("Folds: {folds}\n".format(folds=self.__metrics['folds']))

	def __helper_plot_classification_report(self, classification_report_dict, digits_to_round=3):
		"""
		Converts the dictionary given by classification_report to a list of
		(x, y, value) tuples; only entries whose key is a known label are kept.
		"""
		rows = [
			list(scores.values())
			for label, scores in classification_report_dict.items()
			if label in self.__labels
		]
		return [
			(x, y, round(value, digits_to_round))
			for y, row in enumerate(rows)
			for x, value in enumerate(row)
		]
예제 #6
0
class CSVProcessor:
    """
    Pre-processes a CSV data set and writes the result to a new CSV file.

    run() handles missing values, one-hot encodes, scales and min-max
    normalizes the requested columns, re-attaches the target column and saves
    the frame. Reporting to cnvrg is skipped when no Experiment can be created.
    """

    def __init__(self,
                 path_to_csv,
                 target_column=None,
                 missing_dict=None,
                 scale_dict=None,
                 normalize_list=None,
                 one_hot_list=None,
                 output_name=None,
                 plot_vis=False):
        """
        :param path_to_csv: string, CSV to read (its first column is used as index).
        :param target_column: string, target column name; defaults to the last column.
        :param missing_dict: dict, or its "[[column, task], ...]" string form,
            mapping a column (or 'all') to a missing-value task.
        :param scale_dict: dict, or its "[[column, min, max], ...]" string form,
            mapping a column (or 'all') to a target range.
        :param normalize_list: list, or its "[a, b]" string form, of columns to
            normalize; ['all'] selects every numeric column.
        :param one_hot_list: list, or its string form, of columns to one-hot encode.
        :param output_name: string, output path; defaults to "<input>_processed.csv".
        :param plot_vis: bool, whether the correlation matrix is logged.
        """
        self.__cnvrg_env = True  # Turned False when no Experiment is available (e.g. local testing).
        self.__data = pd.read_csv(path_to_csv, index_col=0)

        # The target is kept aside as (name, values) and re-attached before saving.
        target_name = target_column if target_column is not None else self.__data.columns[-1]
        self.__target_column = (target_name, self.__data[target_name])
        self.__features = [f for f in self.__data.columns if f != target_name]
        self.__data = self.__data[self.__features]  # removes the target column.

        try:
            self.__experiment = Experiment()
        except cnvrg.modules.errors.UserError:
            self.__cnvrg_env = False

        self.__normalize_list = CSVProcessor.__parse_list(normalize_list) \
            if isinstance(normalize_list, str) else normalize_list
        self.__one_hot_list = CSVProcessor.__parse_list(one_hot_list) \
            if isinstance(one_hot_list, str) else one_hot_list
        self.__output_name = output_name if output_name is not None \
            else path_to_csv.split('.csv')[0] + '_processed.csv'
        self.__plot_vis = plot_vis

        # String inputs are given as a list of lists and parsed into dictionaries.
        self.__scale_dict = CSVProcessor.__parse_2d_list(scale_dict) \
            if isinstance(scale_dict, str) else scale_dict
        self.__missing_dict = CSVProcessor.__parse_2d_list(missing_dict) \
            if isinstance(missing_dict, str) else missing_dict

    def run(self):
        """Run the whole pipeline and save the processed CSV."""
        self.__handle_missing()
        self.__one_hot_encoding_aka_dummy()
        self.__scale()
        self.__normalize()
        self.__set_target_column()
        self.__save()
        if self.__cnvrg_env:
            self.__plot_metrics()  # using cnvrg.
            self.__plot_visualization(plot_correlation=True)  # using cnvrg.
        self.__check_nulls_before_output()

    def __scale(self):
        """
        Linearly maps each requested column from its own [min, max] onto the
        target range given in the scale dictionary.

        A dictionary holding only the key 'all' applies that single range to
        every numeric column currently in the data. Constant columns (zero
        range) are set to the lower target bound instead of dividing by zero.
        """
        if self.__scale_dict is None:
            return

        # {'all'} - not set('all'), which would be the character set {'a', 'l'}.
        scale_all = set(self.__scale_dict) == {'all'}
        if scale_all:
            # Use the live columns: one-hot encoding may have replaced some features.
            columns_to_scale = list(self.__data.select_dtypes(include='number').columns)
        else:
            columns_to_scale = list(self.__scale_dict)

        for col in columns_to_scale:
            target = self.__scale_dict['all'] if scale_all else self.__scale_dict[col]
            t_min, t_max = CSVProcessor.__scale_helper(target)
            r_min, r_max = self.__data[col].min(), self.__data[col].max()
            shifted = self.__data[col] - r_min
            if r_max == r_min:
                self.__data[col] = shifted + t_min
            else:
                self.__data[col] = (shifted / (r_max - r_min)) * (t_max - t_min) + t_min

    def __normalize(self):
        """
        Min-max normalizes the requested columns to [0, 1].

        ['all'] selects every numeric column currently in the data. Constant
        columns become 0 instead of dividing by zero.
        """
        if self.__normalize_list is None:
            return

        if set(self.__normalize_list) == {'all'}:
            columns_to_normalize = list(self.__data.select_dtypes(include='number').columns)
        else:
            columns_to_normalize = self.__normalize_list

        for col in columns_to_normalize:
            min_range, max_range = self.__data[col].min(), self.__data[col].max()
            span = max_range - min_range
            shifted = self.__data[col] - min_range
            self.__data[col] = shifted if span == 0 else shifted / span

    def __one_hot_encoding_aka_dummy(self):
        """
        Replaces the columns of the one-hot list by dummy/indicator columns.
        """
        if self.__one_hot_list is not None:
            self.__data = pd.get_dummies(self.__data, columns=self.__one_hot_list)

    def __handle_missing(self):
        """
        Applies the missing-value task of each column (or of 'all' columns).

        Options:
        1) fill_X (fill with value X)
        2) drop (drop the rows where the column is missing)
        3) avg (fill with the column mean)
        4) med (fill with the column median)
        5) randint_A_B (fill with one value drawn by np.random.randint(A, B))

        :raises ValueError: when the task is none of the above.
        """
        if self.__missing_dict is None:
            return

        handle_all = set(self.__missing_dict) == {'all'}
        task_all = self.__missing_dict['all'] if handle_all else None
        column_to_handle = self.__features if handle_all else list(self.__missing_dict)

        for col in column_to_handle:
            task = task_all if task_all is not None else self.__missing_dict[col]
            if task.startswith('fill_'):
                value = CSVProcessor.__to_number(task[len('fill_'):])
                self.__data[col] = self.__data[col].fillna(value)
            elif task.startswith('drop'):
                self.__data = self.__data[self.__data[col].notna()]
            elif task.startswith('avg'):
                self.__data[col] = self.__data[col].fillna(self.__data[col].mean())
            elif task.startswith('med'):
                self.__data[col] = self.__data[col].fillna(self.__data[col].median())
            elif task.startswith('randint_'):
                a, b = task[len('randint_'):].split('_')
                a, b = CSVProcessor.__to_number(a), CSVProcessor.__to_number(b)
                # A single value is drawn and used for every missing cell of the column.
                self.__data[col] = self.__data[col].fillna(np.random.randint(a, b))
            else:
                raise ValueError('Missing Values Handling - Undefined task.')

    def __set_target_column(self):
        """Re-attaches the target column that was set aside in the constructor."""
        self.__data[self.__target_column[0]] = self.__target_column[1]

    def __plot_metrics(self):
        """Logs the output file name to the experiment."""
        self.__experiment.log_param("output_file", self.__output_name)

    def __plot_visualization(self, plot_correlation=True):
        """Logs the enabled visualizations; does nothing when plot_vis is False."""
        if self.__plot_vis is False:
            return

        # Tasks:
        if plot_correlation:
            self.__plot_correlation_matrix()

    def __save(self):
        """Writes the processed data to the output CSV."""
        self.__data.to_csv(self.__output_name)

    def __check_nulls_before_output(self):
        """
        Collects the columns that still hold null values.

        The result is currently unused: the warning that consumed it is
        disabled, so this method has no visible effect besides the delay.
        """
        # NOTE(review): the purpose of this fixed delay is not evident from
        # this file - confirm before removing it.
        time.sleep(8)
        nulls_report = dict(self.__data.isnull().sum())
        features_with_null_values = [
            k for k, v in nulls_report.items() if v != 0
        ]
        # NOTE(review): features_with_null_values is where a user warning
        # about remaining nulls would be raised; it is intentionally silent now.
        return

    # -------------------
    # ----- Helpers -----
    # -------------------

    @staticmethod
    def __to_number(text):
        """Converts a numeric string to float (if it holds a '.') or int; non-strings pass through."""
        if not isinstance(text, str):
            return text
        return float(text) if '.' in text else int(text)

    @staticmethod
    def __parse_2d_list(as_string):
        """
        Parses "[[key, a, b], [key, c], ...]" into a dictionary.

        Three-or-more item sub-lists map key -> (a, b) (scaling ranges);
        two-item sub-lists map key -> c (missing-value tasks).
        """
        final_dict = {}
        trimmed = as_string.replace(' ', '')
        # Positions of the commas that separate sub-lists ("],["), framed by the
        # outer brackets. The range stops one short so trimmed[i + 1] is always valid.
        commans_idxs = [0] + [
            i for i in range(1, len(trimmed) - 1)
            if trimmed[i] == ',' and trimmed[i - 1] == ']' and trimmed[i + 1] == '['
        ] + [len(trimmed) - 1]
        sub_lists = [
            trimmed[commans_idxs[i - 1] + 1:commans_idxs[i]]
            for i in range(1, len(commans_idxs))
        ]

        for sub_list in sub_lists:
            parsed = CSVProcessor.__parse_list(sub_list)
            if len(parsed) >= 3:
                final_dict[parsed[0]] = (parsed[1], parsed[2])  # for scaling.
            else:
                final_dict[parsed[0]] = parsed[1]  # for filling empty values.

        return final_dict

    @staticmethod
    def __parse_list(list_as_string):
        """
        Parses "[a, b, c]" into a list of stripped strings, or into a list of
        ints when every item is an integer (column numbers).
        """
        if list_as_string == '[]':
            return []

        items = [
            item.strip()
            for item in list_as_string.strip()[1:-1].split(',')
        ]

        # Check if the values are columns numbers.
        try:
            return [int(item) for item in items]
        except ValueError:
            return items

    @staticmethod
    def __parse_dict(dict_as_string):
        """
        Parses a dictionary literal whose values are "a:b" strings into
        {stripped key: [a, b]}.

        :raises TypeError: when the evaluated string is not a dictionary.
        """
        if dict_as_string == '{}':
            return {}
        # SECURITY NOTE(review): eval() executes arbitrary code; only safe if
        # dict_as_string never comes from an untrusted source - consider
        # ast.literal_eval.
        parsed_dict = eval(dict_as_string)
        if not isinstance(parsed_dict, dict):
            raise TypeError('Given a {} instead of dictionary.'.format(
                type(parsed_dict)))
        final_key = {}
        for key, value in parsed_dict.items():
            final_key[key.strip()] = value.split(':')
        return final_key

    @staticmethod
    def __scale_helper(value):
        """
        Extracts the (min, max) target range from "min:max" or from a
        two-item sequence of numbers / numeric strings.
        """
        # Explicit branches: the one-line form parsed as
        # ((split if str else value[0]), value[1]) and broke string input.
        if isinstance(value, str):
            min_val, max_val = value.split(':')
        else:
            min_val, max_val = value[0], value[1]
        return CSVProcessor.__to_number(min_val), CSVProcessor.__to_number(max_val)

    def __plot_correlation_matrix(self, digits_to_round=3):
        """Logs the rounded correlation matrix of the processed data."""
        correlation = self.__data.corr()
        ticks = correlation.index.tolist()
        self.__experiment.log_chart(
            "Correlation",
            [MatrixHeatmap(np.round(correlation.values, digits_to_round))],
            x_ticks=ticks,
            y_ticks=ticks)