def __init__(self, arguments, model_name, base_model):
    self.__cnvrg_env = True
    self.__arguments = cast_input_types(arguments)
    self.__shape = (arguments.image_height, arguments.image_width)
    self.__classes = parse_classes(arguments.data)
    self.__channels = TensorflowTrainer.RGB_CHANNELS if arguments.image_color == 'rgb' \
        else TensorflowTrainer.GRAYSCALE_CHANNELS
    self.__model = ModelGenerator(
        base_model=base_model,
        num_of_classes=len(self.__classes),
        fully_connected_layers=TensorflowTrainer.fully_connected_layers,
        loss_function=arguments.loss,
        dropout=arguments.dropout,
        activation_hidden_layers=arguments.hidden_layer_activation,
        activation_output_layers=arguments.output_layer_activation,
        optimizer=arguments.optimizer).get_model()
    try:
        self.__experiment = Experiment()
    except cnvrg.modules.UserError:
        self.__cnvrg_env = False
    self.__metrics = {
        'tensorflow local version': tf.__version__,
        'GPUs found': len(tf.config.experimental.list_physical_devices('GPU')),
        'Model': model_name,
        'Classes list': self.__classes
    }
def log_trial_result(self, iteration, trial, result):
    e = CNVRGExperiment(self._cnvrg_experiments[trial.trial_id])
    e.log(str(result))
    if self._cnvrg_metrics == []:
        self._cnvrg_metrics = [key for key in result]
    training_iteration = result['training_iteration']
    for key in self._cnvrg_metrics:
        try:
            value = float(result[key])
        except (ValueError, TypeError):
            continue
        e.log_metric(key, value, training_iteration)
def __init__(self, model, train_set, test_set, output_model_name, testing_mode, folds=None):
    self.__model = model
    self.__x_train, self.__y_train = train_set
    self.__x_test, self.__y_test = test_set
    self.__all_data_concatenated = pd.concat(
        [pd.concat([self.__x_train, self.__x_test], axis=0),
         pd.concat([self.__y_train, self.__y_test], axis=0)], axis=1)
    self.__testing_mode = testing_mode
    self.__cross_val_folds = folds
    self.__is_cross_val = (folds is not None)
    self.__features = list(self.__x_train.columns)
    self.__labels = [str(l) for l in list(set(self.__y_train).union(set(self.__y_test)))]
    self.__metrics = {'model': output_model_name,
                      'train set size': len(self.__y_train),
                      'test set size': len(self.__y_test)}
    self.__experiment = Experiment()
def __init__(self, path_to_csv, target_column=None, missing_dict=None, scale_dict=None,
             normalize_list=None, one_hot_list=None, output_name=None, plot_vis=False):
    """
    :param path_to_csv: string
    :param target_column: string
    :param missing_dict: dict
    :param scale_dict: dict
    :param normalize_list: list
    :param one_hot_list: list
    :param output_name: string
    """
    self.__cnvrg_env = True  ### When testing locally, it is turned False.
    self.__data = pd.read_csv(path_to_csv, index_col=0)
    self.__target_column = (target_column, self.__data[target_column]) if target_column is not None \
        else (self.__data.columns[-1], self.__data[self.__data.columns[-1]])
    self.__features = [f for f in list(self.__data.columns) if f != self.__target_column[0]]
    self.__data = self.__data[self.__features]  # removes the target column.
    try:
        self.__experiment = Experiment()
    except cnvrg.modules.errors.UserError:
        self.__cnvrg_env = False
    self.__normalize_list = CSVProcessor.__parse_list(normalize_list) \
        if isinstance(normalize_list, str) else normalize_list
    self.__one_hot_list = CSVProcessor.__parse_list(one_hot_list) \
        if isinstance(one_hot_list, str) else one_hot_list
    self.__output_name = output_name if output_name is not None \
        else path_to_csv.split('.csv')[0] + '_processed.csv'
    self.__plot_vis = plot_vis
    ### changed to list of lists instead of dictionary:
    self.__scale_dict = CSVProcessor.__parse_2d_list(scale_dict) \
        if isinstance(scale_dict, str) else scale_dict
    self.__missing_dict = CSVProcessor.__parse_2d_list(missing_dict) \
        if isinstance(missing_dict, str) else missing_dict
def __init__(self, input, to, template, inplace, allow_errors):
    self.__cnvrg_env = True  # When testing locally, it is turned False.
    self.input = input
    self.to = to
    self.template = template
    self.inplace = inplace
    self.allow_errors = allow_errors
    try:
        self.__experiment = Experiment()
    except:
        self.__cnvrg_env = False
    if self.__cnvrg_env:
        self.__experiment.log_param("template", template)
def lgbm_reg_cnvrg_api(experiment, artifacts_path, metrics):
    global experiment_file_path

    # Type handling when saving json (numpy types); np.integer / np.floating
    # cover the fixed-width numpy integer and float types.
    def default(o):
        if isinstance(o, (int, np.integer)):
            return int(o)
        if isinstance(o, (float, np.floating)):
            return float(o)
        raise TypeError

    experiment_ix = experiment.get('ix')
    hyperparams_dumped = json.dumps(experiment.get('hyperparams'), default=default)
    metrics_dumped = json.dumps(metrics, default=default)
    cmd = "python3 {}".format(experiment_file_path)
    # os.system(cmd)
    e = Experiment.run(cmd,
                       title='lgbm_reg_experiment-{}'.format(experiment.get('ix')),
                       arguments={
                           'experiment_ix': experiment_ix,
                           'hyperparams': "'{}'".format(hyperparams_dumped),
                           'artifacts_path': artifacts_path,
                           'metrics': "'{}'".format(metrics_dumped)
                       },
                       compute='medium',
                       output_dir='research/artifacts',
                       sync_before=False)
def __init__(self, model, train_set, test_set, output_model_name, testing_mode, folds=None,
             regression_type=0):
    self.__model = model
    self.__x_train, self.__y_train = train_set
    self.__train_set_size = len(self.__y_train)
    self.__x_test, self.__y_test = test_set
    self.__test_set_size = len(self.__y_test)
    self.__testing_mode = testing_mode
    self.__cross_val_folds = folds
    self.__is_cross_val = (folds is not None)
    self.__features = list(self.__x_train.columns)
    self.__labels = [str(l) for l in list(set(self.__y_train).union(set(self.__y_test)))]
    self.__metrics = {'model': output_model_name}
    self.__y_pred = None
    self.__experiment = Experiment.init('test_charts')  # replace with: self.__experiment = Experiment()
    self.__regression_type = SKTrainerRegression.REGRESSION_TYPE[regression_type]
    self.__coef, self.__intercept = None, None
def __init__(self, model, train_set, test_set, output_model_name, testing_mode, folds=None):
    self.__model = model
    self.__x_train, self.__y_train = train_set
    self.__x_test, self.__y_test = test_set
    self.__output_model_name = output_model_name
    self.__testing_mode = testing_mode
    self.__cross_val_folds = folds
    self.__is_cross_val = (folds is not None)
    self.__features = list(self.__x_train.columns)
    self.__labels = [str(l) for l in list(set(self.__y_train).union(set(self.__y_test)))]
    self.__model.fit(self.__x_train, self.__y_train)
    self.__importance = self.__model.feature_importances_
    self.__experiment = Experiment()
    self.__metrics = {'model': self.__output_model_name}
    if self.__is_cross_val:
        self.__metrics['folds'] = self.__cross_val_folds
def __init__(self, model, train_set, test_set, output_model_name, testing_mode):
    self.__model = model
    self.__x_train, _ = (train_set, None) if len(train_set) == 1 else train_set
    self.__train_set_size = len(self.__x_train)
    self.__x_test, self.__y_test = (test_set, None) if len(test_set) == 1 else test_set
    self.__test_set_size = len(self.__x_test)
    self.__testing_mode = testing_mode
    self.__features = list(self.__x_train.columns)
    self.__metrics = {'model': output_model_name}
    self.__labeled = len(train_set) == 2 or len(test_set) == 2  # if any of the sets includes target column.
    # self.__experiment = Experiment()
    self.__experiment = Experiment.init("test_charts")
def log_trial_start(self, trial):
    e = CNVRGExperiment.init()
    self._cnvrg_experiments[trial.trial_id] = e['slug']
    config = trial.config.copy()
    config.pop("callbacks", None)
    e.log_param("trial_id", trial.trial_id)
    e.log_param("run_id", trial.trial_id.split("_")[0])
    e.log(str(config))
    for item in config:
        e.log_param(item, config.get(item))
    e.log("======")
    e.log(str(trial))
def main(args):
    args = cast_types(args)
    df = pd.read_csv(args.data)
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    model = init_model(X.shape)  # <--- Doesn't work with the shape.
    train_metrics = model.fit(X_train, y_train, epochs=args.epochs, batch_size=args.batch_size,
                              validation_split=0.2)
    test_metrics = model.evaluate(X_test, y_test)
    # train_loss = list(np.round(train_metrics.history['loss'], 3))
    # train_acc = list(np.round(train_metrics.history['accuracy'], 3))
    # val_loss = list(np.round(train_metrics.history['val_loss'], 3))
    # val_acc = list(np.round(train_metrics.history['val_accuracy'], 3))
    test_loss = float(test_metrics[0])
    test_acc = float(test_metrics[1])
    exp = Experiment()
    exp.log_param("test_loss", test_loss)
    exp.log_param("test_acc", test_acc)
    model.save("model.h5")
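# The init_model(X.shape) call above is flagged as not working with the shape tuple.
# Below is a minimal sketch of such a builder, assuming a Keras Sequential binary
# classifier that only needs the feature count (shape[1]); the layer sizes, loss and
# optimizer are illustrative assumptions, not the original implementation.
from tensorflow import keras

def init_model(shape):
    n_features = shape[1]  # only the number of feature columns matters for the input layer
    model = keras.Sequential([
        keras.layers.Input(shape=(n_features,)),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid'),  # assumes a binary (0/1) target
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model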
def log_trial_end(self, trial, failed):
    e = CNVRGExperiment(self._cnvrg_experiments[trial.trial_id])
    e.log("===== Logging Artifacts =====")
    files_list = [os.path.join(trial.logdir, p) for p in os.listdir(trial.logdir)]
    e.log_artifacts(files_list)
    e.finish(exit_status=int(failed))
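# The log_trial_start / log_trial_result / log_trial_end methods above match Ray Tune's
# LoggerCallback interface. A minimal sketch of how such a callback might be registered
# with a tuning run; the CnvrgLoggerCallback class name and the toy trainable/search
# space are illustrative assumptions.
from ray import tune
from ray.tune.logger import LoggerCallback

class CnvrgLoggerCallback(LoggerCallback):
    """Hypothetical callback holding the log_trial_* methods shown above."""
    def __init__(self):
        self._cnvrg_experiments = {}
        self._cnvrg_metrics = []
    # log_trial_start / log_trial_result / log_trial_end would be defined here as above.

def toy_trainable(config):
    tune.report(loss=config["lr"])  # dummy metric so the callback has something to log

tune.run(toy_trainable,
         config={"lr": tune.grid_search([0.01, 0.1])},
         callbacks=[CnvrgLoggerCallback()])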
def lgbm_reg_cnvrg_api(experiment, artifacts_path, metrics):
    # Type handling when saving json (numpy types); np.integer / np.floating
    # cover the fixed-width numpy integer and float types.
    def default(o):
        if isinstance(o, (int, np.integer)):
            return int(o)
        if isinstance(o, (float, np.floating)):
            return float(o)
        raise TypeError

    experiment_ix = experiment.get('ix')
    hyperparams_dumped = json.dumps(experiment.get('hyperparams'), default=default)
    metrics_dumped = json.dumps(metrics, default=default)
    # cmd = "python3 research/lgbm_reg/train.py --experiment '{}' --artifacts_path '{}' --metrics '{}'".format(
    #     experiment_dumped, artifacts_path, metrics_dumped)
    cmd = "python3 research/lgbm_reg/train.py"
    # os.system(cmd)
    e = Experiment.run(cmd,
                       title='lgbm_reg_experiment-{}'.format(experiment.get('ix')),
                       arguments={
                           'experiment_ix': experiment_ix,
                           'hyperparams': "'{}'".format(hyperparams_dumped),
                           'artifacts_path': artifacts_path,
                           'metrics': "'{}'".format(metrics_dumped)
                       },
                       compute='medium',
                       output_dir='research/artifacts',
                       sync_before=False)
    e.pull_artifacts(wait_until_success=True)
def train_with_cross_validation(model, train_set, test_set, folds, project_dir, output_model_name):
    """
    This method enables sklearn algorithms to perform KFold cross-validation.
    The method also initiates the cnvrg.io experiment with all its metrics.

    :param model: SKlearn model object (initiated).
    :param train_set: tuple. (X_train, y_train). This is going to be used as a training set.
    :param test_set: tuple. (X_test, y_test). This is going to be used as a test set.
    :param folds: number of splits in the cross validation.
    :param project_dir: the path to the directory which indicates where to save the model.
    :param output_model_name: the name of the output model saved on the disk.
    :return: nothing.
    """
    train_acc, train_loss = [], []
    kf = KFold(n_splits=folds)
    X, y = train_set

    # --- Training.
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        model.fit(X_train, y_train)
        model.n_estimators += 1
        y_hat = model.predict(X_val)  # y_hat is a.k.a y_pred
        acc = accuracy_score(y_val, y_hat)
        loss = mean_squared_error(y_val, y_hat)
        train_acc.append(acc)
        train_loss.append(loss)

    # --- Testing.
    X_test, y_test = test_set
    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)
    test_loss = mean_squared_error(y_test, y_pred)

    exp = Experiment()
    exp.log_param("model", output_model_name)
    exp.log_param("folds", folds)
    exp.log_metric("train_acc", train_acc)
    exp.log_metric("train_loss", train_loss)
    exp.log_param("test_acc", test_acc)
    exp.log_param("test_loss", test_loss)

    # Save model.
    output_file_name = project_dir + "/" + output_model_name if project_dir is not None else output_model_name
    pickle.dump(model, open(output_file_name, 'wb'))
def train_without_cross_validation(model, train_set, test_set, project_dir, output_model_name):
    """
    The method also initiates the cnvrg.io experiment with all its metrics.

    :param model: SKlearn model object (initiated).
    :param train_set: tuple. (X_train, y_train). This is going to be used as a training set.
    :param test_set: tuple. (X_test, y_test). This is going to be used as a test set.
    :param project_dir: the path to the directory which indicates where to save the model.
    :param output_model_name: the name of the output model saved on the disk.
    :return: nothing.
    """
    X_train, y_train = train_set

    # --- Training.
    model.fit(X_train, y_train)
    y_hat = model.predict(X_train)  # y_hat is a.k.a y_pred
    train_acc = accuracy_score(y_train, y_hat)
    train_loss = mean_squared_error(y_train, y_hat)

    # --- Testing.
    X_test, y_test = test_set
    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)
    test_loss = mean_squared_error(y_test, y_pred)

    exp = Experiment()
    exp.log_param("model", output_model_name)
    exp.log_param("train_acc", train_acc)
    exp.log_param("train_loss", train_loss)
    exp.log_param("test_acc", test_acc)
    exp.log_param("test_loss", test_loss)

    # Save model.
    output_file_name = project_dir + "/" + output_model_name if project_dir is not None else output_model_name
    pickle.dump(model, open(output_file_name, 'wb'))
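# A minimal sketch of how these two helpers might be invoked, assuming a pandas
# DataFrame whose last column is the label and a model that exposes n_estimators
# (warm_start lets the `model.n_estimators += 1` in the CV loop add trees per fold);
# the file name and output model name are illustrative assumptions.
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

df = pd.read_csv("data.csv")
X, y = df.iloc[:, :-1], df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = GradientBoostingClassifier(n_estimators=10, warm_start=True)
train_with_cross_validation(model, (X_train, y_train), (X_test, y_test),
                            folds=5, project_dir=None, output_model_name="gb_model.sav")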
import os
import time
from cnvrg import Experiment

os.system("mkdir -p testfiles")
e = Experiment()
for commit in range(5):
    file_list = []
    for i in range(25):
        with open(f"testfiles/filet{i}.txt", 'w+') as file:
            file.write('hello')
        with open(f"testfiles/filet{i}.txt_tags.yml", 'w+') as yml:
            yml.write(f"---\nid: \"{i}\"\nsource: \"yann lecun\"")
        file_list.append(f"testfiles/filet{i}.txt")
        file_list.append(f"testfiles/filet{i}.txt_tags.yml")
    e.log_artifacts(file_list)
    time.sleep(5)
    print(f"committed: {commit}")
def lgbm_reg(experiment, artifacts_path, metrics):
    e = Experiment()
    [e.log_param(param, val) for param, val in experiment.get('hyperparams').items()]

    # init
    data = load_data(data_path)
    cv_data = create_cv_data(data['X_train'], data['y_train'], cv_config=cv_config)

    # hyperparams
    model = create_model(experiment['hyperparams'])

    # scores dict
    scores = {'raw_cv_scores': {}, 'cv_scores': {}, 'test_scores': {}}

    # cv
    for task in cv_data:
        X_train, y_train, X_test, y_test = task['X_train'], task['y_train'], task['X_test'], task['y_test']
        X_train, y_train, X_test, y_test = X_train[features_names], y_train[target_name], \
                                           X_test[features_names], y_test[target_name]
        model.fit(X_train, y_train)
        predictions_test = model.predict(X_test)
        predictions_train = model.predict(X_train)
        test_data_to_evaluate = (predictions_test, y_test)
        train_data_to_evaluate = (predictions_train, y_train)
        scores_train = evaluate(*train_data_to_evaluate, metrics=metrics, data_set_name='train_')
        scores_test = evaluate(*test_data_to_evaluate, metrics=metrics)
        task_scores = {**scores_test, **scores_train}
        for score in task_scores.keys():
            if scores['raw_cv_scores'].get(score) is None:
                scores['raw_cv_scores'][score] = []
            scores['raw_cv_scores'][score].append(task_scores[score])

    # process cv scores
    summarized_cv_scores = summarize_scores(scores['raw_cv_scores'])
    scores['cv_scores'].update(summarized_cv_scores)
    scores.update(summarized_cv_scores)

    # final model
    X_train, y_train, X_test, y_test = data['X_train'], data['y_train'], \
                                       data['X_test'], data['y_test']
    final_model = model.fit(X_train[features_names], y_train[target_name])
    predictions_test = model.predict(X_test[features_names])
    predictions_train = model.predict(X_train[features_names])
    test_data_to_evaluate = (predictions_test, y_test[target_name])
    train_data_to_evaluate = (predictions_train, y_train[target_name])
    scores_train = evaluate(*train_data_to_evaluate, metrics=metrics, data_set_name='train_')
    scores_test = evaluate(*test_data_to_evaluate, metrics=metrics)
    scores['test_scores'] = {**scores_test, **scores_train}
    experiment['scores'] = scores
    save_model(final_model, artifacts_path)
    save_model_scores(experiment, artifacts_path)
class SKTrainerRegression:
    DIGITS_TO_ROUND = 3
    REGRESSION_TYPE = ['linear', 'logistic']

    def __init__(self, model, train_set, test_set, output_model_name, testing_mode,
                 folds=None, regression_type=0):
        self.__model = model
        self.__x_train, self.__y_train = train_set
        self.__train_set_size = len(self.__y_train)
        self.__x_test, self.__y_test = test_set
        self.__test_set_size = len(self.__y_test)
        self.__testing_mode = testing_mode
        self.__cross_val_folds = folds
        self.__is_cross_val = (folds is not None)
        self.__features = list(self.__x_train.columns)
        self.__labels = [str(l) for l in list(set(self.__y_train).union(set(self.__y_test)))]
        self.__metrics = {'model': output_model_name}
        self.__y_pred = None
        self.__experiment = Experiment()
        self.__regression_type = SKTrainerRegression.REGRESSION_TYPE[regression_type]
        self.__coef, self.__intercept = None, None

    def run(self):
        self.__model.fit(self.__x_train, self.__y_train)
        try:
            self.__coef = self.__model.coef_
        except AttributeError:
            pass
        try:
            self.__intercept = self.__model.intercept_
        except AttributeError:
            pass
        if self.__is_cross_val:
            self.__metrics['folds'] = self.__cross_val_folds
        if self.__is_cross_val is True:
            self.__train_with_cross_validation()
        else:
            self.__train_without_cross_validation()
        self.__save_model()

    def __plot_all(self, y_test_pred):
        self.__plot_accuracies_and_errors()
        # self.__plot_regression_function()
        self.__plot_feature_importance()
        self.__plot_correlation_matrix()
        # self.__plot_feature_vs_feature()

    def __train_with_cross_validation(self):
        """
        This method enables sk-learn algorithms to perform KFold cross-validation.
        The method also initiates the cnvrg experiment with all its metrics.
        """
        scores = cross_validate(estimator=self.__model,
                                X=self.__x_train,
                                y=self.__y_train,
                                cv=self.__cross_val_folds,
                                return_train_score=True,
                                scoring=['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2', 'accuracy'],
                                return_estimator=True)
        train_err_cv_mse = (-1) * scores['train_neg_mean_squared_error']
        train_err_cv_mae = (-1) * scores['train_neg_mean_absolute_error']
        train_err_cv_r2 = scores['train_r2']
        val_acc_cv = scores['test_accuracy']
        val_err_cv_mse = (-1) * scores['test_neg_mean_squared_error']
        val_err_cv_mae = (-1) * scores['test_neg_mean_absolute_error']
        val_err_cv_r2 = scores['test_r2']
        self.__model = scores['estimator'][-1]
        self.__y_pred = self.__model.predict(self.__x_test)
        test_acc = accuracy_score(self.__y_test, self.__y_pred)
        test_loss = mean_squared_error(self.__y_test, self.__y_pred)
        self.__metrics.update({
            'train_loss_mae': train_err_cv_mae,
            'train_loss_mse': train_err_cv_mse,
            'train_loss_r2': train_err_cv_r2,
            'validation_acc': val_acc_cv,
            'val_loss_mae': val_err_cv_mae,
            'val_loss_mse': val_err_cv_mse,
            'val_loss_r2': val_err_cv_r2,
            'test_acc': test_acc,
            'test_loss_mse': test_loss})
        self.__plot_all(self.__y_pred)

    def __train_without_cross_validation(self):
        """
        The method also initiates the cnvrg experiment with all its metrics.
        """
        y_hat = self.__model.predict(self.__x_train)  # y_hat is a.k.a y_pred
        train_loss_MSE = mean_squared_error(self.__y_train, y_hat)
        train_loss_MAE = mean_absolute_error(self.__y_train, y_hat)
        train_loss_R2 = r2_score(self.__y_train, y_hat)
        self.__y_pred = self.__model.predict(self.__x_test)
        test_loss_MSE = mean_squared_error(self.__y_test, self.__y_pred)
        test_loss_MAE = mean_absolute_error(self.__y_test, self.__y_pred)
        test_loss_R2 = r2_score(self.__y_test, self.__y_pred)
        self.__metrics.update({
            'train_loss_mae': train_loss_MAE,
            'train_loss_mse': train_loss_MSE,
            'train_loss_r2': train_loss_R2,
            'test_loss_mse': test_loss_MSE,
            'test_loss_mae': test_loss_MAE,
            'test_loss_r2': test_loss_R2})
        self.__plot_all(self.__y_pred)

    def __plot_regression_function(self):
        if self.__regression_type == 'linear':
            a, b = self.__coef[0], self.__intercept
            x = np.linspace(-100, 100, 200)
            y = a * x + b
        elif self.__regression_type == 'logistic':
            x = np.linspace(-100, 100, 200)
            y = 1 / (1 + np.exp(-x))
        self.__experiment.log_metric(key="Regression Function",
                                     Xs=x.tolist(),
                                     Ys=y.tolist(),
                                     grouping=['regression line'] * len(x))

    def __plot_feature_importance(self):
        try:
            importance = getattr(self.__model, "feature_importances_")
            if self.__testing_mode is False:
                self.__experiment.log_chart('Feature Importance',
                                            x_axis='Features',
                                            y_axis='Importance',
                                            data=Bar(x=self.__features, y=importance))
            else:
                print(importance)
        except AttributeError:
            pass

    def __plot_accuracies_and_errors(self):
        if self.__testing_mode is True:
            print("Model: {model}\n"
                  "train_acc={train_acc}\n"
                  "train_loss={train_loss}\n"
                  "test_acc={test_acc}\n"
                  "test_loss={test_loss}".format(
                      model=self.__metrics['model'],
                      train_acc=self.__metrics['train_acc'],
                      train_loss=self.__metrics['train_loss'],
                      test_acc=self.__metrics['test_acc'],
                      test_loss=self.__metrics['test_loss']))
            if self.__is_cross_val is True:
                print("Folds: {folds}\n".format(folds=self.__metrics['folds']))
        else:  # testing mode is off.
            for k, v in self.__metrics.items():
                self.__plot_accuracies_and_errors_helper()
                if isinstance(v, list):
                    self.__experiment.log_metric(k, v)
                else:
                    self.__experiment.log_param(k, v)

    def __plot_accuracies_and_errors_helper(self):
        for k, v in self.__metrics.items():
            if isinstance(v, float):
                self.__metrics[k] = round(self.__metrics[k], SKTrainerRegression.DIGITS_TO_ROUND)

    def __save_model(self):
        output_model_name = self.__metrics['model']
        output_file_name = os.environ.get("CNVRG_WORKDIR") + "/" + output_model_name \
            if os.environ.get("CNVRG_WORKDIR") is not None else output_model_name
        pickle.dump(self.__model, open(output_file_name, 'wb'))

    """training & testing methods"""

    def __plot_correlation_matrix(self):
        data = pd.concat([pd.concat([self.__x_train, self.__x_test], axis=0),
                          pd.concat([self.__y_train, self.__y_test], axis=0)], axis=1)
        correlation = data.corr()
        self.__experiment.log_chart("correlation",
                                    [MatrixHeatmap(np.round(correlation.values, 2))],
                                    x_ticks=correlation.index.tolist(),
                                    y_ticks=correlation.index.tolist())

    def __plot_feature_vs_feature(self):
        data = pd.concat([pd.concat([self.__x_train, self.__x_test], axis=0),
                          pd.concat([self.__y_train, self.__y_test], axis=0)], axis=1)
        indexes = data.select_dtypes(include=["number"]).columns
        corr = data.corr()
        for idx, i in enumerate(indexes):
            for jdx, j in enumerate(indexes):
                if i == j:
                    continue
                if jdx < idx:
                    continue
                corr_val = abs(corr[i][j])
                if 1 == corr_val or corr_val < 0.5:
                    continue
                print("create", i, "against", j, "scatter chart")
                droplines = data[[i, j]].notnull().all(1)
                x, y = data[droplines][[i, j]].values.transpose()
                self.__experiment.log_chart("{i}_against_{j}".format(i=i, j=j),
                                            [Scatterplot(x=x.tolist(), y=y.tolist())],
                                            title="{i} against {j}".format(i=i, j=j))
from cnvrg import Experiment
import time
import os

e = Experiment()
e.log_param("test_acc", 0.6)
f = open("filename_06", "a")
f.write("hello")
f.close()
e.sync(message="my commit: 06")
def train_without_cross_validation(model, train_set, test_set, project_dir, output_model_name):
    X_train, y_train = train_set

    # --- Training.
    model.fit(X_train, y_train)
    y_hat = model.predict(X_train)  # y_hat is a.k.a y_pred
    train_acc = accuracy_score(y_train, y_hat)
    train_loss = mean_squared_error(y_train, y_hat)

    # --- Testing.
    X_test, y_test = test_set
    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)
    test_loss = mean_squared_error(y_test, y_pred)

    exp = Experiment()
    exp.log_param("model", output_model_name)
    exp.log_param("train_acc", train_acc)
    exp.log_param("train_loss", train_loss)
    exp.log_param("test_acc", test_acc)
    exp.log_param("test_loss", test_loss)

    # Save model.
    output_file_name = project_dir + "/" + output_model_name if project_dir is not None else output_model_name
    pickle.dump(model, open(output_file_name, 'wb'))
class CSVProcessor:
    def __init__(self, path_to_csv, target_column=None, missing_dict=None, scale_dict=None,
                 normalize_list=None, one_hot_list=None, output_name=None, plot_vis=False):
        """
        :param path_to_csv: string
        :param target_column: string
        :param missing_dict: dict
        :param scale_dict: dict
        :param normalize_list: list
        :param one_hot_list: list
        :param output_name: string
        """
        self.__cnvrg_env = True  ### When testing locally, it is turned False.
        self.__data = pd.read_csv(path_to_csv, index_col=0)
        self.__target_column = (target_column, self.__data[target_column]) if target_column is not None \
            else (self.__data.columns[-1], self.__data[self.__data.columns[-1]])
        self.__features = [f for f in list(self.__data.columns) if f != self.__target_column[0]]
        self.__data = self.__data[self.__features]  # removes the target column.
        try:
            self.__experiment = Experiment()
        except cnvrg.modules.errors.UserError:
            self.__cnvrg_env = False
        self.__normalize_list = CSVProcessor.__parse_list(normalize_list) \
            if isinstance(normalize_list, str) else normalize_list
        self.__one_hot_list = CSVProcessor.__parse_list(one_hot_list) \
            if isinstance(one_hot_list, str) else one_hot_list
        self.__output_name = output_name if output_name is not None \
            else path_to_csv.split('.csv')[0] + '_processed.csv'
        self.__plot_vis = plot_vis
        ### changed to list of lists instead of dictionary:
        self.__scale_dict = CSVProcessor.__parse_2d_list(scale_dict) \
            if isinstance(scale_dict, str) else scale_dict
        self.__missing_dict = CSVProcessor.__parse_2d_list(missing_dict) \
            if isinstance(missing_dict, str) else missing_dict

    def run(self):
        self.__handle_missing()
        self.__one_hot_encoding_aka_dummy()
        self.__scale()
        self.__normalize()
        self.__set_target_column()
        self.__save()
        if self.__cnvrg_env:
            self.__plot_metrics()  ### using cnvrg.
            self.__plot_visualization(plot_correlation=True)  ### using cnvrg.
        self.__check_nulls_before_output()

    def __scale(self):
        scale = lambda m, r_min, r_max, t_min, t_max: (((m - r_min) / (r_max - r_min)) * (t_max - t_min)) + t_min
        if self.__scale_dict is not None:
            scale_all = False
            if set(self.__scale_dict.keys()) == set('all'):
                scale_all = True
            columns_to_scale = self.__features if scale_all is True else self.__scale_dict.keys()
            for col in columns_to_scale:
                y, x = (self.__data[col].min(), self.__data[col].max()) if scale_all \
                    else CSVProcessor.__scale_helper(self.__scale_dict[col])
                self.__data[col] = scale(self.__data[col], self.__data[col].min(), self.__data[col].max(), y, x)

    def __normalize(self):
        if self.__normalize_list is not None:
            normalize_all = False
            if set(self.__normalize_list) == set('all'):
                normalize_all = True
            columns_to_scale = self.__features if normalize_all is True else self.__normalize_list
            for col in columns_to_scale:
                min_range, max_range = self.__data[col].min(), self.__data[col].max()
                self.__data[col] -= min_range
                self.__data[col] /= (max_range - min_range)

    def __one_hot_encoding_aka_dummy(self):
        """
        Handles dummys.
        """
        if self.__one_hot_list is not None:
            self.__data = pd.get_dummies(self.__data, columns=self.__one_hot_list)

    def __handle_missing(self):
        """
        Options:
        1) fill_X (fill with value x)
        2) drop
        3) avg (fill with avg)
        4) med (short of median)
        5) rand_A_B (fill with random value in range [A,B])
        """
        if self.__missing_dict is not None:
            handle_all, task_all = False, None
            if set(self.__missing_dict.keys()) == set('all'):
                handle_all, task_all = True, self.__missing_dict['all']
            column_to_handle = self.__features if handle_all is True else self.__missing_dict.keys()
            for col in column_to_handle:
                task = task_all if task_all is not None else self.__missing_dict[col]
                if task.startswith('fill_'):
                    value = float(task[len('fill_'):]) if '.' in task[len('fill_'):] else int(task[len('fill_'):])
                    self.__data[col] = self.__data[col].fillna(value)
                elif task.startswith('drop'):
                    self.__data = self.__data[self.__data[col].notna()]
                elif task.startswith('avg'):
                    self.__data[col] = self.__data[col].fillna(self.__data[col].mean())
                elif task.startswith('med'):
                    self.__data[col] = self.__data[col].fillna(self.__data[col].median())
                elif task.startswith('randint_'):
                    a, b = task[len('randint_'):].split('_')
                    a, b = float(a) if '.' in a else int(a), float(b) if '.' in b else int(b)
                    self.__data[col] = self.__data[col].fillna(np.random.randint(a, b))
                else:
                    raise ValueError('Missing Values Handling - Undefined task.')

    def __set_target_column(self):
        self.__data[self.__target_column[0]] = self.__target_column[1]

    def __plot_metrics(self):
        self.__experiment.log_param("output_file", self.__output_name)

    def __plot_visualization(self, plot_correlation=True):
        if self.__plot_vis is False:
            return
        # Tasks:
        if plot_correlation:
            self.__plot_correlation_matrix()

    def __save(self):
        self.__data.to_csv(self.__output_name)

    def __check_nulls_before_output(self):
        # Check empty and nan values to warn the user.
        time.sleep(8)
        nulls_report = dict(self.__data.isnull().sum())
        features_with_null_values = [k for k, v in nulls_report.items() if v != 0]
        # if len(features_with_null_values) != 0:
        #     warnings.warn("Null values or empty cells in the data set.", UserWarning)
        return

    """ ------------------- """
    """ ----- Helpers ----- """
    """ ------------------- """

    @staticmethod
    def __parse_2d_list(as_string):
        final_dict = {}
        trimmed = as_string.replace(' ', '')
        commans_idxs = [0] + [i for i in range(1, len(trimmed))
                              if trimmed[i] == ',' and trimmed[i - 1] == ']' and trimmed[i + 1] == '['] \
                       + [len(trimmed) - 1]  ### if its 0, we have single array.
        sub_lists = [trimmed[commans_idxs[i - 1] + 1:commans_idxs[i]] for i in range(1, len(commans_idxs))] \
            if len(commans_idxs) > 2 else [trimmed[1:-1]]
        for sub_list in sub_lists:
            parsed = CSVProcessor.__parse_list(sub_list)
            try:
                final_dict[parsed[0]] = (parsed[1], parsed[2])  ### for scaling.
            except IndexError:
                final_dict[parsed[0]] = parsed[1]  ### for filling empty values.
        return final_dict

    @staticmethod
    def __parse_list(list_as_string):
        if list_as_string == '[]':
            return []
        list_without_parenthesis = list_as_string.strip()[1:-1]
        parsed_list = [st.strip() for st in list_without_parenthesis.split(',')]
        # Check if the values are columns numbers.
        try:
            parsed_list = [int(st) for st in parsed_list]
        except ValueError:
            pass
        return parsed_list

    @staticmethod
    def __parse_dict(dict_as_string):
        if dict_as_string == '{}':
            return {}
        final_key = dict()
        parsed_dict = eval(dict_as_string)
        if not isinstance(parsed_dict, dict):
            raise TypeError('Given a {} instead of dictionary.'.format(type(parsed_dict)))
        all_keys = parsed_dict.keys()
        for k in all_keys:
            true_key, true_value = k, parsed_dict[k].split(':')
            true_key = true_key.strip()
            final_key[true_key] = true_value
        return final_key

    @staticmethod
    def __scale_helper(value):
        min_val, max_val = value.split(':') if isinstance(value, str) else value[0], value[1]
        min_val = float(min_val) if '.' in min_val else int(min_val)
        max_val = float(max_val) if '.' in max_val else int(max_val)
        return min_val, max_val

    def __plot_correlation_matrix(self, digits_to_round=3):
        correlation = self.__data.corr()
        self.__experiment.log_chart(
            "Correlation",
            [MatrixHeatmap(np.round(correlation.values, digits_to_round))],
            x_ticks=correlation.index.tolist(),
            y_ticks=correlation.index.tolist())
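# A minimal usage sketch for CSVProcessor, assuming a local data.csv whose first column
# is an index and whose last column is the target; the column names and the string-encoded
# missing/normalize/one-hot specs are illustrative assumptions in the format the parsers
# above expect.
processor = CSVProcessor(path_to_csv='data.csv',
                         target_column=None,                             # defaults to the last column
                         missing_dict="[[age, avg], [salary, fill_0]]",  # parsed by __parse_2d_list
                         normalize_list="[height, weight]",              # parsed by __parse_list
                         one_hot_list="[city]",
                         plot_vis=True)
processor.run()  # writes data_processed.csv next to the input file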
class SKTrainer:
    DIGITS_TO_ROUND = 3

    def __init__(self, model, train_set, test_set, output_model_name, testing_mode, folds=None):
        self.__model = model
        self.__x_train, self.__y_train = train_set
        self.__x_test, self.__y_test = test_set
        self.__output_model_name = output_model_name
        self.__testing_mode = testing_mode
        self.__cross_val_folds = folds
        self.__is_cross_val = (folds is not None)
        self.__features = list(self.__x_train.columns)
        self.__labels = [str(l) for l in list(set(self.__y_train).union(set(self.__y_test)))]
        self.__model.fit(self.__x_train, self.__y_train)
        self.__importance = self.__model.feature_importances_
        self.__experiment = Experiment()
        self.__metrics = {'model': self.__output_model_name}
        if self.__is_cross_val:
            self.__metrics['folds'] = self.__cross_val_folds

    def run(self):
        """ runs the training & testing methods. """
        if self.__is_cross_val is True:
            self.__train_with_cross_validation()
        else:
            self.__train_without_cross_validation()

    """training & testing methods"""

    def __train_with_cross_validation(self):
        """
        This method enables sk-learn algorithms to perform KFold cross-validation.
        The method also initiates the cnvrg experiment with all its metrics.
        """
        train_acc, train_loss = [], []
        kf = KFold(n_splits=self.__cross_val_folds)
        for train_index, val_index in kf.split(self.__x_train):
            X_train, X_val = self.__x_train.iloc[train_index, :], self.__x_train.iloc[val_index, :]
            y_train, y_val = self.__y_train.iloc[train_index], self.__y_train.iloc[val_index]
            self.__model = self.__model.fit(X_train, y_train)
            y_hat = self.__model.predict(X_val)  # y_hat is a.k.a y_pred
            acc = accuracy_score(y_val, y_hat)
            loss = mean_squared_error(y_val, y_hat)
            train_acc.append(acc)
            train_loss.append(loss)

        # --- Testing.
        y_pred = self.__model.predict(self.__x_test)
        test_acc = accuracy_score(self.__y_test, y_pred)
        test_loss = mean_squared_error(self.__y_test, y_pred)
        self.__metrics.update({
            'test_acc': test_acc,
            'test_loss': test_loss
        })
        self.__plot_all(y_pred)

    def __train_without_cross_validation(self):
        """
        The method also initiates the cnvrg experiment with all its metrics.
        """
        y_hat = self.__model.predict(self.__x_train)  # y_hat is a.k.a y_pred
        train_acc = accuracy_score(self.__y_train, y_hat)
        train_loss = mean_squared_error(self.__y_train, y_hat)
        y_pred = self.__model.predict(self.__x_test)
        test_acc = accuracy_score(self.__y_test, y_pred)
        test_loss = mean_squared_error(self.__y_test, y_pred)
        self.__metrics.update({
            'train_acc': train_acc,
            'train_loss': train_loss,
            'test_acc': test_acc,
            'test_loss': test_loss
        })
        self.__plot_all(y_pred)

    """Plotting methods"""

    def __plot_feature_importance(self):
        if self.__testing_mode is False:
            self.__experiment.log_chart('Feature Importance',
                                        x_axis='Features',
                                        y_axis='Importance',
                                        data=Bar(x=self.__features, y=self.__importance))
        else:
            print(self.__importance)

    def __plot_classification_report(self, y_test_pred):
        test_report = classification_report(self.__y_test, y_test_pred, output_dict=True)  # dict
        if self.__testing_mode is False:
            testing_report_as_array = self.__helper_plot_classification_report(test_report)
            self.__experiment.log_chart("Test Set - Classification Report",
                                        data=Heatmap(z=testing_report_as_array),
                                        y_ticks=self.__labels,
                                        x_ticks=["precision", "recall", "f1-score", "support"])
        else:
            print(test_report)

    def __helper_plot_classification_report(self, classification_report_dict):
        """ Converts dictionary given by classification_report to list of lists. """
        rows = []
        for k, v in classification_report_dict.items():
            if k in self.__labels:
                rows.append(list(v.values()))
        values = []
        for y in range(len(rows)):
            for x in range(len(rows[y])):
                values.append((x, y, round(rows[y][x], SKTrainer.DIGITS_TO_ROUND)))
        return values

    def __plot_confusion_matrix(self, y_test_pred=None):
        if self.__y_test is not None and y_test_pred is not None:
            confusion_mat_test = confusion_matrix(self.__y_test, y_test_pred)  # array
            confusion_mat_test = self.__helper_plot_confusion_matrix(confusion_mat_test)
            if self.__testing_mode is False:
                self.__experiment.log_chart("Test Set - confusion matrix", data=Heatmap(z=confusion_mat_test))
            else:
                print(confusion_mat_test)

    def __helper_plot_confusion_matrix(self, confusion_matrix):
        output = []
        for y in range(len(confusion_matrix)):
            for x in range(len(confusion_matrix[y])):
                output.append((x, y, round(float(confusion_matrix[x][y]), SKTrainer.DIGITS_TO_ROUND)))
        return output

    def __plot_roc_curve(self, y_test_pred):
        n_classes = len(self.__labels)
        y_test = self.__y_test.tolist()
        y_test_pred = y_test_pred.tolist()
        if n_classes != 2 or self.__testing_mode is True:
            return
        y_test, y_test_pred = list(y_test), list(y_test_pred)
        FPRs, TPRs, _ = roc_curve(y_test, y_test_pred)
        self.__experiment.log_metric(key='ROC curve', Ys=TPRs.tolist(), Xs=FPRs.tolist())

    def __plot_pandas_analyzer(self):
        data = pd.concat([pd.concat([self.__x_train, self.__x_test], axis=0),
                          pd.concat([self.__y_train, self.__y_test], axis=0)], axis=1)
        if self.__testing_mode is False:
            PandasAnalyzer(data, experiment=self.__experiment)

    def __plot_accuracies_and_errors(self):
        self.__plot_accuracies_and_errors_helper()
        if self.__testing_mode is True:
            print("Model: {model}\n"
                  "train_acc={train_acc}\n"
                  "train_loss={train_loss}\n"
                  "test_acc={test_acc}\n"
                  "test_loss={test_loss}".format(
                      model=self.__metrics['model'],
                      train_acc=self.__metrics['train_acc'],
                      train_loss=self.__metrics['train_loss'],
                      test_acc=self.__metrics['test_acc'],
                      test_loss=self.__metrics['test_loss']))
            if self.__is_cross_val is True:
                print("Folds: {folds}\n".format(folds=self.__metrics['folds']))
        else:  # testing_mode is False
            self.__experiment.log_param("model", self.__metrics['model'])
            self.__experiment.log_param("test_acc", self.__metrics['test_acc'])
            self.__experiment.log_param("test_loss", self.__metrics['test_loss'])
            if self.__is_cross_val is True:
                self.__experiment.log_param("folds", self.__metrics['folds'])
                self.__experiment.log_metric("train_acc", self.__metrics['train_acc'])
                self.__experiment.log_metric("train_loss", self.__metrics['train_loss'])
                return
            self.__experiment.log_param("train_acc", self.__metrics['train_acc'])
            self.__experiment.log_param("train_loss", self.__metrics['train_loss'])

    def __plot_accuracies_and_errors_helper(self):
        """Rounds all the values in self.__metrics"""
        keys_to_round = ['train_acc', 'train_loss', 'test_acc', 'test_loss']
        for key in keys_to_round:
            self.__metrics[key] = round(self.__metrics[key], SKTrainer.DIGITS_TO_ROUND)

    def __plot_all(self, y_test_pred):
        """ Runs all the plotting methods. """
        self.__plot_pandas_analyzer()
        self.__plot_feature_importance()
        self.__plot_classification_report(y_test_pred=y_test_pred)
        self.__plot_confusion_matrix(y_test_pred=y_test_pred)
        self.__plot_roc_curve(y_test_pred=y_test_pred)
        self.__plot_accuracies_and_errors()
        self.__save_model()

    """technical methods"""

    def __save_model(self):
        output_file_name = os.environ.get("CNVRG_PROJECT_PATH") + "/" + self.__output_model_name \
            if os.environ.get("CNVRG_PROJECT_PATH") is not None else self.__output_model_name
        pickle.dump(self.__model, open(output_file_name, 'wb'))
        if not self.__testing_mode:
            os.system("ls -la {}".format(os.environ.get("CNVRG_PROJECT_PATH")))
class NbConverter:
    def __init__(self, input, to, template, inplace, allow_errors):
        self.__cnvrg_env = True  # When testing locally, it is turned False.
        self.input = input
        self.to = to
        self.template = template
        self.inplace = inplace
        self.allow_errors = allow_errors
        try:
            self.__experiment = Experiment()
        except:
            self.__cnvrg_env = False
        if self.__cnvrg_env:
            self.__experiment.log_param("template", template)

    def run(self):
        if self.__cnvrg_env:
            self.__experiment.log("Configuring nbconvert options")
        run_string = ''
        if self.allow_errors is False:
            if self.template is None:
                if self.to != 'notebook':
                    run_string = "jupyter nbconvert --to {} {}".format(self.to, self.input)
                elif self.inplace is True and self.to == 'notebook':
                    run_string = "jupyter nbconvert --inplace --to {} {}".format(self.to, self.input)
                else:
                    run_string = "jupyter nbconvert --to notebook {}".format(self.input)
            else:
                run_string = "jupyter nbconvert --to {} --template {} {}".format(self.to, self.template, self.input)
        else:
            if self.template is None:
                if self.to != 'notebook':
                    run_string = "jupyter nbconvert --allow-errors --to {} {}".format(self.to, self.input)
                elif self.inplace is True and self.to == 'notebook':
                    run_string = "jupyter nbconvert --allow-errors --inplace --to {} {}".format(self.to, self.input)
                else:
                    run_string = "jupyter nbconvert --allow-errors --to notebook {}".format(self.input)
            else:
                run_string = "jupyter nbconvert --allow-errors --to {} --template {} {}".format(
                    self.to, self.template, self.input)

        log_string = "Running command: {}".format(run_string)
        run_list = run_string.split(' ')
        dir = '/cnvrg'
        if self.__cnvrg_env:
            self.__experiment.log(log_string)
        try:
            subprocess.call(run_list, cwd=dir)
        except OSError:
            print('jupyter nbconvert was unsuccessful. Please check your file path and parameters.')
            exit(1)
        if self.__cnvrg_env:
            self.__experiment.log("Conversion finished")
import time
import os
from cnvrg import Experiment

e = Experiment()
i = 0
while True:
    filename = "test-{file_idx}.log".format(file_idx=i)
    f = open(filename, "a")
    f.write("hello")
    f.close()
    e.sync(message="my commit: %d" % i)
    time.sleep(60)
    i += 1
class SKTrainer:
    def __init__(self, model, train_set, test_set, output_model_name, testing_mode, folds=None):
        self.__model = model
        self.__x_train, self.__y_train = train_set
        self.__x_test, self.__y_test = test_set
        self.__all_data_concatenated = pd.concat(
            [pd.concat([self.__x_train, self.__x_test], axis=0),
             pd.concat([self.__y_train, self.__y_test], axis=0)], axis=1)
        self.__testing_mode = testing_mode
        self.__cross_val_folds = folds
        self.__is_cross_val = (folds is not None)
        self.__features = list(self.__x_train.columns)
        self.__labels = [str(l) for l in list(set(self.__y_train).union(set(self.__y_test)))]
        self.__metrics = {'model': output_model_name,
                          'train set size': len(self.__y_train),
                          'test set size': len(self.__y_test)}
        self.__experiment = Experiment()

    def run(self):
        """ runs the training & testing methods. """
        self.__model.fit(self.__x_train.values, self.__y_train.values)
        if self.__is_cross_val:
            self.__metrics['folds'] = self.__cross_val_folds
        if self.__is_cross_val is True:
            self.__train_with_cross_validation()
        else:
            self.__train_without_cross_validation()
        self.__save_model()

    def __plot_all(self, y_test_pred):
        """
        This method controls the visualization and metrics outputs.
        Hashtag something which you don't want to plot.
        """
        self.__plot_correlation_matrix()
        # self.__plot_feature_vs_feature()
        self.__plot_feature_importance()
        self.__plot_classification_report(y_test_pred=y_test_pred)
        self.__plot_confusion_matrix(y_test_pred=y_test_pred)
        self.__plot_roc_curve(y_test_pred=y_test_pred)
        self.__plot_accuracies_and_errors()

    """training & testing methods"""

    def __train_with_cross_validation(self):
        """
        This method enables sk-learn algorithms to perform KFold cross-validation.
        The method also initiates the cnvrg experiment with all its metrics.
        """
        scores = cross_validate(estimator=self.__model,
                                X=self.__x_train.values,
                                y=self.__y_train.values,
                                cv=self.__cross_val_folds,
                                return_train_score=True,
                                scoring=['neg_mean_squared_error', 'accuracy'],
                                return_estimator=True)
        train_acc_cv = scores['train_accuracy']
        train_err_cv = (-1) * scores['train_neg_mean_squared_error']
        val_acc_cv = scores['test_accuracy']
        val_err_cv = (-1) * scores['test_neg_mean_squared_error']
        self.__model = scores['estimator'][-1]
        y_pred = self.__model.predict(self.__x_test.values)
        test_acc = accuracy_score(self.__y_test.values, y_pred)
        test_loss = zero_one_loss(self.__y_test.values, y_pred)
        self.__metrics.update({
            'train_acc': train_acc_cv,
            'train_loss': train_err_cv,
            'train_loss_type': 'MSE',
            'validation_acc': val_acc_cv,
            'validation_loss': val_err_cv,
            'validation_loss_type': 'MSE',
            'test_acc': test_acc,
            'test_loss': test_loss,
            'test_loss_type': 'zero_one_loss'
        })
        self.__plot_all(y_pred)

    def __train_without_cross_validation(self):
        """
        The method also initiates the cnvrg experiment with all its metrics.
        """
        y_hat = self.__model.predict(self.__x_train.values)  # y_hat is a.k.a y_pred
        train_acc = accuracy_score(self.__y_train, y_hat)
        train_loss = zero_one_loss(self.__y_train, y_hat)
        y_pred = self.__model.predict(self.__x_test.values)
        test_acc = accuracy_score(self.__y_test, y_pred)
        test_loss = zero_one_loss(self.__y_test, y_pred)
        self.__metrics.update({
            'train_acc': train_acc,
            'train_loss': train_loss,
            'train_loss_type': 'zero_one_loss',
            'test_acc': test_acc,
            'test_loss': test_loss,
            'test_loss_type': 'zero_one_loss'
        })
        self.__plot_all(y_pred)

    def __plot_feature_importance(self):
        try:
            importance = getattr(self.__model, "feature_importances_")
            if self.__testing_mode is False:
                self.__experiment.log_chart('Feature Importance',
                                            x_axis='Features',
                                            y_axis='Importance',
                                            data=Bar(x=self.__features, y=importance))
            else:
                print(importance)
        except AttributeError:
            pass

    def __plot_classification_report(self, y_test_pred):
        test_report = classification_report(self.__y_test, y_test_pred, output_dict=True)  # dict
        if self.__testing_mode is False:
            testing_report_as_array = self.__helper_plot_classification_report(test_report)
            self.__experiment.log_chart("Test Set - Classification Report",
                                        data=Heatmap(z=testing_report_as_array),
                                        y_ticks=self.__labels,
                                        x_ticks=["precision", "recall", "f1-score", "support"])
        else:
            print(test_report)

    def __plot_confusion_matrix(self, y_test_pred=None):
        """ Plots the confusion matrix. """
        if self.__y_test is not None and y_test_pred is not None:
            confusion_mat_test = confusion_matrix(self.__y_test, y_test_pred)  # array
            confusion_mat_test = SKTrainer.__helper_plot_confusion_matrix(confusion_mat_test)
            if self.__testing_mode is False:
                self.__experiment.log_chart("Test Set - confusion matrix", data=Heatmap(z=confusion_mat_test))
            else:
                print(confusion_mat_test)

    def __plot_roc_curve(self, y_test_pred):
        if len(set(self.__y_test)) != 2:
            return
        fpr, tpr, _ = roc_curve(self.__y_test, y_test_pred)
        if self.__testing_mode is False:
            self.__experiment.log_metric(key='ROC curve', Ys=tpr.tolist(), Xs=fpr.tolist())
        else:
            print("FPRs: {fpr}\nTPRs: {tpr}".format(fpr=fpr, tpr=tpr))

    def __plot_correlation_matrix(self):
        data = self.__all_data_concatenated
        correlation = data.corr()
        self.__experiment.log_chart("correlation",
                                    [MatrixHeatmap(np.round(correlation.values, 2))],
                                    x_ticks=correlation.index.tolist(),
                                    y_ticks=correlation.index.tolist())

    def __plot_feature_vs_feature(self):
        data = self.__all_data_concatenated
        indexes = data.select_dtypes(include=["number"]).columns
        corr = data.corr()
        for idx, i in enumerate(indexes):
            for jdx, j in enumerate(indexes):
                if i == j:
                    continue
                if jdx < idx:
                    continue
                corr_val = abs(corr[i][j])
                if 1 == corr_val or corr_val < 0.5:
                    continue
                droplines = data[[i, j]].notnull().all(1)
                x, y = data[droplines][[i, j]].values.transpose()
                self.__experiment.log_chart("{i}_against_{j}".format(i=i, j=j),
                                            [Scatterplot(x=x.tolist(), y=y.tolist())],
                                            title="{i} against {j}".format(i=i, j=j))

    def __plot_accuracies_and_errors(self):
        self.__plot_accuracies_and_errors_helper_rounding()
        if self.__testing_mode is True:
            self.__plot_accuracies_and_errors_helper_testing_mode()
        for p in ['model', 'test_acc', 'test_loss', 'test_loss_type', 'train set size',
                  'test set size', 'train_loss_type']:
            self.__experiment.log_param(p, self.__metrics[p])
        if self.__is_cross_val is True:
            self.__experiment.log_param("folds", self.__metrics['folds'])
            self.__experiment.log_param("validation_loss_type", self.__metrics['validation_loss_type'])
            metrics = ['train_acc', 'train_loss', 'validation_acc', 'validation_loss']
            for m in metrics:
                self.__experiment.log_metric(m, self.__metrics[m], grouping=[m] * len(self.__metrics[m]))
            return
        self.__experiment.log_param("train_acc", self.__metrics['train_acc'])
        self.__experiment.log_param("train_loss", self.__metrics['train_loss'])
        self.__experiment.log_param("train_loss_type", self.__metrics['train_loss_type'])

    def __save_model(self):
        output_model_name = self.__metrics['model']
        output_file_name = os.environ.get("CNVRG_WORKDIR") + "/" + output_model_name \
            if os.environ.get("CNVRG_WORKDIR") is not None else output_model_name
        pickle.dump(self.__model, open(output_file_name, 'wb'))

    """ --- Helpers --- """

    @staticmethod
    def __helper_plot_confusion_matrix(confusion_matrix, digits_to_round=3):
        output = []
        for y in range(len(confusion_matrix)):
            for x in range(len(confusion_matrix[y])):
                output.append((x, y, round(float(confusion_matrix[x][y]), digits_to_round)))
        return output

    def __plot_accuracies_and_errors_helper_rounding(self, digits_to_round=3):
        for key in self.__metrics.keys():
            # Skip strings.
            if isinstance(self.__metrics[key], str):
                continue
            # Lists & Arrays.
            elif isinstance(self.__metrics[key], list) or isinstance(self.__metrics[key], np.ndarray):
                if isinstance(self.__metrics[key], np.ndarray):
                    self.__metrics[key] = self.__metrics[key].tolist()
                for ind in range(len(self.__metrics[key])):
                    self.__metrics[key][ind] = round(self.__metrics[key][ind], digits_to_round)
            # int & floats.
            else:
                self.__metrics[key] = round(self.__metrics[key], digits_to_round)

    def __plot_accuracies_and_errors_helper_testing_mode(self, digits_to_round=3):
        print("Model: {model}\n"
              "train_acc={train_acc}\n"
              "train_loss={train_loss}\n"
              "test_acc={test_acc}\n"
              "test_loss={test_loss}".format(
                  model=self.__metrics['model'],
                  train_acc=self.__metrics['train_acc'],
                  train_loss=self.__metrics['train_loss'],
                  test_acc=self.__metrics['test_acc'],
                  test_loss=self.__metrics['test_loss']))
        if self.__is_cross_val is True:
            print("Folds: {folds}\n".format(folds=self.__metrics['folds']))

    def __helper_plot_classification_report(self, classification_report_dict, digits_to_round=3):
        """ Converts dictionary given by classification_report to list of lists. """
        rows = []
        for k, v in classification_report_dict.items():
            if k in self.__labels:
                rows.append(list(v.values()))
        values = []
        for y in range(len(rows)):
            for x in range(len(rows[y])):
                values.append((x, y, round(rows[y][x], digits_to_round)))
        return values
class TensorflowTrainer:
    GRAYSCALE_CHANNELS, RGB_CHANNELS = 1, 3
    VERBOSE = 1
    WORKERS = 3
    fully_connected_layers = [1024, 512, 256]

    def __init__(self, arguments, model_name, base_model):
        self.__cnvrg_env = True
        self.__arguments = cast_input_types(arguments)
        self.__shape = (arguments.image_height, arguments.image_width)
        self.__classes = parse_classes(arguments.data)
        self.__channels = TensorflowTrainer.RGB_CHANNELS if arguments.image_color == 'rgb' \
            else TensorflowTrainer.GRAYSCALE_CHANNELS
        self.__model = ModelGenerator(
            base_model=base_model,
            num_of_classes=len(self.__classes),
            fully_connected_layers=TensorflowTrainer.fully_connected_layers,
            loss_function=arguments.loss,
            dropout=arguments.dropout,
            activation_hidden_layers=arguments.hidden_layer_activation,
            activation_output_layers=arguments.output_layer_activation,
            optimizer=arguments.optimizer).get_model()
        try:
            self.__experiment = Experiment()
        except cnvrg.modules.UserError:
            self.__cnvrg_env = False
        self.__metrics = {
            'tensorflow local version': tf.__version__,
            'GPUs found': len(tf.config.experimental.list_physical_devices('GPU')),
            'Model': model_name,
            'Classes list': self.__classes
        }

    def run(self):
        if self.__cnvrg_env:
            self.__plot_all(status='pre-training')  ### using cnvrg.
        self.__train()
        self.__test()
        if self.__cnvrg_env:
            self.__plot_all()  ### using cnvrg.
        self.__export_model()  ### using cnvrg.

    def __plot_all(self, status='post-test'):
        if status == 'pre-training':
            self.__plot_metrics(status='pre-training')
        elif status == 'post-test' and self.__arguments.data_test is not None:
            self.__plot_metrics(status='post-test')
            self.__plot_confusion_matrix(self.__labels, self.__predictions)

    def __train(self):
        train_generator, val_generator = load_generator(self.__arguments.data,
                                                        self.__shape,
                                                        self.__arguments.test_size,
                                                        self.__arguments.image_color,
                                                        self.__arguments.batch_size)
        steps_per_epoch_training = self.__arguments.steps_per_epoch
        steps_per_epoch_validation = self.__arguments.steps_per_epoch
        start_time = time.time()
        time_callback = TimeHistory()
        print("---start training---")
        self.__model.fit(train_generator,
                         epochs=self.__arguments.epochs,
                         workers=multiprocessing.cpu_count() - 1,
                         verbose=TensorflowTrainer.VERBOSE,
                         steps_per_epoch=steps_per_epoch_training,
                         validation_data=val_generator,
                         validation_steps=steps_per_epoch_validation,
                         use_multiprocessing=True,
                         callbacks=[time_callback])
        print("---End training---")
        training_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
        self.__metrics['training_time'] = training_time
        if self.__cnvrg_env:
            self.__experiment.log_metric(key="Epoch Times",
                                         Ys=time_callback.times,
                                         Xs=[i for i in range(1, self.__arguments.epochs + 1)],
                                         x_axis="Epoch",
                                         y_axis="Time (Seconds)")

    def __test(self):
        if self.__arguments.data_test is None:
            return
        test_gen = load_generator(self.__arguments.data_test,
                                  self.__shape,
                                  image_color=self.__arguments.image_color,
                                  batch_size=self.__arguments.batch_size,
                                  generate_test_set=True)
        self.__predictions = np.argmax(self.__model.predict(test_gen), axis=1)
        self.__labels = test_gen.classes
        steps_per_epoch_testing = test_gen.n
        test_loss, test_acc = self.__model.evaluate_generator(test_gen,
                                                              workers=TensorflowTrainer.WORKERS,
                                                              verbose=TensorflowTrainer.VERBOSE,
                                                              steps=steps_per_epoch_testing)
        test_acc, test_loss = round(float(test_acc), 3), round(float(test_loss), 3)
        self.__metrics['test_acc'] = test_acc
        self.__metrics['test_loss'] = test_loss

    def __export_model(self):
        output_file_name = os.environ.get("CNVRG_WORKDIR") + "/" + self.__arguments.output_model \
            if os.environ.get("CNVRG_WORKDIR") is not None \
            else self.__arguments.output_model
        self.__model.save(output_file_name)
        export_labels_dictionary_from_classes_list(self.__classes)

    """ Cnvrg metrics output """

    def __plot_metrics(self, status='pre-training'):
        """
        :param training_status: (String) either 'pre' or 'post'.
        """
        if status == 'pre-training':
            print('Plotting pre-training metrics:')
            for k, v in self.__metrics.items():
                if k not in ['test_acc', 'test_loss']:
                    self.__experiment.log_param(k, v)
        elif status == 'post-test':
            print('Plotting post-test metrics:')
            for k, v in self.__metrics.items():
                if k in ['test_acc', 'test_loss']:
                    self.__experiment.log_param(k, v)
        else:
            raise ValueError('Unrecognized status.')

    def __plot_confusion_matrix(self, labels, predictions):
        """ Plots the confusion matrix. """
        confusion_mat_test = confusion_matrix(labels, predictions)  # array
        confusion_mat_test = TensorflowTrainer.__helper_plot_confusion_matrix(confusion_mat_test,
                                                                              mat_x_ticks=self.__classes,
                                                                              mat_y_ticks=self.__classes)
        self.__experiment.log_chart("confusion matrix", data=Heatmap(z=confusion_mat_test))

    @staticmethod
    def __helper_plot_confusion_matrix(confusion_matrix, mat_x_ticks=None, mat_y_ticks=None, digits_to_round=3):
        """
        :param confusion_matrix: the values in the matrix.
        :param mat_x_ticks, mat_y_ticks: ticks for the axis of the matrix.
        """
        output = []
        for y in range(len(confusion_matrix)):
            for x in range(len(confusion_matrix[y])):
                x_val = x if mat_x_ticks is None else mat_x_ticks[x]
                y_val = y if mat_y_ticks is None else mat_y_ticks[y]
                output.append((x_val, y_val, round(float(confusion_matrix[x][y]), digits_to_round)))
        return output
    batch_size=int(round(args.test_batch_size)),
    shuffle=True, **kwargs)

model = Net().to(device)

# Load checkpoint
if args.ckpf != '':
    if use_cuda:
        model.load_state_dict(torch.load(args.ckpf))
    else:
        # Load GPU model on CPU
        model.load_state_dict(torch.load(args.ckpf, map_location=lambda storage, loc: storage))

optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

try:
    e = Experiment()
except:
    e = Experiment.init()

def train(args, model, device, train_loader, optimizer, epoch):
    """Training"""
    model.train()
    tot_loss = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
class TensorflowTrainer:
    GRAYSCALE_CHANNELS = 1
    RGB_CHANNELS = 3
    VERBOSE = 1
    WORKERS = 3
    fully_connected_layers = [1024, 512, 256]
    METRICS = {
        'pre-training': [
            'TensorFlow version',
            'GPUs found',
            'Model',
            # 'Classes list'
        ],
        'post-training': [
            'training_time',
            # 'epochs_duration',
            # 'avg_time_per_epoch',
            # 'time_per_step'
        ],
        'post-test': ['test_acc', 'test_loss']
    }

    def __init__(self, arguments, model_name, base_model):
        self.__cnvrg_env = True
        self.__arguments = arguments
        self.__shape = (arguments.image_height, arguments.image_width)
        self.__classes = parse_classes(arguments.data)
        self.__channels = TensorflowTrainer.RGB_CHANNELS if arguments.image_color == 'rgb' \
            else TensorflowTrainer.GRAYSCALE_CHANNELS
        self.__model = ModelGenerator(
            base_model=base_model,
            num_of_classes=len(self.__classes),
            fully_connected_layers=TensorflowTrainer.fully_connected_layers,
            loss_function=arguments.loss,
            dropout=arguments.dropout,
            activation_hidden_layers=arguments.hidden_layer_activation,
            activation_output_layers=arguments.output_layer_activation,
            optimizer=arguments.optimizer).get_model()
        try:
            print("Trying to launch an experiment in cnvrg environment.")
            self.__experiment = Experiment()
        except Exception:
            print("Not in cnvrg environment.")
            self.__cnvrg_env = False
        self.__metrics = {
            'TensorFlow version': tf.__version__,
            'GPUs found': len(tf.config.experimental.list_physical_devices('GPU')),
            'Model': model_name,
            'Classes list': self.__classes
        }

    def run(self):
        self.__plot(status='pre-training')
        self.__train()
        self.__plot(status='post-training')
        self.__test()
        self.__plot(status='post-test')
        self.__export_model()

    def __plot(self, status):
        if status == 'pre-training':
            self.__plot_metrics(status='pre-training')
        elif status == 'post-training':
            self.__plot_metrics(status='post-training')
        elif status == 'post-test' and self.__arguments.data_test is not None:
            self.__plot_metrics(status='post-test')
            self.__plot_confusion_matrix(self.__labels, self.__predictions)

    def __train(self):
        train_generator, val_generator = load_generator(self.__arguments.data,
                                                        self.__shape,
                                                        self.__arguments.test_size,  # test_size = validation_split
                                                        self.__arguments.image_color,
                                                        self.__arguments.batch_size)
        start_time = time.time()
        time_callback = TimeHistory()
        print("--- Starts Training ---")
        from PIL import ImageFile
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        self.__model.fit(train_generator,
                         epochs=self.__arguments.epochs,
                         verbose=self.__arguments.verbose,
                         steps_per_epoch=self.__arguments.steps_per_epoch,
                         validation_data=val_generator if self.__arguments.test_size != 0. else None,
                         validation_steps=self.__arguments.steps_per_epoch if self.__arguments.test_size != 0. else None,
                         callbacks=[time_callback])
        print("--- Ends training ---")
        training_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
        self.__metrics['training_time'] = training_time
        self.__metrics['epochs_duration'] = Metric(key='Epochs Duration',
                                                   Ys=time_callback.times,
                                                   Xs='from_1',
                                                   x_axis='epochs',
                                                   y_axis='time (seconds)')
        self.__metrics['avg_time_per_epoch'] = round(sum(time_callback.times) / len(time_callback.times), 3)
        if self.__arguments.steps_per_epoch is not None:
            self.__metrics['time_per_step'] = Metric(
                key='Time per Step',
                Ys=[round(time_callback.times[i] / self.__arguments.steps_per_epoch, 3)
                    for i in range(self.__arguments.epochs)],
                Xs='from_1',
                x_axis='epochs',
                y_axis='time (ms)/step')

    def __test(self):
        if self.__arguments.data_test is None:
            return
        test_gen = load_generator(self.__arguments.data_test,
                                  self.__shape,
                                  image_color=self.__arguments.image_color,
                                  batch_size=self.__arguments.batch_size,
                                  generate_test_set=True)
        self.__predictions = np.argmax(self.__model.predict(test_gen), axis=1)
        self.__labels = test_gen.classes
        steps_per_epoch_testing = test_gen.n
        test_loss, test_acc = self.__model.evaluate_generator(test_gen,
                                                              workers=TensorflowTrainer.WORKERS,
                                                              verbose=TensorflowTrainer.VERBOSE,
                                                              steps=steps_per_epoch_testing)
        test_acc, test_loss = round(float(test_acc), 3), round(float(test_loss), 3)
        self.__metrics['test_acc'] = test_acc
        self.__metrics['test_loss'] = test_loss

    def __export_model(self):
        output_file_name = os.environ.get("CNVRG_WORKDIR") + "/" + self.__arguments.output_model \
            if os.environ.get("CNVRG_WORKDIR") is not None \
            else self.__arguments.output_model
        self.__model.save(output_file_name)
        export_labels_dictionary_from_classes_list(self.__classes)

    # ============ Helpers ============

    def __plot_metrics(self, status):
        metrics = TensorflowTrainer.METRICS[status]
        if status == 'pre-training':
            for metric in metrics:
                if self.__cnvrg_env:
                    if metric in self.__metrics.keys():  # if metric exists
                        self.__experiment.log_param(metric, self.__metrics[metric])
                else:
                    print("log_param - {key} : {value}".format(key=metric, value=self.__metrics[metric]))
        elif status == 'post-training':
            for metric in metrics:
                if metric in self.__metrics.keys():  # if metric exists
                    if not isinstance(self.__metrics[metric], Metric):  # param
                        if self.__cnvrg_env:
                            self.__experiment.log_param(metric, self.__metrics[metric])
                        else:
                            print("log_param - {key} : {value}".format(key=metric, value=self.__metrics[metric]))
                    else:  # metrics should be called here.
                        if self.__cnvrg_env:
                            self.__experiment.log_metric(key=self.__metrics[metric].key,
                                                         Ys=self.__metrics[metric].Ys,
                                                         Xs=self.__metrics[metric].Xs,
                                                         x_axis=self.__metrics[metric].x_axis,
                                                         y_axis=self.__metrics[metric].y_axis)
                        else:
                            print(self.__metrics[metric])
        elif status == 'post-test':
            for metric in metrics:
                if metric in self.__metrics.keys():  # if metric exists
                    if self.__cnvrg_env:
                        self.__experiment.log_param(metric, self.__metrics[metric])
                    else:
                        print("log_param - {key} : {value}".format(key=metric, value=self.__metrics[metric]))
        else:
            raise ValueError('Unrecognized status.')

    def __plot_confusion_matrix(self, labels, predictions):
        """ Plots the confusion matrix. """
        confusion_mat_test = confusion_matrix(labels, predictions)  # array
        confusion_mat_test = TensorflowTrainer.__helper_plot_confusion_matrix(confusion_mat_test,
                                                                              mat_x_ticks=self.__classes,
                                                                              mat_y_ticks=self.__classes)
        self.__experiment.log_chart("confusion matrix", data=Heatmap(z=confusion_mat_test))

    @staticmethod
    def __helper_plot_confusion_matrix(confusion_matrix, mat_x_ticks=None, mat_y_ticks=None, digits_to_round=3):
        """
        :param confusion_matrix: the values in the matrix.
        :param mat_x_ticks, mat_y_ticks: ticks for the axis of the matrix.
        """
        output = []
        for y in range(len(confusion_matrix)):
            for x in range(len(confusion_matrix[y])):
                x_val = x if mat_x_ticks is None else mat_x_ticks[x]
                y_val = y if mat_y_ticks is None else mat_y_ticks[y]
                output.append((x_val, y_val, round(float(confusion_matrix[x][y]), digits_to_round)))
        return output
def train_with_cross_validation(model, train_set, test_set, folds, project_dir, output_model_name):
    train_acc, train_loss = [], []
    kf = KFold(n_splits=folds)
    X, y = train_set

    # --- Training.
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        model.fit(X_train, y_train)
        model.n_estimators += 1
        y_hat = model.predict(X_val)  # y_hat is a.k.a y_pred
        acc = accuracy_score(y_val, y_hat)
        loss = mean_squared_error(y_val, y_hat)
        train_acc.append(acc)
        train_loss.append(loss)

    # --- Testing.
    X_test, y_test = test_set
    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)
    test_loss = mean_squared_error(y_test, y_pred)

    exp = Experiment()
    exp.log_param("model", output_model_name)
    exp.log_param("folds", folds)
    exp.log_metric("train_acc", train_acc)
    exp.log_metric("train_loss", train_loss)
    exp.log_param("test_acc", test_acc)
    exp.log_param("test_loss", test_loss)

    # Save model.
    output_file_name = project_dir + "/" + output_model_name if project_dir is not None else output_model_name
    pickle.dump(model, open(output_file_name, 'wb'))