def generate_categories():
    """Run the defects-summarizer pipeline end to end and log it to Comet.

    Steps, in order:
      1. Parse the run arguments and load the JSON configuration file.
      2. Open a Comet ``Experiment`` named after the configuration.
      3. Load the train/test data, preprocess the training data and train
         the CorEx model, then generate its topics.
      4. Extract the top documents, write them to CSV and save the model.
      5. Preprocess those documents for TextRank, then train and evaluate
         the summarizer on the test data.
      6. Log precision, recall and F1 plus the saved model to Comet.

    Raises:
        SystemExit: with status 1 when ``get_args``/``process_config`` raise
            ``ValueError`` (missing or invalid arguments).
    """
    import sys  # function-scope import: the file's import block is not visible here

    # capture the config path from the run arguments
    # then process the json configuration file
    try:
        args = get_args()
        config = process_config(args.config)
    except ValueError:
        print("Missing or invalid arguments")
        # Exit non-zero so shells and schedulers can detect the failure.
        # sys.exit is used because the bare ``exit`` helper is only installed
        # by the ``site`` module and is not guaranteed to exist.
        sys.exit(1)

    experiment_cfg = config.experiment
    summarizer_cfg = config.defects_summarizer

    print("Logging experiment name: {name}".format(
        name=experiment_cfg.experiment_name))
    experiment = Experiment(api_key=experiment_cfg.api_key,
                            project_name=experiment_cfg.project_name,
                            workspace=experiment_cfg.workspace)
    experiment.set_name(experiment_cfg.experiment_name)

    print('Creating the data loader...')
    data_loader = DataLoader(summarizer_cfg.paths)
    train_data, test_data = data_loader.get_data()

    print('Creating the Preprocessor...')
    preprocessor = CorexPreprocessor(train_data, config)
    preprocessor.prepare_data()

    print('Loading and evaluating the Model...')
    model = CorexModel(summarizer_cfg, preprocessor, seed=False)
    trainer = CorexTrainer(model, preprocessor.get_data())
    trainer.train()
    trainer.generate_topics()
    top_docs_df = trainer.get_top_documents(
        summarizer_cfg.evaluate.extract_topics,
        preprocessor.get_raw_corpus(),
        summarizer_cfg.evaluate.extraction_quantile,
        labels=True)
    top_docs_df.to_csv(summarizer_cfg.paths.save_data_path)

    print('Saving the trained topic model...')
    model.save()

    print('Preprocessing the summarizer...')
    summary_preprocessor = TextRankPreprocessor(
        top_docs_df, n_docs=summarizer_cfg.evaluate.n_docs)
    summary_preprocessor.prepare_data()

    print('Loading and evaluating the summarizer...')
    summary_model = TextRankModel(config)
    summary_trainer = TextRankTrainer(summary_model, summary_preprocessor)
    avg_prec, avg_recall, avg_f1 = summary_trainer.train_and_evaluate(
        test_data)

    # Log the rest of the experiment
    metrics = {"precision": avg_prec, "recall": avg_recall, "f1": avg_f1}
    experiment.log_metrics(metrics)
    # NOTE(review): every other path in this function comes from
    # config.defects_summarizer, but the uploaded model path is read from
    # config.labels_generator -- confirm this is not a copy-paste slip.
    experiment.log_model(
        name=experiment_cfg.model_name,
        file_or_folder=config.labels_generator.paths.save_model_path)
def generate_topics():
    """Train and evaluate the CorEx labels-generator model and log it to Comet.

    Steps, in order:
      1. Parse the run arguments and load the JSON configuration file.
      2. Open a Comet ``Experiment`` and log the model parameters found in
         ``config.labels_generator.model``.
      3. Load and preprocess the data, then train the CorEx model.
      4. Evaluate the model (per-topic and average coherence), generate the
         topics and print the scores.
      5. Save the model and log the average coherence and the saved model
         to Comet.

    Raises:
        SystemExit: with status 1 when ``get_args``/``process_config`` raise
            ``ValueError`` (missing or invalid arguments).
    """
    import sys  # function-scope import: the file's import block is not visible here

    # capture the config path from the run arguments
    # then process the json configuration file
    try:
        args = get_args()
        config = process_config(args.config)
    except ValueError:
        print("Missing or invalid arguments")
        # Exit non-zero so shells and schedulers can detect the failure.
        # sys.exit is used because the bare ``exit`` helper is only installed
        # by the ``site`` module and is not guaranteed to exist.
        sys.exit(1)

    experiment_cfg = config.experiment
    generator_cfg = config.labels_generator

    print("Logging experiment name: {name}".format(
        name=experiment_cfg.experiment_name))
    experiment = Experiment(api_key=experiment_cfg.api_key,
                            project_name=experiment_cfg.project_name,
                            workspace=experiment_cfg.workspace)
    experiment.set_name(experiment_cfg.experiment_name)
    params = generator_cfg.model
    experiment.log_parameters(params)

    print('Creating the data loader...')
    data_loader = DataLoader(generator_cfg.paths)
    data = data_loader.get_data()

    print('Creating the Preprocessor...')
    preprocessor = CorexPreprocessor(data, config)
    preprocessor.prepare_data()

    print('Creating and training the Model...')
    model = CorexModel(config, preprocessor)
    trainer = CorexTrainer(model, preprocessor.get_data())
    trainer.train()

    print('Evaluating the model...')
    coherence_lst, avg_coherence = trainer.evaluate(preprocessor.get_data(),
                                                    preprocessor.get_corpus())
    trainer.generate_topics()
    print("Coherence score: {score_lst} \nAvg coherence score: {avg_score}".
          format(score_lst=coherence_lst, avg_score=avg_coherence))

    print('Saving the trained model...')
    model.save()

    # Log the rest of the experiment
    metrics = {"coherence": avg_coherence}
    experiment.log_metrics(metrics)
    experiment.log_model(
        name=experiment_cfg.model_name,
        file_or_folder=generator_cfg.paths.save_model_path)
if args.Xlrnrate: args.lrnrate = .2 if args.Xoptimizer: args.optimizer = 'sgd' if args.Xnet: args.net = 'ConvNet' args.lrnrate = .05 print(args) if rank == 0: experiment = Experiment(project_name='metapoison-victim', auto_param_logging=False, auto_metric_logging=False) experiment.log_parameters(vars(args)) experiment.log_parameter('nmeta', nmeta) experiment.set_name(args.key) experiment.add_tag(args.tag) experiment.log_model("CIFAR10", "../models/victim_model_1_run") # args.gpu = set_available_gpus(args) # again, hardcode the number of the GPU to avoid the error args.gpu = [1] if args.name == '': args.name = args.net def victim(): def comet_pull_next_poison(): # grab next poison from comet that hasn't been processed impatience = 0 # while not has_exitflag(args.key, api) or impatience < 5: # patience before ending victim process # Get rid off has_exitflag condition and also only keep the impagtience condition while impatience < 5: # patience before ending victim process sleep(1)
# Report how training ended; Comet calls run only when tracking is enabled.
if args.comet_track:
    experiment.log_metric("best_valid_performance", best_valid_performance)
print('Stopped at epoch {}.\n'.format(epoch+1))
if args.comet_track:
    experiment.log_metric("last_epoch", epoch)

# Wall-clock duration of the learning phase, printed as minutes and seconds.
elapsed_learn = time.time() - start_learn
learn_min_sec = divmod(elapsed_learn, 60)
print('Learning took {0[0]:.0f} min {0[1]:.0f} secs. \n'.format(learn_min_sec))

# load best model during training
best_model_path = 'models/'+model_name+'_bestmodel.pt'
model.load_state_dict(torch.load(best_model_path))
model.eval()
if args.comet_track:
    experiment.log_model('best_model', best_model_path)

# check how well it does on test set
with experiment.test():
    test_loss, preds, golds = test(test_loader, model, my_device,
                                   label_pad_id, seq2seq, dataset='test ')
    test_acc, test_f1 = evaluate_nn(golds, preds, id2label)
    if args.comet_track:
        test_metrics = (("nl_loss", test_loss),
                        ("acc", test_acc),
                        ("f1", test_f1))
        for metric_name, metric_value in test_metrics:
            experiment.log_metric(metric_name, metric_value)

############################ EVALUATING INCREMENTALITY #######################

if not args.only_training:
    print('Incremental processing evaluation started.')
def run_experiment(output_path, _config, exp: Experiment):
    """Train a temporal pose model with an orientation normalizer and store the results.

    The function logs the flattened configuration to ``exp``, builds the
    training and test datasets selected by ``_config["train_data"]``, attaches
    the 2D/3D preprocessing transforms plus the orientation normalizer,
    trains the model with ``torch_train`` and finally writes the config,
    preprocessing parameters, model summary, model weights and test results
    under ``output_path``.

    Parameters
    ----------
    output_path : str
        Directory that receives every file written by this run.
    _config : mapping
        Run configuration; the keys read here are ``train_data``,
        ``pose2d_type``, ``pose3d_scaling``, ``cap_25fps``, ``stride``,
        ``simple_aug``, ``orient_norm``, ``preprocess_2d``, ``preprocess_3d``,
        ``model``, ``batch_size``, ``train_time_flip``, ``test_time_flip``,
        ``shuffle`` and ``ordered_batch``.
    exp : Experiment
        Experiment object used for parameter and model logging.

    Raises
    ------
    ValueError
        If ``_config["train_data"]`` names no known dataset, or if
        ``_config["orient_norm"]`` is not ``"gauss"``.
    """
    exp.log_parameters(flatten_params(_config))
    # Make sure the directory exists before the first file is written into it.
    ensuredir(output_path)
    save(os.path.join(output_path, "config.json"), _config)

    train_name = _config["train_data"]
    if train_name == "mpii_train":
        print("Training data is mpii-train")
        train_data = Mpi3dTrainDataset(
            _config["pose2d_type"],
            _config["pose3d_scaling"],
            _config["cap_25fps"],
            _config["stride"],
        )
    elif train_name == "mpii+muco":
        print("Training data is mpii-train and muco_temp concatenated")
        mpi_data = Mpi3dTrainDataset(
            _config["pose2d_type"],
            _config["pose3d_scaling"],
            _config["cap_25fps"],
            _config["stride"],
        )
        muco_data = PersonStackedMucoTempDataset(_config["pose2d_type"],
                                                 _config["pose3d_scaling"])
        train_data = ConcatPoseDataset(mpi_data, muco_data)
    elif train_name.startswith("muco_temp"):
        train_data = PersonStackedMucoTempDataset(_config["pose2d_type"],
                                                  _config["pose3d_scaling"])
    else:
        # Fail here with a clear message instead of an unbound-variable
        # error further down.
        raise ValueError("Unknown train_data: {!r}".format(train_name))

    test_data = Mpi3dTestDataset(_config["pose2d_type"],
                                 _config["pose3d_scaling"],
                                 eval_frames_only=True)

    if _config["simple_aug"]:
        train_data.augment(False)

    # Explicit check (not an assert) so it still runs under ``python -O``.
    if _config["orient_norm"] != "gauss":
        raise ValueError(
            'orient_norm must be "gauss", got {!r}'.format(_config["orient_norm"]))
    normalizer_orient = MeanNormalizeOrient(train_data)

    # Load the preprocessing steps
    train_data.transform = None
    transforms_train = [
        decode_trfrm(_config["preprocess_2d"], globals())(train_data, cache=False),
        decode_trfrm(_config["preprocess_3d"], globals())(train_data, cache=False),
        normalizer_orient,
    ]

    # The test transforms reuse the normalizers created for the training data.
    normalizer2d = transforms_train[0].normalizer
    normalizer3d = transforms_train[1].normalizer

    transforms_test = [
        decode_trfrm(_config["preprocess_2d"], globals())(test_data, normalizer2d),
        decode_trfrm(_config["preprocess_3d"], globals())(test_data, normalizer3d),
        normalizer_orient,
    ]

    transforms_train.append(RemoveIndex())
    transforms_test.append(RemoveIndex())

    train_data.transform = SaveableCompose(transforms_train)
    test_data.transform = SaveableCompose(transforms_test)

    # save normalisation params
    save(os.path.join(output_path, "preprocess_params.pkl"),
         train_data.transform.state_dict())

    len_train = len(train_data)
    len_test = len(test_data)
    print("Length of training data:", len_train)
    print("Length of test data:", len_test)
    exp.log_parameter("train data length", len_train)
    exp.log_parameter("test data length", len_test)

    # One transformed sample is enough to read the input/output sizes.
    sample = train_data[[0]]
    bos = sample["orientation"].shape
    if _config["model"]["loss"] == "orient":
        out_shape = bos[1] * bos[2]
    else:
        out_shape = MuPoTSJoints.NUM_JOINTS * 3
    in_features = sample["pose2d"].shape[-1]

    model_cfg = _config["model"]
    model_kwargs = dict(
        dropout=model_cfg["dropout"],
        channels=model_cfg["channels"],
        layernorm=model_cfg["layernorm"],
    )
    model = TemporalModelOptimized1f(in_features, out_shape,
                                     model_cfg["filter_widths"], **model_kwargs)
    test_model = TemporalModel(in_features, out_shape,
                               model_cfg["filter_widths"], **model_kwargs)

    model.cuda()
    test_model.cuda()

    save(os.path.join(output_path, "model_summary.txt"), str(model))

    pad = (model.receptive_field() - 1) // 2
    train_loader = ChunkedGenerator(
        train_data,
        _config["batch_size"],
        pad,
        _config["train_time_flip"],
        shuffle=_config["shuffle"],
        ordered_batch=_config["ordered_batch"],
    )
    tester = ModelCopyTemporalEvaluator(
        test_model,
        test_data,
        model_cfg["loss"],
        _config["test_time_flip"],
        post_process3d=get_postprocessor(_config, test_data, normalizer3d),
        prefix="test",
        orient_norm=_config["orient_norm"],
        normalizer_orient=normalizer_orient,
    )

    # The loss is called without 2D/3D normalizer statistics (three None
    # arguments); only the orientation normalizer is forwarded.
    torch_train(
        exp,
        train_loader,
        model,
        lambda m, b: calc_loss(m, b, _config, None, None, None, normalizer_orient),
        _config,
        callbacks=[tester],
    )

    model_path = os.path.join(output_path, "model_params.pkl")
    torch.save(model.state_dict(), model_path)
    exp.log_model("model", model_path)

    save(
        os.path.join(output_path, "test_results.pkl"),
        {
            "index": test_data.index,
            "pred": preds_from_logger(test_data, tester),
            "pose3d": test_data.poses3d,
        },
    )
def run_experiment(output_path, _config, exp: Experiment):
    """Continue training a previously saved temporal pose model.

    The model named by ``_config["weights"]`` is loaded together with its
    stored configuration; the stored ``model`` section is updated with the
    values from ``_config["model"]`` and written back into ``_config``. The
    function then builds the datasets and preprocessing transforms, copies
    the loaded weights into a fresh ``TemporalModelOptimized1f``, trains it
    with ``torch_train`` and writes the config, preprocessing parameters,
    model summary and model weights under ``output_path``.

    Parameters
    ----------
    output_path : str
        Directory that receives every file written by this run.
    _config : mapping
        Run configuration. ``_config["model"]`` is replaced in place by the
        merged model configuration.
    exp : Experiment
        Experiment object used for parameter and model logging.

    Raises
    ------
    ValueError
        If ``_config["train_data"]`` names no known dataset.
    """
    config, pretrained_model = eval.load_model(_config["weights"])
    # Values given for this run override those stored with the weights.
    config["model"].update(_config["model"])
    _config["model"] = config["model"]

    exp.log_parameters(train.flatten_params(_config))
    # Make sure the directory exists before the first file is written into it.
    ensuredir(output_path)
    save(os.path.join(output_path, "config.json"), _config)

    train_name = _config["train_data"]
    if train_name == "mpii_train":
        print("Training data is mpii-train")
        train_data = Mpi3dTrainDataset(
            _config["pose2d_type"],
            _config["pose3d_scaling"],
            _config["cap_25fps"],
            _config["stride"],
        )
    elif train_name == "mpii+muco":
        print("Training data is mpii-train and muco_temp concatenated")
        mpi_data = Mpi3dTrainDataset(
            _config["pose2d_type"],
            _config["pose3d_scaling"],
            _config["cap_25fps"],
            _config["stride"],
        )
        muco_data = PersonStackedMucoTempDataset(_config["pose2d_type"],
                                                 _config["pose3d_scaling"])
        train_data = ConcatPoseDataset(mpi_data, muco_data)
    elif train_name.startswith("muco_temp"):
        train_data = PersonStackedMucoTempDataset(_config["pose2d_type"],
                                                  _config["pose3d_scaling"])
    else:
        # Fail here with a clear message instead of an unbound-variable
        # error further down.
        raise ValueError("Unknown train_data: {!r}".format(train_name))

    test_data = Mpi3dTestDataset(_config["pose2d_type"],
                                 _config["pose3d_scaling"],
                                 eval_frames_only=True)

    if _config["simple_aug"]:
        train_data.augment(False)

    # Load the preprocessing steps
    params_path = os.path.join(LOG_PATH, _config["weights"], "preprocess_params.pkl")
    # NOTE(review): the loaded transform is never used below; the call is kept
    # because it may have side effects (e.g. failing early when the file is
    # missing) -- confirm and remove if it has none.
    SaveableCompose.from_file(params_path, test_data, globals())

    train_data.transform = None
    transforms_train = [
        decode_trfrm(_config["preprocess_2d"], globals())(train_data, cache=False),
        decode_trfrm(_config["preprocess_3d"], globals())(train_data, cache=False),
    ]

    # The test transforms reuse the normalizers created for the training data.
    normalizer2d = transforms_train[0].normalizer
    normalizer3d = transforms_train[1].normalizer

    transforms_test = [
        decode_trfrm(_config["preprocess_2d"], globals())(test_data, normalizer2d),
        decode_trfrm(_config["preprocess_3d"], globals())(test_data, normalizer3d),
    ]

    transforms_train.append(RemoveIndex())
    transforms_test.append(RemoveIndex())

    train_data.transform = SaveableCompose(transforms_train)
    test_data.transform = SaveableCompose(transforms_test)

    # save normalisation params
    save(os.path.join(output_path, "preprocess_params.pkl"),
         train_data.transform.state_dict())

    len_train = len(train_data)
    len_test = len(test_data)
    print("Length of training data:", len_train)
    print("Length of test data:", len_test)
    exp.log_parameter("train data length", len_train)
    exp.log_parameter("test data length", len_test)

    # One transformed sample is enough to read the input size.
    in_features = train_data[[0]]["pose2d"].shape[-1]
    model_cfg = config["model"]
    model_kwargs = dict(
        dropout=model_cfg["dropout"],
        channels=model_cfg["channels"],
        layernorm=model_cfg["layernorm"],
    )

    model = TemporalModelOptimized1f(in_features, MuPoTSJoints.NUM_JOINTS,
                                     model_cfg["filter_widths"], **model_kwargs)
    # Start from the weights of the loaded model.
    model.load_state_dict(pretrained_model.state_dict())
    test_model = TemporalModel(in_features, MuPoTSJoints.NUM_JOINTS,
                               model_cfg["filter_widths"], **model_kwargs)

    model.cuda()
    test_model.cuda()

    save(os.path.join(output_path, "model_summary.txt"), str(model))

    pad = (model.receptive_field() - 1) // 2
    train_loader = ChunkedGenerator(
        train_data,
        _config["batch_size"],
        pad,
        _config["train_time_flip"],
        shuffle=_config["shuffle"],
        ordered_batch=_config["ordered_batch"],
    )
    tester = ModelCopyTemporalEvaluator(
        test_model,
        test_data,
        model_cfg["loss"],
        _config["test_time_flip"],
        post_process3d=get_postprocessor(_config, test_data, normalizer3d),
        prefix="test",
    )

    # The normalizer statistics do not change during training, so the CUDA
    # tensors are built once here instead of on every loss evaluation.
    norm2d_mean = torch.tensor(normalizer2d.mean[2::3]).cuda()
    norm2d_std = torch.tensor(normalizer2d.std[2::3]).cuda()
    norm3d_std = torch.tensor(normalizer3d.std).cuda()

    torch_train(
        exp,
        train_loader,
        model,
        lambda m, b: train.calc_loss(m, b, _config, norm2d_mean, norm2d_std, norm3d_std),
        _config,
        callbacks=[tester],
    )

    model_path = os.path.join(output_path, "model_params.pkl")
    torch.save(model.state_dict(), model_path)
    exp.log_model("model", model_path)
class experiment_logger:
    '''
    Interface for logging experiments on neptune, comet, or both.

    Args: (log_backend, project_name)

    Other backends may also be added in the future.

    Currently defined methods:
        add_params: experiment parameters
        add_params_torch: per-layer parameter histograms (comet only)
        log_model_torch: torch model weights (comet only)
        add_tags:
        log_text: strings
        log_metrics: numerical values
        log_figure: pyplot figures
        log_image: images (comet only)
        log_hist3d: 3D histograms (comet only)
        add_table / log_table: tabular data (comet only)
        stop: end logging and close connection
    '''

    def __init__(self, log_backend, project_name):
        '''
        Parameters
        ----------
        log_backend : STR
            One of 'comet', 'neptune', 'all'
        project_name : STR
            one of the available projects
            ('yeast', 'jersey', 'wheat', 'debug', etc)

        Raises
        ------
        ValueError
            If log_backend enables neither neptune nor comet.

        Returns
        -------
        None.

        '''
        self.proj_name = project_name
        self.backend = log_backend
        # Bool indicating whether neptune logging is enabled
        self.neptune = log_backend in ('neptune', 'all')
        # Bool indicating whether comet logging is enabled
        self.comet = log_backend in ('comet', 'all')

        # Reject an unknown backend before any connection is opened.
        if not (self.neptune or self.comet):
            raise ValueError('Logging Backend NOT Available')

        # SECURITY NOTE(review): the credentials below are hard-coded in the
        # source. They are kept unchanged so existing callers keep working,
        # but they should be rotated and loaded from the environment or a
        # secrets store.
        if self.neptune:
            neptune_token = 'eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vdWkubmVwdHVuZS5haSIsImFwaV91cmwiOiJodHRwczovL3VpLm5lcHR1bmUuYWkiLCJhcGlfa2V5IjoiMWYzMzhjMjItYjczNC00NzZhLWFlZTYtOTI2NzE5MzUwZmNkIn0='
            if fing:
                # When the module-level ``fing`` flag is set, traffic goes
                # through the fing.edu.uy HTTP proxy.
                neptune.init(
                    "dna-i/" + project_name,
                    api_token=neptune_token,
                    proxies={
                        'http': "http://httpproxy.fing.edu.uy:3128/",
                        'https': "http://httpproxy.fing.edu.uy:3128/",
                    })
            else:
                neptune.init("dna-i/" + project_name,
                             api_token=neptune_token)
            print("logging experiments on neptune project " + project_name)
            neptune.create_experiment()

        if self.comet:
            self.comet_experiment = Experiment(
                api_key="V0OXnWOi4KVNS4OkwLjdnxSgK",
                project_name=project_name,
                workspace="dna-i")
            print("logging experiments on comet project " + project_name)

    def add_params(self, params, step=None):
        '''
        Adds parameters to experiment log

        Parameters
        ----------
        params : Dict
            Key-Value pairs
        step : (OPT) INT
            On neptune it is stored as the 'step' property; on comet it is
            forwarded to log_parameters.

        Returns
        -------
        None.

        '''
        if self.neptune:
            for key, value in params.items():
                neptune.set_property(key, value)
            if step is not None:
                neptune.set_property('step', step)
        if self.comet:
            self.comet_experiment.log_parameters(params, step=step)

    def add_params_torch(self, torch_model, step=None):
        '''
        Logs torch model parameter histogram per layer

        Parameters
        ----------
        torch_model : torch nn.Module
        step : (OPT) INT

        Raises
        ------
        NotImplementedError
            Whenever neptune logging is enabled (including backend 'all').

        Returns
        -------
        None.

        '''
        if self.neptune:
            raise NotImplementedError
        if self.comet:
            for name, param in torch_model.named_parameters():
                self.comet_experiment.log_histogram_3d(
                    param.detach().cpu().numpy().tolist(),
                    name=name,
                    step=step)

    def log_model_torch(self, model_name, torch_model, step=None):
        '''
        Logs torch model

        The state dict is written to ./model.pt, uploaded, and the file is
        removed afterwards.

        Parameters
        ----------
        model_name : STR
            Name under which the model is logged
        torch_model : torch nn.Module
        step : (OPT) INT
            Currently unused.

        Raises
        ------
        NotImplementedError
            Whenever neptune logging is enabled (including backend 'all').

        Returns
        -------
        None.

        '''
        if self.neptune:
            raise NotImplementedError
        if self.comet:
            wpath = "./model.pt"
            torch.save(torch_model.state_dict(), wpath)
            try:
                self.comet_experiment.log_model(model_name, wpath)
            finally:
                # Do not leave the temporary file behind if the upload fails.
                os.remove(wpath)

    def add_tags(self, tags):
        '''
        Adds tags to experiment log

        Parameters
        ----------
        tags : list of tags (strings)
            e.g.: ['tag1', 'tag2']

        Returns
        -------
        None.

        '''
        if self.neptune:
            neptune.append_tag(tags)
        if self.comet:
            self.comet_experiment.add_tags(tags)

    def log_metrics(self, name, value, epoch=None):
        '''
        Logging pointwise metrics

        Parameters
        ----------
        name : STR
            Metric key
        value : Float/Integer/(Boolean/String)
            Comet also allows Boolean/String.
            Tuples are allowed: the first two items are logged as
            name + " (r)" and name + " (p-val)".
        epoch: (OPT) INT
            Epoch - or anything used as x axis when plotting metrics

        Returns
        -------
        None.

        '''
        is_pair = isinstance(value, tuple)
        suffixes = [" (r)", " (p-val)"]

        if self.neptune:
            try:
                if is_pair:
                    print("Logging tuple as r and p-value")
                    for val, n in zip(value, suffixes):
                        if epoch is not None:
                            neptune.log_metric(name + n, epoch, y=val)
                        else:
                            neptune.log_metric(name + n, val)
                elif epoch is not None:
                    neptune.log_metric(name, epoch, y=value)
                else:
                    neptune.log_metric(name, value)
            except Exception:
                # Best effort: fall back to logging the value as text.
                print("Metric type {} not supported by neptune.".format(
                    type(value)))
                print("logging as text")
                self.log_text("{}".format(value), key=name)

        if self.comet:
            try:
                if is_pair:
                    print("Logging tuple as r and p-value")
                    for val, n in zip(value, suffixes):
                        if epoch is not None:
                            self.comet_experiment.log_metric(name + n,
                                                             val,
                                                             step=int(epoch))
                        else:
                            self.comet_experiment.log_metric(name + n, val)
                elif epoch is not None:
                    self.comet_experiment.log_metric(name, value, epoch=epoch)
                else:
                    self.comet_experiment.log_metric(name, value)
            except Exception:
                # Best effort: retry tuples item by item, anything else as
                # "other".
                print("Metric type {} not supported by comet.".format(
                    type(value)))
                if is_pair:
                    print("Logging tuple as x-y pairs")
                    for idx, val in enumerate(value):
                        self.comet_experiment.log_metric(name, val, epoch=idx)
                else:
                    print("Logging as other.")
                    self.comet_experiment.log_other(name, value)

    def log_text(self, string, key=None, epoch=None):
        '''
        Logs text strings

        Parameters
        ----------
        string : STR
            text to log
        key: STR
            log_name needed for Neptune strings; 'text' is used when omitted
        epoch: INT
            epoch or any other index

        Returns
        -------
        None.

        '''
        if self.neptune:
            if isinstance(string, str):
                # Use a separate name so the comet branch below still sees
                # the caller's original key.
                neptune_key = key
                if neptune_key is None:
                    print('Neptune log_name needed for logging text')
                    print('Using a dummy name: text')
                    neptune_key = 'text'
                if epoch is None:
                    neptune.log_text(neptune_key, string)
                else:
                    neptune.log_text(neptune_key, epoch, y=string)
            else:
                print("Wrong type: logging text must be a string")
        if self.comet:
            if isinstance(string, str):
                if key is not None:
                    print(
                        "Commet text logging does not support keys, prepending it to text"
                    )
                    string = key + ', ' + string
                if epoch is None:
                    self.comet_experiment.log_text(string)
                else:
                    self.comet_experiment.log_text(string, step=epoch)
            else:
                print("Wrong type: logging text must be a string")

    def log_figure(self, figure=None, figure_name=None, step=None):
        '''
        Logs pyplot figure

        Parameters
        ----------
        figure : pyplot figure, optional in comet mandatory in neptune.
            The default is None, uses global pyplot figure.
        figure_name : STR, optional in comet mandatory in neptune.
            The default is None; neptune then uses the name 'figure'.
        step : INT, optional
            An index. The default is None.

        Returns
        -------
        None.

        '''
        if self.neptune:
            if figure is not None:
                if figure_name is None:
                    print("Figure name must be given to neptune logger")
                    print("Using dummy name: figure")
                    figure_name = 'figure'
                if step is None:
                    neptune.log_image(figure_name, figure)
                else:
                    neptune.log_image(figure_name, step, y=figure)
            else:
                print("A figure must be passed to neptune logger")
        if self.comet:
            self.comet_experiment.log_figure(figure_name=figure_name,
                                             figure=figure,
                                             step=step)

    def stop(self):
        '''Ends logging on every enabled backend.'''
        if self.neptune:
            neptune.stop()
        if self.comet:
            self.comet_experiment.end()

    def add_table(self, filename, tabular_data=None, headers=False):
        '''
        Logs tabular data on comet

        Parameters
        ----------
        filename : STR
            Name of the logged table file
        tabular_data : optional
            Data forwarded to comet's log_table
        headers : optional
            Forwarded to comet's log_table

        Returns
        -------
        None.

        '''
        if self.comet:
            self.comet_experiment.log_table(filename, tabular_data, headers)
        else:
            # Only comet supports tables; previously this raised
            # AttributeError for a neptune-only logger.
            print("not implemented")

    def log_image(self, image=None, figure_name=None, step=None):
        '''
        Logs an image on comet

        Parameters
        ----------
        image : image data forwarded to comet's log_image
        figure_name : STR, optional
            Name of the logged image. The default is None.
        step : INT, optional
            An index. The default is None.

        Returns
        -------
        None.

        '''
        if self.comet:
            # Forward to the comet experiment; calling self.log_image here
            # would recurse with keywords this method does not accept.
            self.comet_experiment.log_image(image,
                                            name=figure_name,
                                            overwrite=False,
                                            image_format="png",
                                            image_scale=1.0,
                                            image_shape=None,
                                            image_colormap=None,
                                            image_minmax=None,
                                            image_channels="last",
                                            copy_to_tmp=True,
                                            step=step)
        else:
            print("not implemented")

    def log_hist3d(self, values=None, figure_name=None, step=None):
        '''
        Logs a 3D histogram on comet

        Parameters
        ----------
        values : values forwarded to comet's log_histogram_3d
        figure_name : STR, optional
            Name of the histogram. The default is None.
        step : INT, optional
            An index. The default is None.

        Returns
        -------
        None.

        '''
        if self.neptune:
            print("not implemented")
        if self.comet:
            self.comet_experiment.log_histogram_3d(values,
                                                   name=figure_name,
                                                   step=step)

    def log_table(self, name=None, data=None, headers=False):
        '''
        Logs tabular data on comet as <name>.csv

        Parameters
        ----------
        name : str
            Table name ('.csv' is appended)
        data : array, list
        headers : TYPE, optional
            whether to use headers

        Returns
        -------
        None.

        '''
        if self.comet:
            self.comet_experiment.log_table(name + '.csv',
                                            tabular_data=data,
                                            headers=headers)
        else:
            print("not implemented")
class eICU_Operator(TrainingOperator):
    """Training operator that trains a model on eICU data and reports to Comet ML.

    ``setup`` reads its settings from the ``config`` dict, ``validate``
    computes loss/accuracy/AUC over a validation iterator and ``train_epoch``
    runs one pass over the training iterator, validating and logging after
    every step.

    NOTE(review): ``self.model``, ``self.optimizer``, ``self.device``,
    ``self.use_gpu`` and ``self.validation_loader`` are read but never
    assigned here; presumably the base class provides them -- confirm.
    """

    def setup(self, config):
        """Read the run settings from ``config`` and prepare logging.

        Stores every setting as an attribute, collects the model's
        constructor arguments as hyperparameters, optionally creates the
        Comet ML experiment and registers gradient-clipping hooks.

        Parameters
        ----------
        config : dict
            Must contain ``comet_ml_api_key``, ``comet_ml_project_name`` and
            ``comet_ml_workspace``; every other key is optional.
        """
        # Number of RaySGD workers
        self.num_workers = config.get('num_workers', 1)
        # Fetch the Comet ML credentials
        self.comet_ml_api_key = config['comet_ml_api_key']
        self.comet_ml_project_name = config['comet_ml_project_name']
        self.comet_ml_workspace = config['comet_ml_workspace']
        self.log_comet_ml = config.get('log_comet_ml', True)
        self.comet_ml_save_model = config.get('comet_ml_save_model', True)
        # Fetch model and dataset parameters
        # Model class
        self.model_class = config.get('model', 'VanillaRNN')
        # The mode in which we'll use the data, either one hot encoded or pre-embedded
        self.dataset_mode = config.get('dataset_mode', 'one hot encoded')
        # The core machine learning type we'll use; either traditional ML or DL
        self.ml_core = config.get('ml_core', 'deep learning')
        # Indicates if we'll use time variation info
        self.use_delta_ts = config.get('use_delta_ts', False)
        # Number of hours on which we want to predict mortality
        self.time_window_h = config.get('time_window_h', 48)
        # Additional properties and relevant training information
        # Number of iteration steps done so far
        self.step = 0
        # Steps interval where the metrics are printed
        self.print_every = config.get('print_every', 10)
        # Start with an infinitely big minimum validation loss
        self.val_loss_min = np.inf
        # Gradient clipping value, to avoid exploding gradients
        self.clip_value = config.get('clip_value', 0.5)
        # Names of the features being used in the current pipeline
        self.features_list = config.get('features_list', None)
        # Type of model to train
        self.model_type = config.get('model_type', 'multivariate_rnn')
        # Value to use in the padding, to fill the sequences
        self.padding_value = config.get('padding_value', 999999)
        # List of indices of columns to remove from the features before feeding to the model
        self.cols_to_remove = config.get('cols_to_remove', [0, 1])
        # Specifies if the model being used is a custom built one
        self.is_custom = config.get('is_custom', False)
        # Indicates if the categorical features are already embedded when fetching a batch
        self.already_embedded = config.get('already_embedded', False)
        # The number of samples used in each training, validation or test iteration
        self.batch_size = config.get('batch_size', 32)
        # Number of epochs, i.e. the number of times to iterate through all of the training data
        self.n_epochs = config.get('n_epochs', 1)
        # Learning rate
        self.lr = config.get('lr', 0.001)
        # Path to the directory where the models are stored
        self.models_path = config.get('models_path', '')
        # Sets if a progress bar is shown for each training and validation loop
        self.see_progress = config.get('see_progress', True)
        # Register all the hyperparameters
        if self.num_workers == 1:
            model = self.model
        else:
            # Get the original model, as the current one is wrapped in DistributedDataParallel
            model = self.model.module
        model_args = inspect.getfullargspec(model.__init__).args[1:]
        self.hyper_params = {param: getattr(model, param)
                             for param in model_args}
        self.hyper_params.update({
            'batch_size': self.batch_size,
            'n_epochs': self.n_epochs,
            'learning_rate': self.lr
        })
        if self.log_comet_ml is True:
            # Create a new Comet.ml experiment
            self.experiment = Experiment(
                api_key=self.comet_ml_api_key,
                project_name=self.comet_ml_project_name,
                workspace=self.comet_ml_workspace,
                auto_param_logging=False,
                auto_metric_logging=False,
                auto_output_logging=False)
            self.experiment.log_other('completed', False)
            self.experiment.log_other('random_seed', du.random_seed)
            # Report hyperparameters to Comet.ml
            self.experiment.log_parameters(self.hyper_params)
            # Never upload the API key itself as an experiment parameter.
            loggable_config = {key: value
                               for key, value in config.items()
                               if key != 'comet_ml_api_key'}
            self.experiment.log_parameters(loggable_config)
            if self.features_list is not None:
                # Log the names of the features being used
                self.experiment.log_other('features_list', self.features_list)
        if self.clip_value is not None:
            # Set gradient clipping to avoid exploding gradients
            for p in self.model.parameters():
                p.register_hook(lambda grad: torch.clamp(
                    grad, -self.clip_value, self.clip_value))

    def set_model_filename(self, val_loss):
        """Build the checkpoint file name for the current model.

        The name has the form
        ``<val_loss>_valloss_<model>[_<dataset mode>][_delta_ts]_<timestamp>.pth``.

        Parameters
        ----------
        val_loss : float
            Validation loss, formatted with four decimals.

        Returns
        -------
        str
            The file name (no directory component).

        Raises
        ------
        ValueError
            If ``self.model_class`` is not one of the supported model names.
        """
        # Start with the model class name
        base_names = {
            'VanillaRNN': 'rnn',
            'VanillaLSTM': 'lstm',
            'TLSTM': 'tlstm',
            'MF1LSTM': 'mf1lstm',
            'MF2LSTM': 'mf2lstm',
        }
        if self.model_class not in base_names:
            raise ValueError(
                f'ERROR: {self.model_class} is an invalid model type. Please specify either "VanillaRNN", "VanillaLSTM", "TLSTM", "MF1LSTM" or "MF2LSTM".'
            )
        model_filename = base_names[self.model_class]
        # Add dataset mode information
        mode_suffixes = {
            'pre-embedded': '_pre_embedded',
            'learn embedding': '_with_embedding',
            'one hot encoded': '_one_hot_encoded',
        }
        model_filename = model_filename + mode_suffixes.get(self.dataset_mode, '')
        # Use of time variation information
        if self.use_delta_ts is not False and self.model_class in ('VanillaRNN', 'VanillaLSTM'):
            model_filename = model_filename + '_delta_ts'
        # Add the validation loss and timestamp
        current_datetime = datetime.now().strftime('%d_%m_%Y_%H_%M')
        model_filename = f'{val_loss:.4f}_valloss_{model_filename}_{current_datetime}.pth'
        return model_filename

    @override(TrainingOperator)
    def validate(self, val_iterator, info):
        """Evaluate the model on ``val_iterator``.

        Parameters
        ----------
        val_iterator : iterable of (features, labels) batches
            Must support ``len()``.
        info : dict
            ``info['step']`` is only used in warning messages.

        Returns
        -------
        dict
            ``val_loss``, ``val_acc`` and ``val_auc`` averaged over the
            batches, plus ``val_auc_wgt`` when the model has more than one
            output.
        """
        # Number of iteration steps done so far
        step = info.get('step', 0)
        # Initialize the validation metrics
        val_loss = 0
        val_acc = 0
        val_auc = list()
        if self.num_workers == 1:
            model = self.model
        else:
            # Get the original model, as the current one is wrapped in DistributedDataParallel
            model = self.model.module
        if model.n_outputs > 1:
            val_auc_wgt = list()
        # Loop through the validation data
        for features, labels in du.utils.iterations_loop(
                val_iterator, see_progress=self.see_progress, desc='Val batches'):
            # Turn off gradients for validation, saves memory and computations
            with torch.no_grad():
                if self.is_custom is False:
                    # Find the original sequence lengths
                    seq_lengths = du.search_explore.find_seq_len(
                        labels, padding_value=self.padding_value)
                else:
                    # No need to find the sequence lengths now
                    seq_lengths = None
                if self.use_gpu is True:
                    # Move data to GPU
                    features, labels = features.to(self.device), labels.to(
                        self.device)
                # Do inference on the data
                if self.model_type.lower() == 'multivariate_rnn':
                    (pred, correct_pred, scores, labels,
                     loss) = (du.deep_learning.inference_iter_multi_var_rnn(
                         self.model,
                         features,
                         labels,
                         padding_value=self.padding_value,
                         cols_to_remove=self.cols_to_remove,
                         is_train=False,
                         prob_output=True,
                         is_custom=self.is_custom,
                         already_embedded=self.already_embedded,
                         seq_lengths=seq_lengths,
                         distributed_train=(self.num_workers > 1)))
                elif self.model_type.lower() == 'mlp':
                    pred, correct_pred, scores, loss = (
                        du.deep_learning.inference_iter_mlp(
                            self.model,
                            features,
                            labels,
                            self.cols_to_remove,
                            is_train=False,
                            prob_output=True))
                else:
                    raise Exception(
                        f'ERROR: Invalid model type. It must be "multivariate_rnn" or "mlp", not {self.model_type}.'
                    )
                # Add the validation loss of the current batch
                val_loss += loss
                # Add the validation accuracy of the current batch, ignoring all padding values
                val_acc += torch.mean(correct_pred.type(torch.FloatTensor))
                if self.use_gpu is True:
                    # Move data to CPU for performance computations
                    scores, labels = scores.cpu(), labels.cpu()
                # Add the validation ROC AUC of the current batch
                if model.n_outputs == 1:
                    try:
                        val_auc.append(
                            roc_auc_score(labels.numpy(),
                                          scores.detach().numpy()))
                    except Exception as e:
                        warnings.warn(
                            f'Couldn\'t calculate the validation AUC on step {step}. Received exception "{str(e)}".'
                        )
                else:
                    # It might happen that not all labels are present in the current batch;
                    # as such, we must focus on the ones that appear in the batch
                    labels_in_batch = labels.unique().long()
                    try:
                        # Shared by the macro and the weighted AUC below
                        y_true = labels.numpy()
                        probs = softmax(scores[:, labels_in_batch],
                                        dim=1).detach().numpy()
                        class_ids = labels_in_batch.numpy()
                        val_auc.append(
                            roc_auc_score(y_true,
                                          probs,
                                          multi_class='ovr',
                                          average='macro',
                                          labels=class_ids))
                        # Also calculate a weighted version of the AUC; important for imbalanced dataset
                        val_auc_wgt.append(
                            roc_auc_score(y_true,
                                          probs,
                                          multi_class='ovr',
                                          average='weighted',
                                          labels=class_ids))
                    except Exception as e:
                        warnings.warn(
                            f'Couldn\'t calculate the validation AUC on step {step}. Received exception "{str(e)}".'
                        )
                # Remove the current features and labels from memory
                del features
                del labels
        # Calculate the average of the metrics over the batches
        val_loss = val_loss / len(val_iterator)
        val_acc = val_acc / len(val_iterator)
        val_auc = np.mean(val_auc)
        if model.n_outputs > 1:
            val_auc_wgt = np.mean(val_auc_wgt)
        # Return the validation metrics
        metrics = dict(val_loss=val_loss, val_acc=val_acc, val_auc=val_auc)
        if model.n_outputs > 1:
            metrics['val_auc_wgt'] = val_auc_wgt
        return metrics

    @override(TrainingOperator)
    def train_epoch(self, iterator, info):
        """Run one pass over ``iterator``, validating and logging after each step.

        After every training step the model is validated on
        ``self.validation_loader``; metrics are sent to Comet ML when enabled
        and the model is saved whenever the validation loss reaches a new
        minimum.

        NOTE(review): the end of this method lies outside the reviewed
        excerpt; nothing after the checkpoint upload has been checked.

        Parameters
        ----------
        iterator : iterable of (features, labels) batches
        info : dict
            ``epoch_idx`` and ``step`` are read; ``step`` is updated in place.
        """
        if self.num_workers == 1:
            model = self.model
        else:
            # Get the original model, as the current one is wrapped in DistributedDataParallel
            model = self.model.module
        # NOTE(review): vars(self) includes comet_ml_api_key, so this debug
        # output prints the credential -- consider removing these prints.
        print(f'DEBUG: TrainingOperator attributes:\n{vars(self)}')
        print(f'DEBUG: Model\'s attributes:\n{vars(model)}')
        # Register the current epoch
        epoch = info.get('epoch_idx', 0)
        # Number of iteration steps done so far
        step = info.get('step', 0)
        # Initialize the training metrics
        train_loss = 0
        train_acc = 0
        train_auc = list()
        if model.n_outputs > 1:
            train_auc_wgt = list()
        # try:
        # Loop through the training data
        for features, labels in du.utils.iterations_loop(
                iterator, see_progress=self.see_progress, desc='Steps'):
            # Activate dropout to train the model
            self.model.train()
            # Clear the gradients of all optimized variables
            self.optimizer.zero_grad()
            if self.is_custom is False:
                # Find the original sequence lengths
                seq_lengths = du.search_explore.find_seq_len(
                    labels, padding_value=self.padding_value)
            else:
                # No need to find the sequence lengths now
                seq_lengths = None
            if self.use_gpu is True:
                # Move data to GPU
                features, labels = features.to(self.device), labels.to(
                    self.device)
            # Do inference on the data
            if self.model_type.lower() == 'multivariate_rnn':
                (pred, correct_pred, scores, labels, step_train_loss) = (
                    du.deep_learning.inference_iter_multi_var_rnn(
                        self.model,
                        features,
                        labels,
                        padding_value=self.padding_value,
                        cols_to_remove=self.cols_to_remove,
                        is_train=True,
                        prob_output=True,
                        optimizer=self.optimizer,
                        is_custom=self.is_custom,
                        already_embedded=self.already_embedded,
                        seq_lengths=seq_lengths,
                        distributed_train=(self.num_workers > 1)))
            elif self.model_type.lower() == 'mlp':
                pred, correct_pred, scores, step_train_loss = (
                    du.deep_learning.inference_iter_mlp(
                        self.model,
                        features,
                        labels,
                        self.cols_to_remove,
                        is_train=True,
                        prob_output=True,
                        optimizer=self.optimizer))
            else:
                raise Exception(
                    f'ERROR: Invalid model type. It must be "multivariate_rnn" or "mlp", not {self.model_type}.'
                )
            # Add the training loss and accuracy of the current batch
            train_loss += step_train_loss
            step_train_acc = torch.mean(correct_pred.type(torch.FloatTensor))
            train_acc += step_train_acc
            if self.use_gpu is True:
                # Move data to CPU for performance computations
                scores, labels = scores.cpu(), labels.cpu()
            # Add the training ROC AUC of the current batch
            if model.n_outputs == 1:
                try:
                    step_train_auc = roc_auc_score(labels.numpy(),
                                                   scores.detach().numpy())
                    train_auc.append(step_train_auc)
                except Exception as e:
                    warnings.warn(
                        f'Couldn\'t calculate the training AUC on step {step}. Received exception "{str(e)}".'
                    )
                    step_train_auc = None
            else:
                # It might happen that not all labels are present in the current batch;
                # as such, we must focus on the ones that appear in the batch
                labels_in_batch = labels.unique().long()
                try:
                    # Shared by the macro and the weighted AUC below
                    y_true = labels.numpy()
                    probs = softmax(scores[:, labels_in_batch],
                                    dim=1).detach().numpy()
                    class_ids = labels_in_batch.numpy()
                    step_train_auc = roc_auc_score(y_true,
                                                   probs,
                                                   multi_class='ovr',
                                                   average='macro',
                                                   labels=class_ids)
                    train_auc.append(step_train_auc)
                    # Also calculate a weighted version of the AUC; important for imbalanced dataset
                    step_train_auc_wgt = roc_auc_score(y_true,
                                                       probs,
                                                       multi_class='ovr',
                                                       average='weighted',
                                                       labels=class_ids)
                    train_auc_wgt.append(step_train_auc_wgt)
                except Exception as e:
                    warnings.warn(
                        f'Couldn\'t calculate the training AUC on step {step}. Received exception "{str(e)}".'
                    )
                    step_train_auc = None
                    step_train_auc_wgt = None
            # Count one more iteration step
            step += 1
            info['step'] = step
            # Deactivate dropout to test the model
            self.model.eval()
            # Remove the current features and labels from memory
            del features
            del labels
            # Run the current model on the validation set
            val_metrics = self.validate(self.validation_loader, info)
            if self.log_comet_ml is True:
                # Upload the current step's metrics to Comet ML
                self.experiment.log_metric('train_loss', step_train_loss, step=step)
                self.experiment.log_metric('train_acc', step_train_acc, step=step)
                self.experiment.log_metric('train_auc', step_train_auc, step=step)
                self.experiment.log_metric('val_loss', val_metrics['val_loss'], step=step)
                self.experiment.log_metric('val_acc', val_metrics['val_acc'], step=step)
                self.experiment.log_metric('val_auc', val_metrics['val_auc'], step=step)
                if model.n_outputs > 1:
                    self.experiment.log_metric('train_auc_wgt', step_train_auc_wgt, step=step)
                    self.experiment.log_metric('val_auc_wgt', val_metrics['val_auc_wgt'], step=step)
            # Display validation loss
            if step % self.print_every == 0:
                print(
                    f'Epoch {epoch} step {step}: Validation loss: {val_metrics["val_loss"]}; Validation Accuracy: {val_metrics["val_acc"]}; Validation AUC: {val_metrics["val_auc"]}'
                )
            # Check if the performance obtained in the validation set is the best so far (lowest loss value)
            if val_metrics['val_loss'] < self.val_loss_min:
                print(
                    f'New minimum validation loss: {self.val_loss_min} -> {val_metrics["val_loss"]}.'
                )
                # Update the minimum validation loss
                self.val_loss_min = val_metrics['val_loss']
                # Filename and path where the model will be saved
                model_filename = self.set_model_filename(
                    val_metrics['val_loss'])
                print(f'Saving model in {model_filename}')
                # Save the best performing model so far, along with additional information to implement it.
                # Work on a copy so the weights are not stored inside self.hyper_params.
                checkpoint = dict(self.hyper_params)
                checkpoint['state_dict'] = self.model.state_dict()
                torch.save(checkpoint, model_filename)
                # [TODO] Check if this really works locally or if it just saves in the temporary nodes
                # self.save(checkpoint, f'{self.models_path}{model_filename}')
                if self.log_comet_ml is True and self.comet_ml_save_model is True:
                    # Upload the model to Comet.ml
                    self.experiment.log_model(name=model_filename,
                                              file_or_folder=model_filename,
                                              overwrite=True)
        # except Exception as e:
        #     warnings.warn(f'There was a problem doing training epoch {epoch}. Ending current epoch.
Original exception message: "{str(e)}"') # try: # Calculate the average of the metrics over the epoch train_loss = train_loss / len(iterator) train_acc = train_acc / len(iterator) train_auc = np.mean(train_auc) if model.n_outputs > 1: train_auc_wgt = np.mean(train_auc_wgt) # Remove attached gradients so as to be able to print the values train_loss, val_loss = train_loss.detach( ), val_metrics['val_loss'].detach() if self.use_gpu is True: # Move metrics data to CPU train_loss, val_loss = train_loss.cpu(), val_loss.cpu() if self.log_comet_ml is True: # Upload the current epoch's metrics to Comet ML self.experiment.log_metric('train_loss', train_loss, epoch=epoch) self.experiment.log_metric('train_acc', train_acc, epoch=epoch) self.experiment.log_metric('train_auc', train_auc, epoch=epoch) self.experiment.log_metric('val_loss', val_loss, epoch=epoch) self.experiment.log_metric('val_acc', val_metrics['val_acc'], epoch=epoch) self.experiment.log_metric('val_auc', val_metrics['val_auc'], epoch=epoch) self.experiment.log_epoch_end(epoch, epoch=step) if model.n_outputs > 1: self.experiment.log_metric('train_auc_wgt', train_auc_wgt, epoch=epoch) self.experiment.log_metric('val_auc_wgt', val_metrics['val_auc_wgt'], epoch=epoch) # Print a report of the epoch print( f'Epoch {epoch}: Training loss: {train_loss}; Training Accuracy: {train_acc}; Training AUC: {train_auc}; \ Validation loss: {val_loss}; Validation Accuracy: {val_metrics["val_acc"]}; Validation AUC: {val_metrics["val_auc"]}' ) print('----------------------') # except Exception as e: # warnings.warn(f'There was a problem printing metrics from epoch {epoch}. Original exception message: "{str(e)}"') # Return the training metrics metrics = dict(train_loss=train_loss, train_acc=train_acc, train_auc=train_auc) if model.n_outputs > 1: metrics['train_auc_wgt'] = train_auc_wgt return metrics