from datetime import date

import comet_ml


def main():
    today = date.today()
    x_train, y_train, x_test, y_test = get_dataset()

    # Bayesian search over a normally-distributed layer width and a discrete batch size
    config = {
        "algorithm": "bayes",
        "name": "Optimize MNIST Network",
        "spec": {"maxCombo": 10, "objective": "minimize", "metric": "loss"},
        "parameters": {
            "first_layer_units": {
                "type": "integer",
                "mu": 500,
                "sigma": 50,
                "scalingType": "normal",
            },
            "batch_size": {"type": "discrete", "values": [64, 128, 256]},
        },
        "trials": 1,
    }

    experiment_class = comet_ml.Experiment
    opt = comet_ml.Optimizer(config, experiment_class=experiment_class)
    experiment_kwargs = {}
    date_str = today.strftime("%Y_%m_%d")

    for experiment in opt.get_experiments(**experiment_kwargs):
        experiment.log_parameter("epochs", 3)
        experiment.log_html("<div> Some Html </div>")
        for i in range(2):
            experiment.log_html("<br>Code: " + str(i))
        # experiment.log_system_info("someKey", "someValue")
        # experiment.log_system_info("someSystemKey", "someSystemValue")
        experiment.add_tag(date_str)
        # experiment.log_text()
        experiment.log_image("/Users/daven/Downloads/greenSquare.jpg", "my-image", step=1)
        experiment.log_image("/Users/daven/Downloads/redSquare.jpg", "my-image", step=2)
        experiment.log_image("/Users/daven/Downloads/blueSquare.jpg", "my-image", step=3)
        experiment.log_image("/Users/daven/Downloads/yellowSquare.jpg", "my-image", step=4)

        model = build_model_graph(experiment)
        train(experiment, model, x_train, y_train, x_test, y_test)
        evaluate(experiment, model, x_test, y_test)
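# --- Illustrative sketch (an addition, not part of the original snippet): one way the
# build_model_graph() helper called above could consume the values sampled by the
# Optimizer. Both parameter names come from the config dict, and
# experiment.get_parameter() is the same accessor used by optimize_hyperparameters()
# further below; the "model" returned here is only a placeholder.
import numpy as np


def build_model_graph_sketch(experiment):
    # Read the values the Bayesian search sampled for this experiment
    first_layer_units = experiment.get_parameter("first_layer_units")
    batch_size = experiment.get_parameter("batch_size")
    experiment.log_parameters(
        {"first_layer_units": first_layer_units, "batch_size": batch_size}
    )
    # Placeholder "model": a weight matrix with the sampled layer width
    return {"weights": np.zeros((784, first_layer_units)), "batch_size": batch_size}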
from comet_ml import Experiment


def tune_hyperparams():
    # Legacy Comet Optimizer workflow: declare the search space in PCS format,
    # then pull one suggestion at a time and report its score back.
    api_key = "w7QuiECYXbNiOozveTpjc9uPg"
    optimizer = comet_ml.Optimizer(api_key)

    # Hyperparameters in PCS format: name, type, [min, max], [default]
    params = """
    x integer [1, 10] [10]
    """
    optimizer.set_params(params)

    # Loops until the optimizer stops producing suggestions
    while True:
        suggestion = optimizer.get_suggestion()
        experiment = Experiment(api_key, project_name="project1-ac2g",
                                workspace="ift6135")
        score = train(suggestion["x"])
        suggestion.report_score("accuracy", score)
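# --- Equivalent sketch using the config-based Optimizer API that the other snippets
# in this file rely on (an illustration, not part of the original function). The
# search range, project and workspace are reused from tune_hyperparams(); maxCombo
# and the metric name are assumptions.
def tune_hyperparams_config_api():
    config = {
        "algorithm": "bayes",
        "spec": {"maxCombo": 10, "objective": "maximize", "metric": "accuracy"},
        "parameters": {"x": {"type": "integer", "min": 1, "max": 10}},
    }
    opt = comet_ml.Optimizer(config)
    # get_experiments() forwards keyword arguments to the Experiment constructor
    for experiment in opt.get_experiments(project_name="project1-ac2g",
                                          workspace="ift6135"):
        score = train(experiment.get_parameter("x"))
        experiment.log_metric("accuracy", score)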
def optimize_hyperparameters(Model, comet_ml_api_key, comet_ml_project_name,
                             comet_ml_workspace, config_name=None,
                             config_dict=None, df=None, dataset=None,
                             train_dataloader=None, val_dataloader=None,
                             test_dataloader=None, n_inputs=None,
                             id_column=None, label_column=None,
                             inst_column=None, id_columns_idx=None,
                             n_outputs=1, Dataset=None,
                             model_type='multivariate_rnn', is_custom=False,
                             models_path='models/', model_name='checkpoint',
                             array_param=None,
                             metrics=['loss', 'accuracy', 'AUC'],
                             config_path='', var_seq=True, clip_value=0.5,
                             padding_value=999999, batch_size=32, n_epochs=10,
                             lr=0.001, test_train_ratio=0.2,
                             validation_ratio=0.1, comet_ml_save_model=True,
                             already_embedded=False, verbose=False,
                             see_progress=True, **kwargs):
    '''Optimize a machine learning model's hyperparameters, by training it
    several times while exploring different hyperparameter values, returning
    the best performing ones.

    Parameters
    ----------
    Model : torch.nn.Module or sklearn.* (any machine learning model)
        Class constructor for the desired machine learning model.
    comet_ml_api_key : string
        Comet.ml API key used when logging data to the platform.
    comet_ml_project_name : string
        Name of the Comet.ml project used when logging data to the platform.
    comet_ml_workspace : string
        Name of the Comet.ml workspace used when logging data to the platform.
    config_name : str, default None
        Name of the configuration file, containing information about the
        parameters to optimize. This data is organized in a YAML format, akin
        to a dictionary object, where the optimization algorithm is set, each
        hyperparameter gets a key with its name, followed by a list of values
        in the order of (minimum value to explore in the optimization,
        maximum value to explore in the optimization, initial value to use),
        and the metric to be optimized.
    config_dict : dict, default None
        Already loaded configuration file, containing information about the
        parameters to optimize. This data is organized in a YAML format, akin
        to a dictionary object, where the optimization algorithm is set, each
        hyperparameter gets a key with its name, followed by a list of values
        in the order of (minimum value to explore in the optimization,
        maximum value to explore in the optimization, initial value to use),
        and the metric to be optimized.
    df : pandas.DataFrame or dask.DataFrame, default None
        Dataframe containing all the data that will be used in the
        optimization's training processes.
    train_dataloader : torch.utils.data.DataLoader, default None
        Data loader which will be used to get data batches during training.
        If not specified, the method will create one automatically.
    val_dataloader : torch.utils.data.DataLoader, default None
        Data loader which will be used to get data batches when evaluating
        the model's performance on a validation set during training. If not
        specified, the method will create one automatically.
    test_dataloader : torch.utils.data.DataLoader, default None
        Data loader which will be used to get data batches when evaluating
        the model's performance on a test set, after finishing the training
        process. If not specified, the method will create one automatically.
    dataset : torch.utils.data.Dataset, default None
        Dataset object that contains the data used to train, validate and
        test the machine learning models. Even with the dataloaders already
        set, this argument is still needed if the data has variable sequence
        length and its dataset object loads files in each batch, instead of
        data from a single file. In essence, it's needed to give us the
        current batch's sequence length information, when we couldn't have
        known this for the whole data beforehand. If not specified, the
        method will create one automatically.
    n_inputs : int, default None
        Total number of input features present in the dataframe.
    id_column : str, default None
        Name of the column which corresponds to the subject identifier.
    label_column : str, default None
        Name of the column which corresponds to the label.
    inst_column : str, default None
        Name of the column which corresponds to the instance or timestamp
        identifier.
    id_columns_idx : int or list of ints, default None
        Index or list of indices of columns to remove from the features
        before feeding to the model. These tend to be the identifier columns,
        such as `subject_id` and `ts` (timestamp).
    n_outputs : int, default 1
        Total number of outputs given by the machine learning model.
    Dataset : torch.utils.data.Dataset, default None
        Class constructor for the dataset, which will be used for iterating
        through batches of data. It must be able to receive as inputs a
        PyTorch tensor and a Pandas or Dask dataframe.
    model_type : string, default 'multivariate_rnn'
        Sets the type of model to train. Important to know what type of
        inference to do. Currently available options are
        ['multivariate_rnn', 'mlp'].
    is_custom : bool, default False
        If set to True, the method will assume that the model being used is
        a custom built one, which won't require sequence length information
        during the feedforward process.
    models_path : string, default 'models/'
        Path where the model will be saved. By default, it saves in the
        directory named "models".
    model_name : string, default 'checkpoint'
        Name that will be given to the saved models. Validation loss and
        timestamp info will then be appended to the name.
    array_param : list of strings, default None
        List of feature names that might have multiple values associated to
        them. For example, in a neural network with multiple layers, there
        could be multiple `n_hidden` values, each one indicating the number
        of units in each hidden layer.
    metrics : list of strings, default ['loss', 'accuracy', 'AUC']
        List of metrics to be used to evaluate the model on the inferred
        data. Available metrics are cross entropy loss (`loss`), accuracy
        (`accuracy`), AUC (`AUC`), weighted AUC (`AUC_weighted`), precision
        (`precision`), recall (`recall`) and F1 (`F1`).
    config_path : str, default ''
        Path to the directory where the configuration file is stored.
    var_seq : bool, default True
        Specifies if the data has variable sequence length. Valuable
        information if the data must be adjusted by padding.
    clip_value : int or float, default 0.5
        Gradient clipping value, which limits the maximum change in the
        model parameters, so as to avoid exploding gradients.
    padding_value : numeric, default 999999
        Value to use in the padding, to fill the sequences.
    batch_size : int, default 32
        Defines the batch size, i.e. the number of samples used in each
        training iteration to update the model's weights.
    n_epochs : int, default 10
        Number of epochs, i.e. the number of times the training loop
        iterates through all of the training data.
    lr : float, default 0.001
        Learning rate used in the optimization algorithm.
    test_train_ratio : float, default 0.2
        Percentage of data to use for the test set.
    validation_ratio : float, default 0.1
        Percentage of training data to use for the validation set.
    comet_ml_save_model : bool, default True
        If set to True, uploads the model with the lowest validation loss to
        Comet.ml when logging data to the platform.
    already_embedded : bool, default False
        If set to True, it means that the categorical features are already
        embedded when fetching a batch, i.e. there's no need to run the
        embedding layer(s) during the model's feedforward.
    verbose : bool, default False
        If set to True, a set of metrics and status indicators will be
        printed throughout training.
    see_progress : bool, default True
        If set to True, a progress bar will show up indicating the execution
        of each loop.
    kwargs : dict
        Optional additional parameters, specific to the machine learning
        model being used.

    Returns
    -------
    val_loss_min : float
        Minimum validation loss over all of the optimization process.
    exp_name_min : str
        Name of the Comet.ml experiment with the overall minimum validation
        loss.

    [TODO] Write a small tutorial on how to write the YAML configuration
    file, based on this:
    https://www.comet.ml/docs/python-sdk/introduction-optimizer/
    '''
    # Make sure that all the required Comet.ml parameters are specified
    if (comet_ml_api_key is None or comet_ml_project_name is None
            or comet_ml_workspace is None):
        raise Exception(
            'ERROR: All necessary Comet.ml parameters (comet_ml_api_key, '
            'comet_ml_project_name, comet_ml_workspace) must be correctly '
            'specified. Otherwise, the parameter optimization won\'t work.')
    if config_dict is None:
        # Load the hyperparameter optimization configuration file into a dictionary
        with open(f'{config_path}{config_name}', 'r') as config_file:
            config_dict = yaml.load(config_file, Loader=yaml.FullLoader)
    # Get all the names of the hyperparameters that will be optimized
    params_names = list(config_dict['parameters'].keys())
    if array_param is not None:
        if isinstance(array_param, str):
            # Make sure that the array parameter names are in a list format
            array_param = [array_param]
        # Create a dictionary of lists, attributing all subparameter
        # names that belong to each array parameter
        array_subparam = dict()
        for param in array_param:
            # Add all the names of subparameters that start with the same parameter name
            array_subparam[param] = [subparam for subparam in params_names
                                     if subparam.startswith(param)]
    # Create a Comet.ml parameter optimizer
    param_optimizer = comet_ml.Optimizer(config_dict,
                                         api_key=comet_ml_api_key,
                                         project_name=comet_ml_project_name,
                                         workspace=comet_ml_workspace)
    seq_len_dict = None
    if df is not None:
        if inst_column is not None and var_seq is True:
            print('Building a dictionary containing the sequence length of '
                  'each patient\'s time series...')
            # Dictionary containing the sequence length (number of temporal events) of each sequence (patient)
            seq_len_dict = padding.get_sequence_length_dict(
                df, id_column=id_column, ts_column=inst_column)
            print('Creating a padded tensor version of the dataframe...')
            # Pad data (to have fixed sequence length) and convert into a PyTorch tensor
            data = padding.dataframe_to_padded_tensor(
                df, seq_len_dict=seq_len_dict, id_column=id_column,
                ts_column=inst_column, padding_value=padding_value,
                inplace=True)
        else:
            # Just convert the data into a PyTorch tensor
            data = torch.from_numpy(df.to_numpy())
        if id_columns_idx is None:
            # Find the column indices for the ID columns
            id_columns_idx = [search_explore.find_col_idx(df, col)
                              for col in [id_column, inst_column]]
        if dataset is None:
            print('Creating a dataset object...')
            # Create a Dataset object from the data tensor
            if Dataset is not None:
                dataset = Dataset(data, df)
            else:
                if model_type.lower() == 'multivariate_rnn':
                    dataset = datasets.Time_Series_Dataset(
                        df, data, id_column=id_column, ts_column=inst_column,
                        seq_len_dict=seq_len_dict)
                elif model_type.lower() == 'mlp':
                    dataset = datasets.Tabular_Dataset(df, data)
                else:
                    raise Exception(
                        f'ERROR: Invalid model type. It must be '
                        f'"multivariate_rnn" or "mlp", not {model_type}.')
    if train_dataloader is None and val_dataloader is None and test_dataloader is None:
        print('Distributing the data to train, validation and test sets and '
              'getting their data loaders...')
        # Get the train, validation and test sets data loaders, which will allow loading batches
        train_dataloader, val_dataloader, test_dataloader = create_train_sets(
            dataset, test_train_ratio=test_train_ratio,
            validation_ratio=validation_ratio, batch_size=batch_size,
            get_indices=False)
    # Start off with a minimum validation loss of infinity
    val_loss_min = np.inf
    for experiment in param_optimizer.get_experiments():
        print('Starting a new parameter optimization iteration...')
        # Get the current optimized values of the hyperparameters
        params_values = dict(zip(params_names,
                                 [experiment.get_parameter(param)
                                  for param in params_names]))
        if array_param is not None:
            for param in array_param:
                # Join the values of the subparameters
                subparam_names = array_subparam[param]
                params_values[param] = [params_values[subparam]
                                        for subparam in subparam_names]
                # Remove the now redundant subparameters
                for subparam in subparam_names:
                    del params_values[subparam]
        # Instantiate the model (removing the two identifier columns and the labels from the input size)
        model = Model(n_inputs=n_inputs, n_outputs=n_outputs,
                      **params_values, **kwargs)
        # Check if GPU (CUDA) is available
        on_gpu = torch.cuda.is_available()
        if on_gpu:
            # Move the model to the GPU
            model = model.cuda()
        print('Training the model...')
        # Train the model and get the minimum validation loss
        model, val_loss = deep_learning.train(
            model, train_dataloader, val_dataloader,
            test_dataloader=test_dataloader, dataset=dataset,
            cols_to_remove=id_columns_idx, model_type=model_type,
            is_custom=is_custom, seq_len_dict=seq_len_dict,
            batch_size=batch_size, n_epochs=n_epochs, lr=lr,
            clip_value=clip_value, models_path=models_path,
            model_name=model_name, ModelClass=Model,
            padding_value=padding_value, do_test=True, metrics=metrics,
            log_comet_ml=True, comet_ml_api_key=comet_ml_api_key,
            comet_ml_project_name=comet_ml_project_name,
            comet_ml_workspace=comet_ml_workspace,
            comet_ml_save_model=comet_ml_save_model, experiment=experiment,
            features_list=None, get_val_loss_min=True,
            already_embedded=already_embedded, verbose=verbose,
            see_progress=see_progress)
        if val_loss < val_loss_min:
            # Update the optimization's minimum validation loss and the
            # corresponding experiment name
            val_loss_min = val_loss
            exp_name_min = experiment.get_key()
            if verbose is True:
                print(f'Achieved a new minimum validation loss of '
                      f'{val_loss_min} on experiment {exp_name_min}')
        # Log optimization parameters
        experiment.log_parameter('n_inputs', n_inputs)
        experiment.log_parameter('n_outputs', n_outputs)
        experiment.log_parameter('clip_value', clip_value)
        experiment.log_parameter('padding_value', padding_value)
        experiment.log_parameter('batch_size', batch_size)
        experiment.log_parameter('n_epochs', n_epochs)
        experiment.log_parameter('lr', lr)
        experiment.log_parameter('test_train_ratio', test_train_ratio)
        experiment.log_parameter('validation_ratio', validation_ratio)
        experiment.log_asset(f'{config_path}{config_name}', config_name)
        experiment.log_other('param_optimizer_status', param_optimizer.status())
    if verbose is True:
        print(f'Finished the hyperparameter optimization! The best performing '
              f'experiment was {exp_name_min}, with a minimum validation loss '
              f'of {val_loss_min}')
    return val_loss_min, exp_name_min
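# --- Illustrative configuration sketch (an addition, not from the original module),
# partially addressing the docstring's TODO: since config_dict is handed directly to
# comet_ml.Optimizer, a YAML file referenced by config_path/config_name is assumed to
# mirror the Optimizer config dictionaries used in the other snippets here. The
# parameter names and ranges are borrowed from the PCS example further below
# (n_hidden, n_layers, p_dropout); maxCombo is an arbitrary choice.
import yaml

EXAMPLE_CONFIG_YAML = """
algorithm: bayes
spec:
  maxCombo: 20
  objective: minimize
  metric: loss
parameters:
  n_hidden:
    type: integer
    min: 500
    max: 2000
  n_layers:
    type: integer
    min: 1
    max: 4
  p_dropout:
    type: float
    min: 0.0
    max: 0.5
"""

# Loading this YAML yields the same kind of dictionary that config_dict expects
example_config_dict = yaml.load(EXAMPLE_CONFIG_YAML, Loader=yaml.FullLoader)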
"mu": 500, "sigma": 50, "scalingType": "normal", }, "batch_size": {"type": "discrete", "values": [64, 128, 256]}, }, "trials": 1, } {%- if cookiecutter.online_or_offline == "Online" %} experiment_class = comet_ml.Experiment {%- elif cookiecutter.online_or_offline == "Offline" %} experiment_class = comet_ml.OfflineExperiment {%- endif %} opt = comet_ml.Optimizer(config, experiment_class=experiment_class) for experiment in opt.get_experiments(**experiment_kwargs): # We remove the two hyperparameters that are set by optimizer # as we don't want to overwrite them: del hyper_params["first_layer_units"] del hyper_params["batch_size"] # Log the remainder: experiment.log_parameters(hyper_params) model = build_model(experiment) train(experiment, model, datasets) evaluate(experiment, model, datasets)
print('Reading the CSV data...')

# Read the cleaned dataset dataframe
ALS_df = pd.read_csv(f'{data_path}FCUL_ALS_cleaned.csv')

# Drop the unnamed index and the NIV columns
ALS_df.drop(columns=['Unnamed: 0', 'niv'], inplace=True)

# Dataset parameters
n_patients = ALS_df.subject_id.nunique()  # Total number of patients
n_inputs = len(ALS_df.columns)            # Number of input features
n_outputs = 1                             # Number of outputs

if log_comet_ml:
    # Create a Comet.ml parameter optimizer
    param_optimizer = comet_ml.Optimizer(api_key=args.comet_ml_api_key)

    # Neural network parameters to be optimized with Comet.ml
    params = """
    n_hidden integer [500, 2000] [1052]
    n_layers integer [1, 4] [2]
    p_dropout real [0, 0.5] [0.2]
    """

    # Report to the optimizer the parameters that will be optimized
    param_optimizer.set_params(params)

    # Maximum number of iterations to do in the parameter optimization
    max_optim_iter = 100

    print(
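# --- Hedged sketch (not from the original script): the suggestion loop that this
# legacy Optimizer API pairs with set_params(), mirroring tune_hyperparams() above
# and bounded by max_optim_iter. train_network is a hypothetical stand-in for the
# script's training routine, and "loss" is an assumed score name.
def run_optimization_loop(param_optimizer, api_key, max_optim_iter, train_network):
    for _ in range(max_optim_iter):
        suggestion = param_optimizer.get_suggestion()
        experiment = comet_ml.Experiment(api_key=api_key)
        # Record the suggested hyperparameter values on the experiment
        experiment.log_parameters({"n_hidden": suggestion["n_hidden"],
                                   "n_layers": suggestion["n_layers"],
                                   "p_dropout": suggestion["p_dropout"]})
        score = train_network(n_hidden=suggestion["n_hidden"],
                              n_layers=suggestion["n_layers"],
                              p_dropout=suggestion["p_dropout"])
        # Report the result back so the optimizer can refine its suggestions
        suggestion.report_score("loss", score)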