Example 1
import comet_ml
from datetime import date

# Note: get_dataset, build_model_graph, train and evaluate are helper functions
# assumed to be defined elsewhere in the original script.


def main():
    today = date.today()
    x_train, y_train, x_test, y_test = get_dataset()
    config = {
        "algorithm": "bayes",
        "name": "Optimize MNIST Network",
        "spec": {
            "maxCombo": 10,
            "objective": "minimize",
            "metric": "loss"
        },
        "parameters": {
            "first_layer_units": {
                "type": "integer",
                "mu": 500,
                "sigma": 50,
                "scalingType": "normal",
            },
            "batch_size": {
                "type": "discrete",
                "values": [64, 128, 256]
            },
        },
        "trials": 1,
    }
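    # In Comet's Optimizer spec, "scalingType": "normal" samples the integer
    # parameter from a normal distribution with mean "mu" and standard deviation
    # "sigma", while the "discrete" type draws values from the fixed list in
    # "values".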

    experiment_class = comet_ml.Experiment
    opt = comet_ml.Optimizer(config, experiment_class=experiment_class)
    experiment_kwargs = {}
    date_str = today.strftime("%Y_%m_%d")

    for experiment in opt.get_experiments(**experiment_kwargs):
        experiment.log_parameter("epochs", 3)
        experiment.log_html("<div> Some Html </div>")
        for i in range(2):
            experiment.log_html("<br>Code: " + str(i))
        # experiment.log_system_info("someKey", "someValue")
        # experiment.log_system_info("someSystemKey", "someSystemValue")
        experiment.add_tag(date_str)
        #experiment.log_text()

        experiment.log_image("/Users/daven/Downloads/greenSquare.jpg",
                             "my-image",
                             step=1)
        experiment.log_image("/Users/daven/Downloads/redSquare.jpg",
                             "my-image",
                             step=2)
        experiment.log_image("/Users/daven/Downloads/blueSquare.jpg",
                             "my-image",
                             step=3)
        experiment.log_image("/Users/daven/Downloads/yellowSquare.jpg",
                             "my-image",
                             step=4)

        model = build_model_graph(experiment)

        train(experiment, model, x_train, y_train, x_test, y_test)

        evaluate(experiment, model, x_test, y_test)
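
The helper functions used above (get_dataset, build_model_graph, train and evaluate) are not shown in this example. Below is a minimal, self-contained sketch of the same Optimizer loop with a dummy objective standing in for real training; the project name is made up and a Comet API key is assumed to be configured in the environment.

import random

import comet_ml

sketch_config = {
    "algorithm": "bayes",
    "spec": {"maxCombo": 5, "objective": "minimize", "metric": "loss"},
    "parameters": {
        "first_layer_units": {"type": "integer", "min": 32, "max": 512},
    },
}

opt = comet_ml.Optimizer(sketch_config)
# "optimizer-sketch" is a hypothetical project name
for experiment in opt.get_experiments(project_name="optimizer-sketch"):
    units = experiment.get_parameter("first_layer_units")
    # Stand-in for a real training loss, so the Bayesian search has something to minimize
    loss = 1.0 / units + 0.01 * random.random()
    experiment.log_metric("loss", loss)  # must match the metric named in "spec"
    experiment.end()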
Example 2
import comet_ml
from comet_ml import Experiment

# Note: train is assumed to be defined elsewhere in the original script.


def tune_hyperparams():
    api_key = "w7QuiECYXbNiOozveTpjc9uPg"
    optimizer = comet_ml.Optimizer(api_key)

    # Hyperparameters in PCS format: <name> <type> [min, max] [initial value]
    params = """
    x integer [1, 10] [10]
    """
    optimizer.set_params(params)

    while True:
        suggestion = optimizer.get_suggestion()
        experiment = Experiment(api_key, project_name="project1-ac2g",
                workspace="ift6135")
        score = train(suggestion["x"])
        suggestion.report_score("accuracy", score)
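
This example uses the older Comet Optimizer API (PCS parameter strings, set_params, get_suggestion and report_score), which more recent comet_ml releases replaced with the config-based API shown in Example 1. A hedged sketch of roughly the same search with the config-based API, assuming the same train helper and that the API key is configured in the environment:

import comet_ml

config = {
    "algorithm": "bayes",
    "spec": {"maxCombo": 10, "objective": "maximize", "metric": "accuracy"},
    "parameters": {
        # Same search space as the PCS line above: integer x in [1, 10]
        "x": {"type": "integer", "min": 1, "max": 10},
    },
}

opt = comet_ml.Optimizer(config)
for experiment in opt.get_experiments(project_name="project1-ac2g", workspace="ift6135"):
    score = train(experiment.get_parameter("x"))
    experiment.log_metric("accuracy", score)  # the metric the search maximizes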
Example 3
import comet_ml
import numpy as np
import torch
import yaml

# Note: padding, search_explore, datasets, deep_learning and create_train_sets
# are assumed to be imported from the surrounding package.


def optimize_hyperparameters(Model,
                             comet_ml_api_key,
                             comet_ml_project_name,
                             comet_ml_workspace,
                             config_name=None,
                             config_dict=None,
                             df=None,
                             dataset=None,
                             train_dataloader=None,
                             val_dataloader=None,
                             test_dataloader=None,
                             n_inputs=None,
                             id_column=None,
                             label_column=None,
                             inst_column=None,
                             id_columns_idx=None,
                             n_outputs=1,
                             Dataset=None,
                             model_type='multivariate_rnn',
                             is_custom=False,
                             models_path='models/',
                             model_name='checkpoint',
                             array_param=None,
                             metrics=['loss', 'accuracy', 'AUC'],
                             config_path='',
                             var_seq=True,
                             clip_value=0.5,
                             padding_value=999999,
                             batch_size=32,
                             n_epochs=10,
                             lr=0.001,
                             test_train_ratio=0.2,
                             validation_ratio=0.1,
                             comet_ml_save_model=True,
                             already_embedded=False,
                             verbose=False,
                             see_progress=True,
                             **kwargs):
    '''Optimize a machine learning model's hyperparameters, by training it
    several times while exploring different hyperparameters values, returning
    the best performing ones.

    Parameters
    ----------
    Model : torch.nn.Module or sklearn.* (any machine learning model)
        Class constructor for the desired machine learning model.
    comet_ml_api_key : string
        Comet.ml API key used when logging data to the platform.
    comet_ml_project_name : string
        Name of the comet.ml project used when logging data to the platform.
    comet_ml_workspace : string
        Name of the comet.ml workspace used when logging data to the platform.
    config_name : str, default None
        Name of the configuration file, containing information about the
        parameters to optimize. This data is organized in a YAML format, akin to
        a dictionary object, where the optimization algorithm is set, each
        hyperparameter gets a key with its name, followed by a list of values in
        the order of (minimum value to explore in the optimization, maximum
        value to explore in the optimization, initial value to use), and the
        metric to be optimized.
    config_dict : dict, default None
        Already loaded configuration file, containing information about the
        parameters to optimize. This data is organized in a YAML format, akin to
        a dictionary object, where the optimization algorithm is set, each
        hyperparameter gets a key with its name, followed by a list of values in
        the order of (minimum value to explore in the optimization, maximum
        value to explore in the optimization, initial value to use), and the
        metric to be optimized.
    df : pandas.DataFrame or dask.DataFrame, default None
        Dataframe containing all the data that will be used in the
        optimization's training processes.
    train_dataloader : torch.utils.data.DataLoader, default None
        Data loader which will be used to get data batches during training. If
        not specified, the method will create one automatically.
    val_dataloader : torch.utils.data.DataLoader, default None
        Data loader which will be used to get data batches when evaluating
        the model's performance on a validation set during training. If not
        specified, the method will create one automatically.
    test_dataloader : torch.utils.data.DataLoader, default None
        Data loader which will be used to get data batches when evaluating
        the model's performance on a test set, after finishing the
        training process. If not specified, the method will create one
        automatically.
    dataset : torch.utils.data.Dataset, default None
        Dataset object that contains the data used to train, validate and test
        the machine learning models. If the data loaders are already set, this
        argument is only needed when the data has variable sequence length and
        its dataset object loads files in each batch, instead of data from a
        single file.
        In essence, it's needed to give us the current batch's sequence length
        information, when we couldn't have known this for the whole data
        beforehand. If not specified, the method will create one automatically.
    n_inputs : int, default None
        Total number of input features present in the dataframe.
    id_column : str, default None
        Name of the column which corresponds to the subject identifier.
    label_column : str, default None
        Name of the column which corresponds to the label.
    inst_column : str, default None
        Name of the column which corresponds to the instance or timestamp
        identifier.
    id_columns_idx : int or list of ints, default None
        Index or list of indices of columns to remove from the features before
        feeding to the model. These tend to be the identifier columns, such as
        `subject_id` and `ts` (timestamp).
    n_outputs : int, default 1
        Total number of outputs given by the machine learning model.
    Dataset : torch.utils.data.Dataset, default None
        Class constructor for the dataset, which will be used for iterating
        through batches of data. It must be able to receive as inputs a PyTorch
        tensor and a Pandas or Dask dataframe.
    model_type : string, default 'multivariate_rnn'
        Sets the type of model to train, which determines what type of
        inference is done. Currently available options are
        ['multivariate_rnn', 'mlp'].
    is_custom : bool, default False
        If set to True, the method will assume that the model being used is a
        custom built one, which won't require sequence length information during
        the feedforward process.
    models_path : string, default 'models/'
        Path where the model will be saved. By default, it saves in
        the directory named "models".
    model_name : string, default 'checkpoint'
        Name that will be given to the saved models. Validation loss and
        timestamp info will then be appended to the name.
    array_param : list of strings, default None
        List of feature names that might have multiple values associated to
        them. For example, in a neural network with multiple layers, there
        could be multiple `n_hidden` values, each one indicating the number
        of units in each hidden layer.
    metrics : list of strings, default ['loss', 'accuracy', 'AUC']
        List of metrics to be used to evaluate the model on the inferred data.
        Available metrics are cross entropy loss (`loss`), accuracy (`accuracy`),
        AUC (`AUC`), weighted AUC (`AUC_weighted`), precision (`precision`),
        recall (`recall`) and F1 (`F1`).
    config_path : str, default ''
        Path to the directory where the configuration file is stored.
    var_seq : bool, default True
        Specifies if the data has variable sequence length, in which case the
        data must be adjusted by padding.
    clip_value : int or float, default 0.5
        Gradient clipping value, which limits the maximum change in the
        model parameters, so as to avoid exploding gradients.
    padding_value : numeric, default 999999
        Value to use in the padding, to fill the sequences.
    batch_size : int, default 32
        Defines the batch size, i.e. the number of samples used in each
        training iteration to update the model's weights.
    n_epochs : int, default 10
        Number of epochs, i.e. the number of times the training loop
        iterates through all of the training data.
    lr : float, default 0.001
        Learning rate used in the optimization algorithm.
    test_train_ratio : float, default 0.2
        Percentage of data to use for the test set.
    validation_ratio : float, default 0.1
        Percentage of training data to use for the validation set.
    comet_ml_save_model : bool, default True
        If set to True, uploads the model with the lowest validation loss
        to comet.ml when logging data to the platform.
    already_embedded : bool, default False
        If set to True, it means that the categorical features are already
        embedded when fetching a batch, i.e. there's no need to run the embedding
        layer(s) during the model's feedforward.
    verbose : bool, default False
        If set to True, a set of metrics and status indicators will be printed
        throughout training.
    see_progress : bool, default True
        If set to True, a progress bar will show up indicating the execution
        of each loop.
    kwargs : dict
        Optional additional parameters, specific to the machine learning model
        being used.

    Returns
    -------
    val_loss_min : float
        Minimum validation loss over all the optimization process.
    exp_name_min : str
        Key of the Comet.ml experiment with the overall minimum validation
        loss.

    [TODO] Write a small tutorial on how to write the YAML configuration file,
    based on this: https://www.comet.ml/docs/python-sdk/introduction-optimizer/
    '''
    # Only log training info to Comet.ml if the required parameters are specified
    if (comet_ml_api_key is None or comet_ml_project_name is None
            or comet_ml_workspace is None):
        raise Exception(
            'ERROR: All necessary Comet.ml parameters (comet_ml_api_key, '
            'comet_ml_project_name, comet_ml_workspace) must be correctly '
            'specified. Otherwise, the parameter optimization won\'t work.'
        )
    if config_dict is None:
        # Load the hyperparameter optimization configuration file into a dictionary
        with open(f'{config_path}{config_name}', 'r') as config_file:
            config_dict = yaml.load(config_file, Loader=yaml.FullLoader)
    # Get all the names of the hyperparameters that will be optimized
    params_names = list(config_dict['parameters'].keys())
    if array_param is not None:
        if isinstance(array_param, str):
            # Make sure that the array parameter names are in a list format
            array_param = [array_param]
        # Create a dictionary of lists, attributing all subparameter
        # names that belong to each array parameter
        array_subparam = dict()
        for param in array_param:
            # Add all the names of subparameters that start with the same parameter name
            array_subparam[param] = [
                subparam for subparam in params_names
                if subparam.startswith(param)
            ]
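        # Hypothetical example: with array_param=['n_hidden'] and optimized
        # parameters named 'n_hidden_0' and 'n_hidden_1', this yields
        # array_subparam == {'n_hidden': ['n_hidden_0', 'n_hidden_1']}; further
        # down, inside the experiments loop, these are collapsed back into a
        # single params_values['n_hidden'] list.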
    # Create a Comet.ml parameter optimizer
    param_optimizer = comet_ml.Optimizer(config_dict,
                                         api_key=comet_ml_api_key,
                                         project_name=comet_ml_project_name,
                                         workspace=comet_ml_workspace)

    seq_len_dict = None
    if df is not None:
        if inst_column is not None and var_seq is True:
            print(
                'Building a dictionary containing the sequence length of each patient\'s time series...'
            )
            # Dictionary containing the sequence length (number of temporal events) of each sequence (patient)
            seq_len_dict = padding.get_sequence_length_dict(
                df, id_column=id_column, ts_column=inst_column)
            print('Creating a padded tensor version of the dataframe...')
            # Pad data (to have fixed sequence length) and convert into a PyTorch tensor
            data = padding.dataframe_to_padded_tensor(
                df,
                seq_len_dict=seq_len_dict,
                id_column=id_column,
                ts_column=inst_column,
                padding_value=padding_value,
                inplace=True)
        else:
            # Just convert the data into a PyTorch tensor
            data = torch.from_numpy(df.to_numpy())
        if id_columns_idx is None:
            # Find the column indices for the ID columns
            id_columns_idx = [
                search_explore.find_col_idx(df, col)
                for col in [id_column, inst_column]
            ]

    if dataset is None:
        print('Creating a dataset object...')
        # Create a Dataset object from the data tensor
        if Dataset is not None:
            dataset = Dataset(data, df)
        else:
            if model_type.lower() == 'multivariate_rnn':
                dataset = datasets.Time_Series_Dataset(
                    df,
                    data,
                    id_column=id_column,
                    ts_column=inst_column,
                    seq_len_dict=seq_len_dict)
            elif model_type.lower() == 'mlp':
                dataset = datasets.Tabular_Dataset(df, data)
            else:
                raise Exception(
                    f'ERROR: Invalid model type. It must be "multivariate_rnn" or "mlp", not {model_type}.'
                )
    if train_dataloader is None and val_dataloader is None and test_dataloader is None:
        print(
            'Distributing the data to train, validation and test sets and getting their data loaders...'
        )
        # Get the train, validation and test sets data loaders, which will allow loading batches
        train_dataloader, val_dataloader, test_dataloader = create_train_sets(
            dataset,
            test_train_ratio=test_train_ratio,
            validation_ratio=validation_ratio,
            batch_size=batch_size,
            get_indices=False)
    # Start off with a minimum validation score of infinity
    val_loss_min = np.inf
    exp_name_min = None

    for experiment in param_optimizer.get_experiments():
        print('Starting a new parameter optimization iteration...')
        # Get the current optimized values of the hyperparameters
        params_values = dict(
            zip(params_names,
                [experiment.get_parameter(param) for param in params_names]))
        if array_param is not None:
            for param in array_param:
                # Join the values of the subparameters
                subparam_names = array_subparam[param]
                params_values[param] = [
                    params_values[subparam] for subparam in subparam_names
                ]
                # Remove the now redundant subparameters
                for subparam in subparam_names:
                    del params_values[subparam]
        # Instantiate the model (removing the two identifier columns and the labels from the input size)
        model = Model(n_inputs=n_inputs,
                      n_outputs=n_outputs,
                      **params_values,
                      **kwargs)
        # Check if GPU (CUDA) is available
        on_gpu = torch.cuda.is_available()
        if on_gpu:
            # Move the model to the GPU
            model = model.cuda()
        print('Training the model...')
        # Train the model and get the minimum validation loss
        model, val_loss = deep_learning.train(
            model,
            train_dataloader,
            val_dataloader,
            test_dataloader=test_dataloader,
            dataset=dataset,
            cols_to_remove=id_columns_idx,
            model_type=model_type,
            is_custom=is_custom,
            seq_len_dict=seq_len_dict,
            batch_size=batch_size,
            n_epochs=n_epochs,
            lr=lr,
            clip_value=clip_value,
            models_path=models_path,
            model_name=model_name,
            ModelClass=Model,
            padding_value=padding_value,
            do_test=True,
            metrics=metrics,
            log_comet_ml=True,
            comet_ml_api_key=comet_ml_api_key,
            comet_ml_project_name=comet_ml_project_name,
            comet_ml_workspace=comet_ml_workspace,
            comet_ml_save_model=comet_ml_save_model,
            experiment=experiment,
            features_list=None,
            get_val_loss_min=True,
            already_embedded=already_embedded,
            verbose=verbose,
            see_progress=see_progress)
        if val_loss < val_loss_min:
            # Update optimization minimum validation loss and the corresponding
            # experiment name
            val_loss_min = val_loss
            exp_name_min = experiment.get_key()
            if verbose is True:
                print(
                    f'Achieved a new minimum validation loss of {val_loss_min} on experiment {exp_name_min}'
                )
        # Log optimization parameters
        experiment.log_parameter('n_inputs', n_inputs)
        experiment.log_parameter('n_outputs', n_outputs)
        experiment.log_parameter('clip_value', clip_value)
        experiment.log_parameter('padding_value', padding_value)
        experiment.log_parameter('batch_size', batch_size)
        experiment.log_parameter('n_epochs', n_epochs)
        experiment.log_parameter('lr', lr)
        experiment.log_parameter('test_train_ratio', test_train_ratio)
        experiment.log_parameter('validation_ratio', validation_ratio)
        experiment.log_asset(f'{config_path}{config_name}', config_name)
        experiment.log_other('param_optimizer_status',
                             param_optimizer.status())
    if verbose is True:
        print(
            f'Finished the hyperparameter optimization! The best performing experiment was {exp_name_min}, with a minimum validation loss of {val_loss_min}'
        )
    return val_loss_min, exp_name_min
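
Since config_name / config_dict is handed directly to comet_ml.Optimizer, the loaded dictionary has to follow Comet's Optimizer spec (the same shape as the config in Example 1). A hedged sketch of such a dictionary, with hypothetical parameter names chosen to match array_param=['n_hidden']:

# Hypothetical config_dict for optimize_hyperparameters; not taken from the original repository.
example_config_dict = {
    'algorithm': 'bayes',
    'spec': {'maxCombo': 20, 'objective': 'minimize', 'metric': 'loss'},
    'parameters': {
        # With array_param=['n_hidden'], these two get merged into a single n_hidden list
        'n_hidden_0': {'type': 'integer', 'min': 100, 'max': 1000},
        'n_hidden_1': {'type': 'integer', 'min': 100, 'max': 1000},
        'n_layers': {'type': 'integer', 'min': 1, 'max': 4},
        'p_dropout': {'type': 'float', 'min': 0.0, 'max': 0.5},
    },
    'trials': 1,
}

Example 4 (fragment)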
                "mu": 500,
                "sigma": 50,
                "scalingType": "normal",
            },
            "batch_size": {"type": "discrete", "values": [64, 128, 256]},
        },
        "trials": 1,
    }

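    # The {%- if %} / {%- elif %} lines below are cookiecutter (Jinja) template
    # tags; they are resolved when the project is generated from the template and
    # select between the online and offline Comet experiment classes.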
    {%- if cookiecutter.online_or_offline == "Online" %}
    experiment_class = comet_ml.Experiment
    {%- elif cookiecutter.online_or_offline == "Offline" %}
    experiment_class = comet_ml.OfflineExperiment
    {%- endif %}

    opt = comet_ml.Optimizer(config, experiment_class=experiment_class)

    for experiment in opt.get_experiments(**experiment_kwargs):
        # We remove the two hyperparameters that are set by the optimizer, as we
        # don't want to overwrite them:
        del hyper_params["first_layer_units"]
        del hyper_params["batch_size"]
        # Log the remainder:
        experiment.log_parameters(hyper_params)

        model = build_model(experiment)

        train(experiment, model, datasets)

        evaluate(experiment, model, datasets)
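
The hyper_params dictionary referenced above is defined in the part of this example that is not shown. A purely hypothetical sketch of its shape, just to make the del-then-log pattern concrete:

# Hypothetical defaults; the keys controlled by the optimizer are deleted before logging
hyper_params = {
    "first_layer_units": 128,  # overridden by the optimizer
    "batch_size": 64,          # overridden by the optimizer
    "epochs": 3,
    "optimizer": "adam",
}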
Example 5 (fragment)
print('Reading the CSV data...')

# Read the cleaned dataset dataframe
ALS_df = pd.read_csv(f'{data_path}FCUL_ALS_cleaned.csv')

# Drop the unnamed index and the NIV columns
ALS_df.drop(columns=['Unnamed: 0', 'niv'], inplace=True)

# Dataset parameters
n_patients = ALS_df.subject_id.nunique()  # Total number of patients
n_inputs = len(ALS_df.columns)  # Number of input features
n_outputs = 1  # Number of outputs

if log_comet_ml:
    # Create a Comet.ml parameter optimizer:
    param_optimizer = comet_ml.Optimizer(api_key=args.comet_ml_api_key)

    # Neural network parameters to be optimized with Comet.ml
    params = """
                    n_hidden integer [500, 2000] [1052]
                    n_layers integer [1, 4] [2]
                    p_dropout real [0, 0.5] [0.2]
             """

    # Report to the optimizer the parameters that will be optimized
    param_optimizer.set_params(params)
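    # Each PCS line reads: <name> <type> [min, max] [initial value]; e.g. n_hidden
    # is an integer searched in the range [500, 2000], starting from 1052.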

# Maximum number of iterations to do in the parameter optimization
max_optim_iter = 100

print(