Esempio n. 1
0
 def test_if_one_hot_encoded_correctly(self, sample: np.ndarray,
                                       label: np.ndarray, n_classes: int):
     _, tr_label = transforms.OneHotEncode(n_classes)(sample, label)
     assert len(tr_label.shape) == 2
     one_hot = np.zeros((label.size, label.max() + 1))
     one_hot[np.arange(label.size), label] = 1
     np.testing.assert_array_equal(one_hot, tr_label)
Esempio n. 2
0
def train(*,
          data,
          model_name: str,
          dest_path: str,
          sample_size: int,
          n_classes: int,
          kernel_size: int = 3,
          n_kernels: int = 16,
          n_layers: int = 1,
          lr: float = 0.005,
          batch_size: int = 150,
          epochs: int = 10,
          verbose: int = 2,
          shuffle: bool = True,
          patience: int = 3,
          seed: int = 0,
          noise: ('post', multi(min=0)),
          noise_sets: ('spost', multi(min=0)),
          noise_params: str = None):
    """
    Function for training tensorflow models given a dataset.

    :param model_name: Name of the model, it serves as a key in the
        dictionary holding all functions returning models.
    :param kernel_size: Size of ech kernel in each layer.
    :param n_kernels: Number of kernels in each layer.
    :param n_layers: Number of layers in the model.
    :param dest_path: Path to where to save the model
        under the name "model_name".
    :param sample_size: Size of the input sample.
    :param n_classes: Number of classes.
    :param lr: Learning rate for the model, i.e., regulates the size of the step
        in the gradient descent process.
    :param data: Either path to the input data or the data dict itself.
        First dimension of the dataset should be the number of samples.
    :param batch_size: Size of the batch used in training phase,
        it is the size of samples per gradient step.
    :param epochs: Number of epochs for model to train.
    :param verbose: Verbosity mode used in training, (0, 1 or 2).
    :param shuffle: Boolean indicating whether to shuffle dataset
     dataset_key each epoch.
    :param patience: Number of epochs without improvement in order to
        stop the training phase.
    :param seed: Seed for training reproducibility.
    :param noise: List containing names of used noise injection methods
        that are performed after the normalization transformations.
    :type noise: list[str]
    :param noise_sets: List of sets that are affected by
        the noise injection methods.
        For this module single element can be either "train" or "val".
    :type noise_sets: list[str]
    :param noise_params: JSON containing the parameters
        setting of injection methods.
        Exemplary value for this parameter: "{"mean": 0, "std": 1, "pa": 0.1}".
        This JSON should include all parameters for noise injection
        functions that are specified in the noise argument.
        For the accurate description of each parameter, please
        refer to the ml_intuition/data/noise.py module.
    """

    # Reproducibility
    tf.reset_default_graph()
    tf.set_random_seed(seed=seed)
    np.random.seed(seed=seed)

    if type(data) is str:
        train_dict = io.extract_set(data, enums.Dataset.TRAIN)
        val_dict = io.extract_set(data, enums.Dataset.VAL)
        min_, max_ = train_dict[enums.DataStats.MIN], \
            train_dict[enums.DataStats.MAX]
    else:
        train_dict = data[enums.Dataset.TRAIN]
        val_dict = data[enums.Dataset.VAL]
        min_, max_ = data[enums.DataStats.MIN], \
            data[enums.DataStats.MAX]

    transformations = [transforms.SpectralTransform(),
                       transforms.OneHotEncode(n_classes=n_classes),
                       transforms.MinMaxNormalize(min_=min_, max_=max_)]

    tr_transformations = transformations + get_noise_functions(noise, noise_params) \
        if enums.Dataset.TRAIN in noise_sets else transformations
    val_transformations = transformations + get_noise_functions(noise, noise_params) \
        if enums.Dataset.VAL in noise_sets else transformations

    train_dict = transforms.apply_transformations(train_dict, tr_transformations)
    val_dict = transforms.apply_transformations(val_dict, val_transformations)

    model = models.get_model(model_key=model_name, kernel_size=kernel_size,
                              n_kernels=n_kernels, n_layers=n_layers,
                              input_size=sample_size, n_classes=n_classes)
    model.summary()
    model.compile(tf.keras.optimizers.Adam(lr=lr),
                  'categorical_crossentropy',
                  metrics=['accuracy'])

    time_history = time_metrics.TimeHistory()
    mcp_save = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(dest_path, model_name), save_best_only=True,
        monitor='val_acc', mode='max')
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=patience)
    callbacks = [time_history, mcp_save, early_stopping]
    history = model.fit(x=train_dict[enums.Dataset.DATA],
                        y=train_dict[enums.Dataset.LABELS],
                        epochs=epochs,
                        verbose=verbose,
                        shuffle=shuffle,
                        validation_data=(val_dict[enums.Dataset.DATA],
                                         val_dict[enums.Dataset.LABELS]),
                        callbacks=callbacks,
                        batch_size=batch_size)

    history.history[time_metrics.TimeHistory.__name__] = time_history.average
    io.save_metrics(dest_path=dest_path,
                    file_name='training_metrics.csv',
                    metrics=history.history)

    np.savetxt(os.path.join(dest_path, 'min-max.csv'),
               np.array([min_, max_]), delimiter=',', fmt='%f')
Esempio n. 3
0
def evaluate(*,
             data,
             model_path: str,
             dest_path: str,
             n_classes: int,
             batch_size: int = 1024,
             use_ensemble: bool = False,
             ensemble_copies: int = 1,
             voting: str = 'hard',
             noise: ('post', multi(min=0)),
             noise_sets: ('spost', multi(min=0)),
             noise_params: str = None,
             seed: int = 0):
    """
    Function for evaluating the trained model.

    :param model_path: Path to the model.
    :param data: Either path to the input data or the data dict.
    :param dest_path: Directory in which to store the calculated metrics.
    :param n_classes: Number of classes.
    :param batch_size: Size of the batch for inference.
    :param use_ensemble: Use ensemble for prediction.
    :param ensemble_copies: Number of model copies for the ensemble.
    :param voting: Method of ensemble voting. If ‘hard’, uses predicted class
        labels for majority rule voting. Else if ‘soft’, predicts the class
        label based on the argmax of the sums of the predicted probabilities.
    :param noise: List containing names of used noise injection methods
        that are performed after the normalization transformations.
    :type noise: list[str]
    :param noise_sets: List of sets that are affected by the noise injection.
        For this module single element can be "test".
    :type noise_sets: list[str]
    :param noise_params: JSON containing the parameters
        setting of noise injection methods.
        Exemplary value for this parameter: "{"mean": 0, "std": 1, "pa": 0.1}".
        This JSON should include all parameters for noise injection
        functions that are specified in the noise argument.
        For the accurate description of each parameter, please
        refer to the ml_intuition/data/noise.py module.
    :param seed: Seed for RNG.
    """
    os.makedirs(dest_path, exist_ok=True)
    if type(data) is str:
        test_dict = io.extract_set(data, enums.Dataset.TEST)
    else:
        test_dict = copy(data[enums.Dataset.TEST])
    min_max_path = os.path.join(os.path.dirname(model_path), "min-max.csv")
    if os.path.exists(min_max_path):
        min_value, max_value = io.read_min_max(min_max_path)
    else:
        min_value, max_value = data[enums.DataStats.MIN], \
                               data[enums.DataStats.MAX]

    transformations = [transforms.SpectralTransform(),
                       transforms.OneHotEncode(n_classes=n_classes),
                       transforms.MinMaxNormalize(min_=min_value,
                                                  max_=max_value)]
    transformations = transformations + \
                      get_noise_functions(noise, noise_params) \
        if enums.Dataset.TEST in noise_sets else transformations

    test_dict = transforms.apply_transformations(test_dict, transformations)

    model = tf.keras.models.load_model(model_path, compile=True)
    if use_ensemble:
        model = Ensemble(model, voting=voting)

        if ensemble_copies is not None:
            noise_params = yaml.load(noise_params)
            model.generate_models_with_noise(copies=ensemble_copies,
                                             mean=noise_params['mean'],
                                             seed=seed)
        if voting == 'classifier':
            if type(data) is str:
                train_dict = io.extract_set(data, enums.Dataset.TRAIN)
            else:
                train_dict = data[enums.Dataset.TRAIN]
            train_dict = transforms.apply_transformations(train_dict, transformations)
            train_probabilities = model.predict_probabilities(train_dict[enums.Dataset.DATA])
            model.train_ensemble_predictor(train_probabilities, train_dict[enums.Dataset.LABELS])

    predict = timeit(model.predict)
    y_pred, inference_time = predict(test_dict[enums.Dataset.DATA],
                                     batch_size=batch_size)

    if voting == 'classifier':
        y_pred = np.argmax(y_pred, axis=-1)
    y_true = np.argmax(test_dict[enums.Dataset.LABELS], axis=-1)

    model_metrics = get_model_metrics(y_true, y_pred)
    model_metrics['inference_time'] = [inference_time]
    conf_matrix = confusion_matrix(y_true, y_pred)
    io.save_metrics(dest_path=dest_path,
                    file_name=enums.Experiment.INFERENCE_METRICS,
                    metrics=model_metrics)
    io.save_confusion_matrix(conf_matrix, dest_path)
    if enums.Splits.GRIDS in model_path:
        if type(data) is str:
            train_dict = io.extract_set(data, enums.Dataset.TRAIN)
            labels_in_train = np.unique(train_dict[enums.Dataset.LABELS])
        else:
            train_labels = data[enums.Dataset.TRAIN][enums.Dataset.LABELS]
            if train_labels.ndim > 1:
                train_labels = np.argmax(train_labels, axis=-1)
            labels_in_train = np.unique(train_labels)
        fair_metrics = get_fair_model_metrics(conf_matrix, labels_in_train)
        io.save_metrics(dest_path=dest_path,
                        file_name=enums.Experiment.INFERENCE_FAIR_METRICS,
                        metrics=fair_metrics)
Esempio n. 4
0
def evaluate(*,
             data,
             model_path: str,
             dest_path: str,
             n_classes: int,
             batch_size: int = 1024,
             noise: ('post', multi(min=0)),
             noise_sets: ('spost', multi(min=0)),
             noise_params: str = None):
    """
    Function for evaluating the trained model.

    :param model_path: Path to the model.
    :param data: Either path to the input data or the data dict.
    :param dest_path: Directory in which to store the calculated metrics
    :param n_classes: Number of classes.
    :param batch_size: Size of the batch for inference
    :param noise: List containing names of used noise injection methods
        that are performed after the normalization transformations.
    :param noise_sets: List of sets that are affected by the noise injection.
        For this module single element can be "test".
    :param noise_params: JSON containing the parameters
        setting of noise injection methods.
        Exemplary value for this parameter: "{"mean": 0, "std": 1, "pa": 0.1}".
        This JSON should include all parameters for noise injection
        functions that are specified in the noise argument.
        For the accurate description of each parameter, please
        refer to the ml_intuition/data/noise.py module.
    """
    if type(data) is str:
        test_dict = io.extract_set(data, enums.Dataset.TEST)
    else:
        test_dict = data[enums.Dataset.TEST]
    min_max_path = os.path.join(os.path.dirname(model_path), "min-max.csv")
    if os.path.exists(min_max_path):
        min_value, max_value = io.read_min_max(min_max_path)
    else:
        min_value, max_value = data[enums.DataStats.MIN], \
                               data[enums.DataStats.MAX]

    transformations = [transforms.SpectralTransform(),
                       transforms.OneHotEncode(n_classes=n_classes),
                       transforms.MinMaxNormalize(min_=min_value, max_=max_value)]
    transformations = transformations + get_noise_functions(noise, noise_params) \
        if enums.Dataset.TEST in noise_sets else transformations

    test_dict = transforms.apply_transformations(test_dict, transformations)

    model = tf.keras.models.load_model(model_path, compile=True)

    predict = timeit(model.predict)
    y_pred, inference_time = predict(test_dict[enums.Dataset.DATA],
                                     batch_size=batch_size)

    y_pred = np.argmax(y_pred, axis=-1)
    y_true = np.argmax(test_dict[enums.Dataset.LABELS], axis=-1)

    model_metrics = get_model_metrics(y_true, y_pred)
    model_metrics['inference_time'] = [inference_time]
    conf_matrix = confusion_matrix(y_true, y_pred)
    io.save_metrics(dest_path=dest_path,
                    file_name=enums.Experiment.INFERENCE_METRICS,
                    metrics=model_metrics)
    io.save_confusion_matrix(conf_matrix, dest_path)
    if enums.Splits.GRIDS in model_path:
        if type(data) is str:
            train_dict = io.extract_set(data, enums.Dataset.TRAIN)
            labels_in_train = np.unique(train_dict[enums.Dataset.LABELS])
        else:
            train_labels = data[enums.Dataset.TRAIN][enums.Dataset.LABELS]
            if train_labels.ndim > 1:
                train_labels = np.argmax(train_labels, axis=-1)
            labels_in_train = np.unique(train_labels)
        fair_metrics = get_fair_model_metrics(conf_matrix, labels_in_train)
        io.save_metrics(dest_path=dest_path,
                        file_name=enums.Experiment.INFERENCE_FAIR_METRICS,
                        metrics=fair_metrics)