Example #1
def train_keras_model(dataset):
    tf.debugging.set_log_device_placement(True)

    # set tf seed
    seed_value = sigopt.get_parameter('random_seed', default=1)
    tf.compat.v1.set_random_seed(seed_value)

    print("loading and transforming data")
    load_transform_data = LoadTransformData()
    trainX, testX, trainY, testY = load_transform_data.load_split_dataset(
        dataset)
    scaled_trainX, scaled_testX = load_transform_data.scale_dataset(
        trainX, testX)

    # logging to sigopt Run
    sigopt.log_model("keras.Sequential")  # model_keras.__class__
    sigopt.log_dataset('Scaled Anomaly detection')
    sigopt.log_metadata('Training Records', len(scaled_trainX))
    sigopt.log_metadata('Testing Records', len(scaled_testX))
    sigopt.log_metadata("Platform", platform.uname())

    learning_rate = sigopt.get_parameter('learning_rate', default=0.01)
    loss_fn = sigopt.get_parameter('loss_function',
                                   default='binary_crossentropy')
    batch_size = sigopt.get_parameter('batch_size', default=4096)
    sigopt.get_parameter('layers', default=3)  # track the number of layers on the SigOpt Run
    num_epochs = sigopt.get_parameter('epochs', default=6)

    keras_model = KerasModel()
    model_keras = keras_model.create_model(trainX)
    model_keras.compile(optimizer=Adam(learning_rate=learning_rate),
                        loss=loss_fn,
                        metrics=[tf.keras.metrics.AUC()])

    model_keras.fit(
        scaled_trainX,
        trainY,
        batch_size=batch_size,
        epochs=num_epochs,
        callbacks=[CheckpointCB()],
        validation_data=(scaled_testX, testY),
    )

    # Collect model metrics
    start = time.perf_counter()
    probability = model_keras.predict(scaled_testX).flatten()
    prediction = probability > 0.5

    sigopt.log_metric('Inference Time', time.perf_counter() - start)
    log_inference_metrics(prediction, probability, testY, testX)
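LoadTransformData, KerasModel, and CheckpointCB are local classes from the source repository and are not shown above. A minimal sketch of what a callback like CheckpointCB might look like, assuming it simply forwards each epoch's Keras metrics to sigopt.log_checkpoint (the behavior here is an assumption, not the repository's actual implementation):

import sigopt
import tensorflow as tf


class CheckpointCB(tf.keras.callbacks.Callback):
    """Hypothetical callback: reports each epoch's metrics as a SigOpt checkpoint."""

    def on_epoch_end(self, epoch, logs=None):
        # logs holds the metrics Keras computed for this epoch
        # (loss, auc, val_loss, val_auc, ...)
        sigopt.log_checkpoint(logs or {})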
Example #2
def train_xgboost_model(dataset, random_state=1):
    print("loading and transforming data")
    load_transform_data = LoadTransformData()
    trainX, testX, trainY, testY = load_transform_data.load_split_dataset(
        dataset)

    # logging to the SigOpt Run
    sigopt.log_model("XGBClassifier")
    sigopt.log_dataset('Unscaled')
    sigopt.log_metadata('Training Records', len(trainX))
    sigopt.log_metadata('Testing Records', len(testX))
    sigopt.log_metadata("Platform", platform.uname())

    parameters = {
        'objective': 'binary:logistic',
        'learning_rate': sigopt.get_parameter('learning_rate', default=0.3),
        'n_estimators': sigopt.get_parameter('n_estimators', default=20),
        'max_depth': sigopt.get_parameter('max_depth', default=5),
        'gamma': sigopt.get_parameter('gamma', default=0),
        'min_child_weight': sigopt.get_parameter('min_child_weight',
                                                 default=1),
        'random_state': random_state,
        'importance_type': 'gain',
        'missing': None,
        'verbosity': 2
    }

    model = XGBClassifier(**parameters)

    modelfit = model.fit(trainX, trainY)

    # Collect model metrics
    start = time.perf_counter()
    prediction = modelfit.predict(testX)
    sigopt.log_metric("Inference Time", time.perf_counter() - start)
    probability = modelfit.predict_proba(testX)[:, 1]
    log_inference_metrics(prediction, probability, testY, testX)
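For reference, the third-party imports that train_xgboost_model relies on are listed below; LoadTransformData and log_inference_metrics (see Example #4) are local modules from the same repository and are not repeated here:

import platform
import time

import sigopt
from xgboost import XGBClassifier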
Example #3
    def train_model(self, training_data, validation_data, number_of_labels):
        """Defines training for tuning of pretrained model.
        Training_data and validation_data are both objects of type DataLoader."""

        logging.info("starting training process")
        logging.info("device being used: %s", device)
        logging.info("training data size: %d", len(training_data.dataset))
        logging.info("validation data size: %d", len(validation_data.dataset))

        logging.info("training data label, unique count: %s",
                     training_data.dataset.get_label_unique_count())
        logging.info("training data label, percentage: %s",
                     training_data.dataset.get_class_distribution())

        logging.info("validation data label, unique count: %s",
                     validation_data.dataset.get_label_unique_count())
        logging.info("validation data label, percentage: %s",
                     validation_data.dataset.get_class_distribution())

        validation_accuracy = 0.0

        for epoch in range(self.epochs):  # loop over the dataset multiple times
            logging.info("epoch number: %d", epoch)
            running_training_loss = 0.0
            running_training_correct_count = 0

            # used for model checkpointing
            # training_loss = None

            all_training_labels = []
            all_training_predictions = []

            self.model.train()

            for i, data in enumerate(training_data):
                inputs = data[StanfordCars.TRANSFORMED_IMAGE]
                labels = data[StanfordCars.LABEL]
                inputs = inputs.to(device)
                labels = labels.to(device)

                training_loss, training_preds = self.training_pass(
                    inputs, labels, True)

                all_training_predictions.extend(training_preds.tolist())
                all_training_labels.extend(labels.tolist())

                correct_count = torch.sum(training_preds == labels.data)
                running_training_loss += training_loss.item()
                running_training_correct_count += correct_count
                logging.debug("fraction of training data processed: %f",
                              (float(i) / len(training_data)) * 100)
                logging.debug("batch running training loss: %f",
                              running_training_loss)
                logging.debug("batch running training accuracy: %f",
                              running_training_correct_count.item())

            # calculating loss and accuracy over an epoch
            logging.info(
                'Phase: {} Weighted F1-Score: {:.4f}, Loss: {:.4f} Acc: {:.4f}'
                .format(
                    "training",
                    f1_score(y_true=all_training_labels,
                             y_pred=all_training_predictions,
                             average='weighted'),
                    running_training_loss / len(training_data.dataset),
                    (running_training_correct_count.double() /
                     len(training_data.dataset)).item()))

            self.learning_rate_scheduler.step(running_training_loss /
                                              len(training_data.dataset))

            for param_group in self.gd_optimizer.param_groups:
                logging.debug("current learning rate: %f", param_group['lr'])

            if self.model_checkpointing is not None:
                if epoch % self.model_checkpointing == 0 or epoch == self.epochs - 1:
                    self.checkpoint_model(epoch,
                                          running_training_loss /
                                          len(training_data.dataset),
                                          epithet='')

            if epoch % self.validation_frequency == 0 or epoch == self.epochs - 1:

                logging.info("validating model")

                self.model.eval()

                running_validation_loss = 0.0
                running_validation_correct_count = 0

                all_validation_labels = []
                all_validation_predictions = []

                # run forward pass on validation dataset
                for i, data in enumerate(validation_data):
                    validation_input = data[StanfordCars.TRANSFORMED_IMAGE]
                    validation_input = validation_input.to(device)
                    validation_labels = data[StanfordCars.LABEL]
                    validation_labels = validation_labels.to(device)

                    validation_loss, validation_predictions = self.training_pass(
                        validation_input, validation_labels, False)

                    all_validation_predictions.extend(
                        validation_predictions.tolist())
                    all_validation_labels.extend(validation_labels.tolist())

                    validation_correct_counts = torch.sum(
                        validation_predictions == validation_labels.data)
                    running_validation_loss += validation_loss.item()
                    running_validation_correct_count += validation_correct_counts
                    logging.debug("fraction of validation data processed: %f",
                                  (float(i) / len(validation_data)) * 100)
                    logging.debug("batch running validation loss: %f",
                                  running_validation_loss)
                    logging.debug("batch running validation accuracy: %f",
                                  running_validation_correct_count.item())

                cm = confusion_matrix(y_true=all_validation_labels,
                                      y_pred=all_validation_predictions,
                                      labels=list(range(number_of_labels)))
                cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
                logging.info("confusion matrix:\n %s", cm)

                # Calculating loss over 1 epoch (all data)
                validation_f1_score = f1_score(
                    y_true=all_validation_labels,
                    y_pred=all_validation_predictions,
                    average='weighted')
                validation_accuracy = (
                    running_validation_correct_count.double() /
                    len(validation_data.dataset)).item()
                logging.info(
                    'Phase: {} F1-Score: {:.4f}, Loss: {:.4f} Acc: {:.4f}'.
                    format(
                        "validation", validation_f1_score,
                        running_validation_loss / len(validation_data.dataset),
                        validation_accuracy))

        # SigOpt (orchestrate) hook to keep track of the validation accuracy metric
        sigopt.log_metric('accuracy', validation_accuracy)

        logging.info('Finished Training')

        return self.model, validation_accuracy
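self.training_pass is not part of this excerpt. A minimal sketch of what such a method might look like, assuming it runs one forward pass, computes the loss with a criterion stored on the class (self.loss_function is an assumed attribute name), and steps self.gd_optimizer only when its third argument is True:

    def training_pass(self, inputs, labels, backpropagate):
        # Hypothetical helper on the same class as train_model: one forward pass,
        # plus a backward pass and optimizer step only when backpropagate is True.
        with torch.set_grad_enabled(backpropagate):
            outputs = self.model(inputs)
            # self.loss_function is assumed, e.g. torch.nn.CrossEntropyLoss()
            loss = self.loss_function(outputs, labels)
            _, predictions = torch.max(outputs, 1)
            if backpropagate:
                self.gd_optimizer.zero_grad()
                loss.backward()
                self.gd_optimizer.step()
        return loss, predictions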
Example #4
def log_inference_metrics(prediction, probabilities, testY, testX):
    """Log all relevant metrics using the `predictions` generated by the model,
    the `probabilities` associated with those predictions, the `testY` actual
    labels from the dataset, and `testX` the features."""
    F1score = f1_score(testY, prediction)
    AUPRC = average_precision_score(testY, probabilities)
    tn, fp, fn, tp = confusion_matrix(testY, prediction).ravel()

    sigopt.log_metric('AUPRC test', AUPRC)
    sigopt.log_metric('F1score test', F1score)
    sigopt.log_metric('False Positive test', fp)
    sigopt.log_metric('False Negative test', fn)
    sigopt.log_metric('True Positive test', tp)
    sigopt.log_metric('True Negative test', tn)
    sigopt.log_metric('Max $ Missed Fraudulent',
                      max_missed_fraud(prediction, testY, testX['amount']))
    sigopt.log_metric('Max $ Missed Valid',
                      max_missed_valid(prediction, testY, testX['amount']))

    return F1score, AUPRC, tn, fp, fn, tp
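max_missed_fraud and max_missed_valid are helpers from the same repository and are not shown here. A minimal sketch of one plausible implementation, assuming they sum the transaction amounts of misclassified records (fraud predicted as valid, and valid flagged as fraud, respectively):

import numpy as np


def max_missed_fraud(prediction, actual, amounts):
    # Hypothetical: total dollar amount of fraudulent transactions predicted as valid.
    missed = (np.asarray(actual) == 1) & (np.asarray(prediction) == 0)
    return float(np.asarray(amounts)[missed].sum())


def max_missed_valid(prediction, actual, amounts):
    # Hypothetical: total dollar amount of valid transactions flagged as fraudulent.
    flagged = (np.asarray(actual) == 0) & (np.asarray(prediction) == 1)
    return float(np.asarray(amounts)[flagged].sum())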
Example #5
# model.py  
import sklearn.datasets 
import sklearn.metrics 
from xgboost import XGBClassifier 
import sigopt 
 
# Data preparation required to run and evaluate the sample model 
X, y = sklearn.datasets.load_iris(return_X_y=True) 
Xtrain, ytrain = X[:100], y[:100] 

# Track the name of the dataset used for your Run 
sigopt.log_dataset('iris 2/3 training, full test') 
# Set n_estimators as the hyperparameter to explore for your Experiment 
sigopt.params.setdefault("n_estimators", 100) 
# Track the name of the model used for your Run 
sigopt.log_model('xgboost') 

# Instantiate and train your sample model 
model = XGBClassifier( 
  n_estimators=sigopt.params.n_estimators, 
  use_label_encoder=False, 
  eval_metric='logloss', 
) 
model.fit(Xtrain, ytrain) 
pred = model.predict(X) 

# Track the metric value and metric name for each Run 
sigopt.log_metric("accuracy", sklearn.metrics.accuracy_score(pred, y))