Example 1
def test_neural_network_softmax_classification_model(
        learning_rate, steps, batch_size, hidden_units, n_classes,
        weight_column, dropout, batch_norm, optimiser, model_dir,
        testing_features, testing_targets):
    '''
    Args:
        learning_rate: the learning rate (float)
        steps: total number of training steps (int)
        batch_size: batch size used to calculate the gradient (int)
        hidden_units: number of neurons in each hidden layer (list)
        n_classes: number of classes (int)
        weight_column: column used to down-weight or boost examples during training (for unbalanced sets)
        dropout: probability of dropping out a node's output (for regularisation)
        batch_norm: whether to use batch normalisation after each hidden layer (True/False)
        optimiser: type of optimiser (GradientDescent, ProximalGradientDescent, Adagrad, ProximalAdagrad, Adam)
        model_dir: directory where the pretrained model is saved (None if no saving)
        testing_features: one or more columns of testing features (DataFrame)
        testing_targets: a single column of testing targets (DataFrame)

    Returns:
        The `DNNClassifier` object and a `DataFrame` of predicted class labels
        for the testing data
    '''

    # create neural network classifier object

    if optimiser == 'GradientDescent':
        my_optimiser = tf.train.GradientDescentOptimizer(
            learning_rate=learning_rate)
    elif optimiser == 'ProximalGradientDescent':
        my_optimiser = tf.train.ProximalGradientDescentOptimizer(
            learning_rate=learning_rate)
        #my_optimiser = tf.train.ProximalGradientDescentOptimizer(learning_rate = learning_rate, l1_regularization_strength = 0.1) # for L1 regularisation
        #my_optimiser = tf.train.ProximalGradientDescentOptimizer(learning_rate = learning_rate, l2_regularization_strength = 0.1) # for L2 regularisation
    elif optimiser == 'Adagrad':
        my_optimiser = tf.train.AdagradOptimizer(
            learning_rate=learning_rate)  # for convex problems
    elif optimiser == 'ProximalAdagrad':
        my_optimiser = tf.train.ProximalAdagradOptimizer(
            learning_rate=learning_rate)  # for convex problems
        #my_optimiser = tf.train.ProximalAdagradOptimizer(learning_rate = learning_rate, l1_regularization_strength = 0.1) # for L1 regularisation
        #my_optimiser = tf.train.ProximalAdagradOptimizer(learning_rate = learning_rate, l2_regularization_strength = 0.1) # for L2 regularisation
    elif optimiser == 'Adam':
        my_optimiser = tf.train.AdamOptimizer(
            learning_rate=learning_rate)  # for non-convex problems
    else:
        raise ValueError('Unknown optimiser type: ' + optimiser)
    my_optimiser = tf.contrib.estimator.clip_gradients_by_norm(
        my_optimiser, 5.0)
    dnn_classifier = tf.estimator.DNNClassifier(
        feature_columns=construct_feature_columns(testing_features),
        model_dir=model_dir,
        n_classes=n_classes,
        hidden_units=hidden_units,
        weight_column=weight_column,
        optimizer=my_optimiser,
        activation_fn=tf.nn.relu,
        dropout=dropout,
        batch_norm=batch_norm)

    # define input functions

    predict_testing_input_fn = lambda: my_input_fn(
        testing_features, testing_targets, num_epochs=1, shuffle=False)

    # calculate testing predictions

    testing_predictions = list(
        dnn_classifier.predict(input_fn=predict_testing_input_fn))
    testing_probabilities = np.array(
        [item['probabilities'] for item in testing_predictions])
    testing_pred_class_id = np.array(
        [item['class_ids'][0] for item in testing_predictions])
    testing_pred_one_hot = tf.keras.utils.to_categorical(
        testing_pred_class_id, n_classes)

    # calculate loss

    testing_log_loss = metrics.log_loss(testing_targets, testing_pred_one_hot)

    # Calculate final predictions (not probabilities, as above)

    final_testing_predictions = dnn_classifier.predict(
        input_fn=predict_testing_input_fn)
    final_testing_predictions = np.array(
        [item['class_ids'][0] for item in final_testing_predictions])

    # calculate accuracy

    testing_accuracy = metrics.accuracy_score(testing_targets,
                                              final_testing_predictions)

    print('Final accuracy (on testing data): %0.2f' % testing_accuracy)

    # plot and save confusion matrix (testing)

    plt.figure(figsize=(6, 4))

    cm = metrics.confusion_matrix(testing_targets, final_testing_predictions)
    # Normalize the confusion matrix by row (i.e. by the number of samples in each class)
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    ax = sns.heatmap(cm_normalized, cmap='bone_r')
    ax.set_aspect(1)
    #plt.title('Confusion matrix (testing)')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    if model_dir is not None:
        plt.savefig(model_dir + '\\' + 'confm_testing.eps',
                    dpi=600,
                    format='eps',
                    bbox_inches='tight',
                    pad_inches=0)
        plt.savefig(model_dir + '\\' + 'confm_testing.pdf',
                    dpi=600,
                    format='pdf',
                    bbox_inches='tight',
                    pad_inches=0)
        plt.savefig(model_dir + '\\' + 'confm_testing.png',
                    dpi=600,
                    format='png',
                    bbox_inches='tight',
                    pad_inches=0)

    # display final errors

    print('Final LogLoss (on testing data): %0.2f' % testing_log_loss)

    # convert outputs to pandas DataFrame

    final_testing_predictions = pd.DataFrame(final_testing_predictions,
                                             columns=['Class'],
                                             index=testing_targets.index,
                                             dtype=float)

    return dnn_classifier, final_testing_predictions
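
These examples call two helper functions that the listing does not include, construct_feature_columns and my_input_fn, and they assume the usual imports (tensorflow as tf, numpy as np, pandas as pd, math, matplotlib.pyplot as plt, seaborn as sns, and sklearn's metrics module). The sketch below shows what the two helpers typically look like in this TensorFlow 1.x Estimator workflow, assuming purely numeric features held in a pandas DataFrame; it is an illustration, not the original author's code.

import numpy as np
import tensorflow as tf

def construct_feature_columns(input_features):
    '''Builds one numeric feature column per column of the input DataFrame.'''
    return set([tf.feature_column.numeric_column(name)
                for name in input_features])

def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    '''Feeds a DataFrame of features and targets to an Estimator in batches.'''
    # convert the pandas data into a dict of numpy arrays
    features = {key: np.array(value) for key, value in dict(features).items()}

    # construct a dataset and configure batching/repeating
    ds = tf.data.Dataset.from_tensor_slices((features, targets))
    ds = ds.batch(batch_size).repeat(num_epochs)

    # shuffle the data, if specified
    if shuffle:
        ds = ds.shuffle(buffer_size=10000)

    # return the next batch of data
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels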
Example 2
def train_neural_network_regression_model(
        learning_rate, 
        steps, 
        batch_size,
        hidden_units,
        weight_column,
        dropout,
        batch_norm,
        optimiser,
        model_dir,
        training_features,
        training_targets,
        validation_features,
        validation_targets
        ):
    
    '''
    Args:
        learning_rate: the learning rate (float)
        steps: total number of training steps (int)
        batch_size: batch size used to calculate the gradient (int)
        hidden_units: number of neurons in each hidden layer (list)
        weight_column: column used to down-weight or boost examples during training (for unbalanced sets)
        dropout: probability of dropping out a node's output (for regularisation)
        batch_norm: whether to use batch normalisation after each hidden layer (True/False)
        optimiser: type of optimiser (GradientDescent, ProximalGradientDescent, Adagrad, ProximalAdagrad, Adam)
        model_dir: directory to save the checkpoint (None if no saving)
        training_features: one or more columns of training features (DataFrame)
        training_targets: a single column of training targets (DataFrame)
        validation_features: one or more columns of validation features (DataFrame)
        validation_targets: a single column of validation targets (DataFrame)

    Returns:
        A `DNNRegressor` object trained on the training data, plus `DataFrame`s
        of training and validation predictions
    '''
    
    # define periods
    
    periods = 10
    steps_per_period = steps / periods
    
    # create neural network regressor object
    
    if optimiser == 'GradientDescent':
        my_optimiser = tf.train.GradientDescentOptimizer(learning_rate = learning_rate)
    elif optimiser == 'ProximalGradientDescent':
        my_optimiser = tf.train.ProximalGradientDescentOptimizer(learning_rate = learning_rate)
        #my_optimiser = tf.train.ProximalGradientDescentOptimizer(learning_rate = learning_rate, l1_regularization_strength = 0.1) # for L1 regularisation
        #my_optimiser = tf.train.ProximalGradientDescentOptimizer(learning_rate = learning_rate, l2_regularization_strength = 0.1) # for L2 regularisation
    elif optimiser == 'Adagrad':
        my_optimiser = tf.train.AdagradOptimizer(learning_rate = learning_rate) # for convex problems
    elif optimiser == 'ProximalAdagrad':
        my_optimiser = tf.train.ProximalAdagradOptimizer(learning_rate = learning_rate) # for convex problems
        #my_optimiser = tf.train.ProximalAdagradOptimizer(learning_rate = learning_rate, l1_regularization_strength = 0.1) # for L1 regularisation
        #my_optimiser = tf.train.ProximalAdagradOptimizer(learning_rate = learning_rate, l2_regularization_strength = 0.1) # for L2 regularisation
    elif optimiser == 'Adam':
        my_optimiser = tf.train.AdamOptimizer(learning_rate = learning_rate) # for non-convex problems
    else:
        raise ValueError('Unknown optimiser type: ' + optimiser)
    my_optimiser = tf.contrib.estimator.clip_gradients_by_norm(my_optimiser, 5.0)
    dnn_regressor = tf.estimator.DNNRegressor(
            feature_columns = construct_feature_columns(training_features),
            model_dir = model_dir,
            hidden_units = hidden_units,
            weight_column = weight_column,
            optimizer = my_optimiser,
            activation_fn = tf.nn.relu,
            dropout = dropout,
            batch_norm = batch_norm)
    
    # define input functions
    
    training_input_fn = lambda: my_input_fn(
      training_features, 
      training_targets, 
      batch_size = batch_size)
    predict_training_input_fn = lambda: my_input_fn(
      training_features, 
      training_targets, 
      num_epochs = 1, 
      shuffle = False)
    predict_validation_input_fn = lambda: my_input_fn(
      validation_features, 
      validation_targets, 
      num_epochs = 1, 
      shuffle = False)
    
    # print training progress
    
    print('Model training started')
    print('RMSE on training data:')
    
    training_rmse = []
    validation_rmse = []

    for period in range (0, periods):
               
        # train the model
               
        dnn_regressor.train(
                input_fn = training_input_fn,
                steps = steps_per_period
                )
                
        # compute predictions
        
        training_predictions = dnn_regressor.predict(input_fn = predict_training_input_fn)
        training_predictions = np.array([item['predictions'][0] for item in training_predictions])
        
        validation_predictions = dnn_regressor.predict(input_fn = predict_validation_input_fn)
        validation_predictions = np.array([item['predictions'][0] for item in validation_predictions])
        
        # calculate losses
        
        training_root_mean_squared_error = math.sqrt(
                metrics.mean_squared_error(training_predictions, training_targets))
        validation_root_mean_squared_error = math.sqrt(
                metrics.mean_squared_error(validation_predictions, validation_targets))
        
        # print the current loss
        
        print('Period %02d: %0.2f' % (period, training_root_mean_squared_error))
        
        # add loss metrics to the list
        
        training_rmse.append(training_root_mean_squared_error)
        validation_rmse.append(validation_root_mean_squared_error)
        
    print('Model training finished')
    
    # plot and save loss metrics over periods
    
    plt.figure(figsize = (6, 4))
    
    plt.xlabel('Periods')
    plt.ylabel('RMSE')
    #plt.title('Root Mean Squared Error vs. Periods')
    plt.tight_layout()
    plt.grid()
    plt.plot(training_rmse, label = 'Training')
    plt.plot(validation_rmse, label = 'Validation')
    plt.legend()
    
    if model_dir is not None:
        plt.savefig(model_dir + '\\' + 'RMSE.eps', dpi = 600, format = 'eps',
                    bbox_inches = 'tight', pad_inches = 0)
        plt.savefig(model_dir + '\\' + 'RMSE.pdf', dpi = 600, format = 'pdf',
                    bbox_inches = 'tight', pad_inches = 0)
        plt.savefig(model_dir + '\\' + 'RMSE.png', dpi = 600, format = 'png',
                    bbox_inches = 'tight', pad_inches = 0)
    
    # plot and save predictions scatter plot
    
    plt.figure(figsize = (6, 4))
    
    plt.xlabel('Targets')
    plt.ylabel('Predictions')
    #plt.title('Prediction accuracy')
    plt.tight_layout()
    plt.grid()
    plt.scatter(training_targets, training_predictions, label = 'Training')
    plt.scatter(validation_targets, validation_predictions, label = 'Validation')
    plt.plot([0, 100], [0, 100], color = 'k')
    plt.legend()
    
    if model_dir is not None:
        plt.savefig(model_dir + '\\' + 'accuracy.eps', dpi = 600, format = 'eps',
                    bbox_inches = 'tight', pad_inches = 0)
        plt.savefig(model_dir + '\\' + 'accuracy.pdf', dpi = 600, format = 'pdf',
                    bbox_inches = 'tight', pad_inches = 0)
        plt.savefig(model_dir + '\\' + 'accuracy.png', dpi = 600, format = 'png',
                    bbox_inches = 'tight', pad_inches = 0)
    
    # display final errors
    
    print('Final RMSE (on training data):   %0.2f' % training_root_mean_squared_error)
    print('Final RMSE (on validation data): %0.2f' % validation_root_mean_squared_error)
    
    # convert outputs to pandas DataFrame
    
    training_predictions = pd.DataFrame(training_predictions, columns = ['Prediction'], 
                                          index = training_targets.index, dtype = float)
    validation_predictions = pd.DataFrame(validation_predictions, columns = ['Prediction'], 
                                            index = validation_targets.index, dtype = float)
    
    return dnn_regressor, training_predictions, validation_predictions
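
A possible call of train_neural_network_regression_model is sketched below on synthetic data; the column names, split sizes, and hyperparameters are illustrative assumptions, not values from the original source.

import numpy as np
import pandas as pd

# synthetic regression data standing in for real features/targets (illustrative only)
rng = np.random.RandomState(0)
data = pd.DataFrame({'x1': rng.rand(200), 'x2': rng.rand(200)})
data['y'] = 100 * data['x1'] + 10 * data['x2'] + rng.rand(200)

dnn_regressor, training_pred, validation_pred = train_neural_network_regression_model(
    learning_rate=0.01,
    steps=500,
    batch_size=10,
    hidden_units=[10, 10],
    weight_column=None,
    dropout=None,
    batch_norm=False,
    optimiser='Adam',
    model_dir=None,
    training_features=data[['x1', 'x2']][:150],
    training_targets=data[['y']][:150],
    validation_features=data[['x1', 'x2']][150:],
    validation_targets=data[['y']][150:])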
Example 3
def train_model(learning_rate,
    steps,
    batch_size,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets):
  """Trains a linear regression model of multiple features.
  
  In addition to training, this function also prints training progress information,
  as well as a plot of the training and validation loss over time.
  
  Args:
    learning_rate: A `float`, the learning rate.
    steps: A non-zero `int`, the total number of training steps. A training step
      consists of a forward and backward pass using a single batch.
    batch_size: A non-zero `int`, the batch size.
    training_examples: A `DataFrame` containing one or more columns from
      `california_housing_dataframe` to use as input features for training.
    training_targets: A `DataFrame` containing exactly one column from
      `california_housing_dataframe` to use as target for training.
    validation_examples: A `DataFrame` containing one or more columns from
      `california_housing_dataframe` to use as input features for validation.
    validation_targets: A `DataFrame` containing exactly one column from
      `california_housing_dataframe` to use as target for validation.
      
  Returns:
    A `LinearRegressor` object trained on the training data.
  """

  periods = 10
  steps_per_period = steps / periods
  
  # Create a linear regressor object.
  my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
  my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
  linear_regressor = tf.estimator.LinearRegressor(feature_columns=construct_feature_columns(training_examples),
      optimizer=my_optimizer)
  
  # 1.  Create input functions.
  training_input_fn = lambda: my_input_fn(training_examples,
      training_targets["median_house_value"], 
      batch_size=batch_size)

  predict_training_input_fn = lambda: my_input_fn(training_examples, 
      training_targets["median_house_value"], 
      num_epochs=1, 
      shuffle=False)

  predict_validation_input_fn = lambda: my_input_fn(validation_examples,
      validation_targets["median_house_value"], 
      num_epochs=1, 
      shuffle=False)
  
  # Train the model, but do so inside a loop so that we can periodically assess
  # loss metrics.
  print("Training model...")
  print("RMSE (on training data):")
  training_rmse = []
  validation_rmse = []
  for period in range(0, periods):
    # Train the model, starting from the prior state.
    linear_regressor.train(input_fn=training_input_fn,
        steps=steps_per_period,)
    # 2.  Take a break and compute predictions.
    training_predictions = linear_regressor.predict(input_fn=predict_training_input_fn)
    training_predictions = np.array([item['predictions'][0] for item in training_predictions])
    
    validation_predictions = linear_regressor.predict(input_fn=predict_validation_input_fn)
    validation_predictions = np.array([item['predictions'][0] for item in validation_predictions])
    
    # Compute training and validation loss.
    training_root_mean_squared_error = math.sqrt(metrics.mean_squared_error(training_predictions, training_targets))
    validation_root_mean_squared_error = math.sqrt(metrics.mean_squared_error(validation_predictions, validation_targets))
    # Occasionally print the current loss.
    print("  period %02d : %0.2f" % (period, training_root_mean_squared_error))
    # Add the loss metrics from this period to our list.
    training_rmse.append(training_root_mean_squared_error)
    validation_rmse.append(validation_root_mean_squared_error)
  print("Model training finished.")

  # Output a graph of loss metrics over periods.
  plt.ylabel("RMSE")
  plt.xlabel("Periods")
  plt.title("Root Mean Squared Error vs. Periods")
  plt.tight_layout()
  plt.plot(training_rmse, label="training")
  plt.plot(validation_rmse, label="validation")
  plt.legend()
  plt.show()

  return linear_regressor
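
A hedged usage sketch for train_model: the DataFrames below are synthetic stand-ins for the california_housing_dataframe splits described in the docstring, and the hyperparameters are illustrative assumptions.

import numpy as np
import pandas as pd

# synthetic stand-ins for the california_housing_dataframe splits (illustrative only)
rng = np.random.RandomState(0)
examples = pd.DataFrame({'total_rooms': rng.randint(2, 10000, 300).astype(float),
                         'median_income': 10 * rng.rand(300)})
targets = pd.DataFrame({'median_house_value': 50000 * examples['median_income']
                        + 1000 * rng.rand(300)})

linear_regressor = train_model(
    learning_rate=0.00002,
    steps=500,
    batch_size=5,
    training_examples=examples[:200],
    training_targets=targets[:200],
    validation_examples=examples[200:],
    validation_targets=targets[200:])

Example 4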
def test_neural_network_classification_model(learning_rate, steps, batch_size,
                                             hidden_units, weight_column,
                                             dropout, batch_norm, optimiser,
                                             model_dir, testing_features,
                                             testing_targets):
    '''
    Args:
        learning_rate: the learning rate (float)
        steps: total number of training steps (int)
        batch_size: batch size used to calculate the gradient (int)
        hidden_units: number of neurons in each hidden layer (list)
        weight_column: column used to down-weight or boost examples during training (for unbalanced sets)
        dropout: probability of dropping out a node's output (for regularisation)
        batch_norm: whether to use batch normalisation after each hidden layer (True/False)
        optimiser: type of optimiser (GradientDescent, ProximalGradientDescent, Adagrad, ProximalAdagrad, Adam)
        model_dir: directory where the pretrained model is saved
        testing_features: one or more columns of testing features (DataFrame)
        testing_targets: a single column of testing targets (DataFrame)

    Returns:
        The `DNNClassifier` object and a `DataFrame` of predicted positive-class
        probabilities for the testing data
    '''

    # create neural network classifier object

    if optimiser == 'GradientDescent':
        my_optimiser = tf.train.GradientDescentOptimizer(
            learning_rate=learning_rate)
    elif optimiser == 'ProximalGradientDescent':
        my_optimiser = tf.train.ProximalGradientDescentOptimizer(
            learning_rate=learning_rate)
        #my_optimiser = tf.train.ProximalGradientDescentOptimizer(learning_rate = learning_rate, l1_regularization_strength = 0.1) # for L1 regularisation
        #my_optimiser = tf.train.ProximalGradientDescentOptimizer(learning_rate = learning_rate, l2_regularization_strength = 0.1) # for L2 regularisation
    elif optimiser == 'Adagrad':
        my_optimiser = tf.train.AdagradOptimizer(
            learning_rate=learning_rate)  # for convex problems
    elif optimiser == 'ProximalAdagrad':
        my_optimiser = tf.train.ProximalAdagradOptimizer(
            learning_rate=learning_rate)  # for convex problems
        #my_optimiser = tf.train.ProximalAdagradOptimizer(learning_rate = learning_rate, l1_regularization_strength = 0.1) # for L1 regularisation
        #my_optimiser = tf.train.ProximalAdagradOptimizer(learning_rate = learning_rate, l2_regularization_strength = 0.1) # for L2 regularisation
    elif optimiser == 'Adam':
        my_optimiser = tf.train.AdamOptimizer(
            learning_rate=learning_rate)  # for non-convex problems
    else:
        raise ValueError('Unknown optimiser type: ' + optimiser)
    my_optimiser = tf.contrib.estimator.clip_gradients_by_norm(
        my_optimiser, 5.0)
    dnn_classifier = tf.estimator.DNNClassifier(
        feature_columns=construct_feature_columns(testing_features),
        model_dir=model_dir,
        n_classes=2,
        hidden_units=hidden_units,
        weight_column=weight_column,
        optimizer=my_optimiser,
        activation_fn=tf.nn.relu,
        dropout=dropout,
        batch_norm=batch_norm)

    # define input function

    predict_testing_input_fn = lambda: my_input_fn(
        testing_features, testing_targets, num_epochs=1, shuffle=False)

    # calculate testing probabilities

    testing_probabilities = dnn_classifier.predict(
        input_fn=predict_testing_input_fn)
    testing_probabilities = np.array(
        [item['probabilities'] for item in testing_probabilities])

    # calculate loss

    testing_log_loss = metrics.log_loss(testing_targets, testing_probabilities)

    # get just the probabilities for the positive class

    testing_probabilities = testing_probabilities[:, 1]

    # calculate and plot ROC curves

    testing_false_positive_rate, testing_true_positive_rate, testing_thresholds = metrics.roc_curve(
        testing_targets, testing_probabilities)

    plt.subplot(1, 2, 2)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC')
    plt.tight_layout()
    plt.grid()
    plt.plot(testing_false_positive_rate,
             testing_true_positive_rate,
             label='Testing')
    plt.plot([0, 1], [0, 1], color='k')
    plt.legend()

    # display final errors

    print('LogLoss (on testing data): %0.2f' % testing_log_loss)

    # calculate and print evaluation metrics

    testing_evaluation_metrics = dnn_classifier.evaluate(
        input_fn=predict_testing_input_fn)

    print('AUC (on testing data): %0.2f' % testing_evaluation_metrics['auc'])

    # convert outputs to pandas DataFrame

    testing_probabilities = pd.DataFrame(testing_probabilities,
                                         columns=['Probability'],
                                         index=testing_targets.index,
                                         dtype=float)

    return dnn_classifier, testing_probabilities
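
An illustrative call of test_neural_network_classification_model is sketched below; 'saved_model' is a hypothetical checkpoint directory produced by a matching training run (same feature columns and hidden_units), and the synthetic testing data is only a placeholder.

import numpy as np
import pandas as pd

# synthetic binary-classification testing data (illustrative only)
rng = np.random.RandomState(0)
testing_features = pd.DataFrame({'x1': rng.rand(100), 'x2': rng.rand(100)})
testing_targets = pd.DataFrame({'label': (testing_features['x1'] > 0.5).astype(int)})

dnn_classifier, testing_probabilities = test_neural_network_classification_model(
    learning_rate=0.01,
    steps=1000,
    batch_size=20,
    hidden_units=[10, 10],
    weight_column=None,
    dropout=None,
    batch_norm=False,
    optimiser='Adam',
    model_dir='saved_model',  # hypothetical directory holding a pretrained checkpoint
    testing_features=testing_features,
    testing_targets=testing_targets)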
Example 5
def train_linear_classification_model(
        learning_rate, 
        steps, 
        batch_size, 
        optimiser,
        training_features,
        training_targets,
        validation_features,
        validation_targets
        ):
    
    '''
    Args:
        learning_rate: the learning rate (float)
        steps: total number of training steps (int)
        batch_size: batch size used to calculate the gradient (int)
        optimiser: type of optimiser (GradientDescent, Ftrl)
        training_features: one or more columns of training features (DataFrame)
        training_targets: a single column of training targets (DataFrame)
        validation_features: one or more columns of validation features (DataFrame)
        validation_targets: a single column of validation targets (DataFrame)

    Returns:
        A `LinearClassifier` object trained on the training data, plus `DataFrame`s
        of training and validation positive-class probabilities
    '''
    
    # define periods
    
    periods = 10
    steps_per_period = steps / periods
    
    # create linear classifier object
    
    if optimiser == 'GradientDescent':
        my_optimiser = tf.train.GradientDescentOptimizer(learning_rate = learning_rate)
    elif optimiser == 'Ftrl':
        my_optimiser = tf.train.FtrlOptimizer(learning_rate = learning_rate) # for high-dimensional linear models
        #my_optimiser = tf.train.FtrlOptimizer(learning_rate = learning_rate, l1_regularization_strength = 0.1) # for L1 regularisation
        #my_optimiser = tf.train.FtrlOptimizer(learning_rate = learning_rate, l2_regularization_strength = 0.1) # for L2 regularisation
    else:
        raise ValueError('Unknown optimiser type: ' + optimiser)
    my_optimiser = tf.contrib.estimator.clip_gradients_by_norm(my_optimiser, 5.0)
    linear_classifier = tf.estimator.LinearClassifier(
            feature_columns = construct_feature_columns(training_features),
            optimizer = my_optimiser)
    
    # define input functions
    
    training_input_fn = lambda: my_input_fn(
      training_features, 
      training_targets, 
      batch_size = batch_size)
    predict_training_input_fn = lambda: my_input_fn(
      training_features, 
      training_targets, 
      num_epochs = 1, 
      shuffle = False)
    predict_validation_input_fn = lambda: my_input_fn(
      validation_features, 
      validation_targets, 
      num_epochs = 1, 
      shuffle = False)
    
    # print training progress
    
    print('Model training started')
    print('LogLoss on training data:')
    
    training_log_losses = []
    validation_log_losses = []

    for period in range (0, periods):
               
        # train the model
               
        linear_classifier.train(
                input_fn = training_input_fn,
                steps = steps_per_period
                )
                
        # compute predictions
        
        training_probabilities = linear_classifier.predict(input_fn = predict_training_input_fn)
        training_probabilities = np.array([item['probabilities'] for item in training_probabilities])
        
        validation_probabilities = linear_classifier.predict(input_fn = predict_validation_input_fn)
        validation_probabilities = np.array([item['probabilities'] for item in validation_probabilities])
        
        # calculate losses
        
        training_log_loss = metrics.log_loss(training_targets, training_probabilities)
        validation_log_loss = metrics.log_loss(validation_targets, validation_probabilities)
        
        # print the current loss
        
        print('Period %02d: %0.2f' % (period, training_log_loss))
        
        # add loss metrics to the list
        
        training_log_losses.append(training_log_loss)
        validation_log_losses.append(validation_log_loss)
        
    print('Model training finished')
    
    # plot loss metrics over periods
    
    plt.figure(figsize = (12, 4))
    
    plt.subplot(1, 2, 1)
    plt.xlabel('Periods')
    plt.ylabel('LogLoss')
    plt.title('LogLoss vs. Periods')
    plt.tight_layout()
    plt.grid()
    plt.plot(training_log_losses, label = 'Training')
    plt.plot(validation_log_losses, label = 'Validation')
    plt.legend()
    
    # get just the probabilities for the positive class
    
    training_probabilities = training_probabilities[:, 1]
    validation_probabilities = validation_probabilities[:, 1]
    
    # calculate and plot ROC curves
    
    training_false_positive_rate, training_true_positive_rate, training_thresholds = metrics.roc_curve(
            training_targets, training_probabilities)

    validation_false_positive_rate, validation_true_positive_rate, validation_thresholds = metrics.roc_curve(
            validation_targets, validation_probabilities)
    
    plt.subplot(1, 2, 2)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC')
    plt.tight_layout()
    plt.grid()
    plt.plot(training_false_positive_rate, training_true_positive_rate, label = 'Training')
    plt.plot(validation_false_positive_rate, validation_true_positive_rate, label = 'Validation')
    plt.plot([0, 1], [0, 1], color = 'k')
    plt.legend()
    
    # display final errors
    
    print('Final LogLoss (on training data):   %0.2f' % training_log_loss)
    print('Final LogLoss (on validation data): %0.2f' % validation_log_loss)
    
    # calculate and print evaluation metrics
    
    training_evaluation_metrics = linear_classifier.evaluate(input_fn = predict_training_input_fn)
    validation_evaluation_metrics = linear_classifier.evaluate(input_fn = predict_validation_input_fn)

    print('AUC (on training data): %0.2f' % training_evaluation_metrics['auc'])
    print('AUC (on validation data): %0.2f' % validation_evaluation_metrics['auc'])
    
    # convert outputs to pandas DataFrame
    
    training_probabilities = pd.DataFrame(training_probabilities, columns = ['Probability'], 
                                          index = training_targets.index, dtype = float)
    validation_probabilities = pd.DataFrame(validation_probabilities, columns = ['Probability'], 
                                            index = validation_targets.index, dtype = float)
    
    return linear_classifier, training_probabilities, validation_probabilities
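
A usage sketch for train_linear_classification_model on synthetic binary data; the feature names and hyperparameters are assumptions made for illustration.

import numpy as np
import pandas as pd

# synthetic binary-classification data (illustrative only)
rng = np.random.RandomState(0)
data = pd.DataFrame({'x1': rng.rand(400), 'x2': rng.rand(400)})
data['label'] = (data['x1'] + data['x2'] > 1.0).astype(int)

linear_classifier, training_prob, validation_prob = train_linear_classification_model(
    learning_rate=0.05,
    steps=500,
    batch_size=20,
    optimiser='Ftrl',
    training_features=data[['x1', 'x2']][:300],
    training_targets=data[['label']][:300],
    validation_features=data[['x1', 'x2']][300:],
    validation_targets=data[['label']][300:])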