def test_training_with_warm_start(self):
        """
        Training with a user provided model for warm start.
        """
        # Get trainer object, but only train 1 L-BFGS step.
        binary_lr_trainer = BinaryLogisticRegressionTrainer(lambda_l2=0.0,
                                                            max_iter=1)
        coefficients_warm_start = binary_lr_trainer.fit(
            X=self.x_train,
            y=self.y_train,
            weights=None,
            offsets=None,
            theta_initial=self.custom_weights)[0]
        # Warm start.
        # The trained model should be close to initial value
        # since the solution should have already converged.
        self.assertAllClose(coefficients_warm_start,
                            self.custom_weights,
                            rtol=_TOLERANCE,
                            atol=_TOLERANCE,
                            msg='models mismatch')

        coefficients_cold_start = binary_lr_trainer.fit(X=self.x_train,
                                                        y=self.y_train,
                                                        weights=None,
                                                        offsets=None,
                                                        theta_initial=None)[0]
        # Cold start.
        # The trained model should be far from the initial value since we only train for 1 step,
        # while the initial model was trained for 100 steps.
        self.assertNotAllClose(coefficients_cold_start,
                               self.custom_weights,
                               msg='models are too close')
Example 2
 def __init__(self, consumer_id, regularize_bias=False, lambda_l2=1.0, tolerance=1e-8, num_of_curvature_pairs=10,
              num_iterations=100):
     self.consumer_id = consumer_id
     self.lr_trainer = BinaryLogisticRegressionTrainer(regularize_bias=regularize_bias, lambda_l2=lambda_l2,
                                                       precision=tolerance/np.finfo(float).eps,
                                                       num_lbfgs_corrections=num_of_curvature_pairs,
                                                       max_iter=num_iterations)
     self.processed_counter = 0
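
The precision=tolerance/np.finfo(float).eps conversion above is worth a note: it looks like the trainer forwards precision to SciPy's L-BFGS-B factr parameter, which is expressed in units of machine epsilon (this is an assumption about the trainer's internals, which are not shown here). A minimal sketch of the conversion:

import numpy as np

# Hypothetical values mirroring the defaults above (tolerance=1e-8).
tolerance = 1e-8
factr = tolerance / np.finfo(float).eps   # ~4.5e7 for float64
# SciPy's fmin_l_bfgs_b stops when the relative objective decrease falls below
# factr * eps, so factr = tolerance / eps recovers the requested tolerance.
print(f"factr passed to L-BFGS-B (assumed): {factr:.3e}")
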
 def test_scoring_should_fail_if_not_trained(self):
     """
     Inference should fail on untrained model
     """
     # Reset trainer object
     self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
     with self.assertRaises(Exception):
         self.binary_lr_trainer.predict_proba(X=self.x_test, offsets=None)
 def test_scoring_should_succeed_if_custom_weights_provided(self):
     """
     Inference should succeed on untrained model if custom weights provided
     """
     # Reset trainer object
     self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
     validation_pred = self.binary_lr_trainer.predict_proba(
         X=self.x_test, offsets=None, custom_theta=self.custom_weights)
     assert (validation_pred.shape[0] == self.x_test.shape[0])
 def test_metrics_computation_should_fail_if_model_not_trained(self):
     """
     Metrics computation should fail on untrained model
     """
     # Reset trainer object
     self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
     with self.assertRaises(Exception):
         self.binary_lr_trainer.compute_metrics(X=self.x_test,
                                                y=self.y_test,
                                                offsets=None)
 def test_metrics_computation_should_succeed_if_custom_weights_provided(self):
     """
     Metrics computation should succeed on untrained model if custom weights provided
     """
     # Reset trainer object
     self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
     validation_metrics = self.binary_lr_trainer.compute_metrics(X=self.x_test,
                                                                 y=self.y_test,
                                                                 offsets=None,
                                                                 custom_theta=self.custom_weights)
     assert (0.0 <= validation_metrics['auc'] <= 1.0)
 def test_scoring_should_fail_if_custom_weights_not_of_known_type(self):
     """
     Inference should fail if custom weights are neither a NumPy ndarray nor a SciPy sparse matrix
     """
     # Reset trainer object
     self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
     # Run inference using a Python list, which is neither a numpy ndarray nor a scipy matrix
     with self.assertRaises(Exception):
         self.binary_lr_trainer.predict_proba(X=self.x_test,
                                              offsets=None,
                                              custom_theta=self.custom_weights.tolist())
    def setUp(self):
        # Since grid machines may or may not have internet access,
        # we use a pickled copy of the popular open-source breast cancer dataset for testing
        sample_dataset = pickle.load(open(sample_dataset_path + "/sklearn_data.p", "rb"))
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(sample_dataset.data,
                                                                                sample_dataset.target,
                                                                                test_size=0.25,
                                                                                random_state=0)

        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        self.custom_weights = self.binary_lr_trainer.fit(X=self.x_train,
                                                         y=self.y_train,
                                                         weights=None,
                                                         offsets=None)[0]
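
For context, the pickled fixture loaded in setUp appears to be the scikit-learn breast cancer Bunch saved to disk so the tests do not need network access. A minimal sketch of how such a fixture could be produced (the actual provenance of sklearn_data.p is not shown here, so treat this as an assumption):

import pickle
from sklearn.datasets import load_breast_cancer

# Hypothetical one-off script: save the dataset Bunch (with .data and .target)
# so grid machines can load it without downloading anything.
sample_dataset = load_breast_cancer()
with open("sklearn_data.p", "wb") as f:
    pickle.dump(sample_dataset, f)
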
Example 9
    def _predict(self, pool, input_path, metadata, tensor_metadata, output_file, schema_params, num_features, metadata_file, model_weights, use_local_index):
        logger.info(f"Start inference for {input_path}.")
        # Create LR model object for inference
        lr_model = BinaryLogisticRegressionTrainer(regularize_bias=True, lambda_l2=self.model_params.l2_reg_weight)
        consumer = InferenceJobConsumer(lr_model, num_features, schema_params, use_local_index, name=input_path)

        results = self._pooled_action(pool, consumer, input_path, schema_params, model_weights, num_features, metadata_file, gen_index_map=False)

        # Set up output schema
        output_schema = fastavro.parse_schema(get_inference_output_avro_schema(
            metadata,
            has_logits_per_coordinate=True,  # Always true for custom scipy-based LR
            schema_params=schema_params,
            has_weight=any(schema_params.sample_weight == feature.name for feature in tensor_metadata.get_features())))
        batched_write_avro(itertools.chain.from_iterable(results), output_file, output_schema)
        logger.info(f"Inference complete: {input_path}.")
Example 10
class TrainingJobConsumer:
    """
    Callable class to consume entity-based random effect training jobs from a shared queue
    """
    _CONSUMER_LOGGING_FREQUENCY = 1000

    def __init__(self, consumer_id, regularize_bias=False, lambda_l2=1.0, tolerance=1e-8, num_of_curvature_pairs=10,
                 num_iterations=100):
        self.consumer_id = consumer_id
        self.lr_trainer = BinaryLogisticRegressionTrainer(regularize_bias=regularize_bias, lambda_l2=lambda_l2,
                                                          precision=tolerance/np.finfo(float).eps,
                                                          num_lbfgs_corrections=num_of_curvature_pairs,
                                                          max_iter=num_iterations)
        self.processed_counter = 0

    def __call__(self, training_job_queue, training_results_dict, get_timeout_in_seconds=300):
        """
        Call method to read training jobs off of a shared queue
        :param training_job_queue:      Shared multiprocessing job queue
        :param training_results_dict:   Shared dictionary to store training results
        :param get_timeout_in_seconds:   Timeout (in seconds) for retrieving items off the shared job queue
        :return: None
        """
        logger.info("Kicking off training job consumer with ID : {}".format(self.consumer_id))
        while True:
            # Extract TrainingJob object
            training_job = training_job_queue.get(True, get_timeout_in_seconds)
            # If producer is done producing jobs, terminate consumer
            if training_job is None:
                logger.info("Terminating consumer {}".format(self.consumer_id))
                break

            # Train model
            training_result = self.lr_trainer.fit(X=training_job.X,
                                                  y=training_job.y,
                                                  weights=training_job.weights,
                                                  offsets=training_job.offsets)
            # Map trained model to entity ID
            training_results_dict[training_job.entity_id] = TrainingResult(
                training_result=training_result[0],
                unique_global_indices=training_job.unique_global_indices)

            self.processed_counter += 1
            if self.processed_counter % TrainingJobConsumer._CONSUMER_LOGGING_FREQUENCY == 0:
                logger.info("Consumer job {} has completed {} training jobs so far".format(self.consumer_id,
                                                                                           self.processed_counter))
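
A minimal driver sketch for the consumer above, showing the queue protocol it expects: jobs are enqueued first, followed by one None sentinel per consumer, and results come back in a shared dict keyed by entity ID. TrainingJob is assumed to be a simple record exposing the attributes the consumer reads (X, y, weights, offsets, entity_id, unique_global_indices), and the consumer is assumed to be picklable; the real producer code is not shown here.

import multiprocessing as mp

def run_consumers(training_jobs, num_consumers=4):
    manager = mp.Manager()
    job_queue = manager.Queue()
    results_dict = manager.dict()            # shared {entity_id: TrainingResult}

    for job in training_jobs:                # enqueue all work first
        job_queue.put(job)
    for _ in range(num_consumers):           # one sentinel per consumer terminates it
        job_queue.put(None)

    consumers = [TrainingJobConsumer(consumer_id=i) for i in range(num_consumers)]
    processes = [mp.Process(target=c, args=(job_queue, results_dict)) for c in consumers]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    return dict(results_dict)
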
Example 11
 def _train(self, pool, input_path, metadata_file, model_weights: dict, num_features, schema_params, output_model_file):
     logger.info(f"Start training with {f'loaded {len(model_weights)} previous models' if model_weights else 'zeros'} as the model initial value.")
     lr_model = BinaryLogisticRegressionTrainer(regularize_bias=self.model_params.regularize_bias, lambda_l2=self.model_params.l2_reg_weight,
                                                precision=self.model_params.lbfgs_tolerance / np.finfo(float).eps,
                                                num_lbfgs_corrections=self.model_params.num_of_lbfgs_curvature_pairs,
                                                max_iter=self.model_params.num_of_lbfgs_iterations)
     consumer = TrainingJobConsumer(lr_model, name=input_path)
     results = self._pooled_action(pool, consumer, input_path, schema_params, model_weights, num_features, metadata_file,
                                   self.model_params.enable_local_indexing)
     model_weights.update(results)
     logger.info(f"{len(model_weights)} models in total after training/refreshing.")
     # Dump results to model output directory.
     if self.model_params.feature_file:
         self._save_model(output_model_file, model_coefficients=model_weights, num_features=num_features, feature_file=self.model_params.feature_file)
     else:
         logger.info("Both feature file and avro model output directory required to export model. Skipping export")
     return model_weights
Example 12
 def _train(self, pool, input_path, metadata_file, model_weights: dict,
            num_features, schema_params, output_model_file):
     logger.info(
         f"Start training with {f'loaded {len(model_weights)} previous models' if model_weights else 'zeros'} as the model initial value."
     )
     lr_model = BinaryLogisticRegressionTrainer(
         regularize_bias=self.model_params.regularize_bias,
         lambda_l2=self.model_params.l2_reg_weight,
         precision=self.model_params.lbfgs_tolerance / np.finfo(float).eps,
         num_lbfgs_corrections=self.model_params.num_of_lbfgs_curvature_pairs,
         max_iter=self.model_params.num_of_lbfgs_iterations,
         has_intercept=self.has_intercept)
     consumer = TrainingJobConsumer(
         lr_model,
         name=input_path,
         job_queue=self.job_queue,
         enable_local_indexing=self.model_params.enable_local_indexing,
         sparsity_threshold=self.model_params.sparsity_threshold,
         variance_mode=self.model_params.random_effect_variance_mode)
     # Make sure the queue is empty
     assert (self.job_queue.empty())
     results = self._pooled_action(pool, consumer, input_path,
                                   schema_params, model_weights,
                                   num_features, metadata_file,
                                   self.model_params.enable_local_indexing)
     # The trained models are merged into the prior model (rather than replacing it) to cover two cases:
     # (1) the prior model has extra features that are not present in the current datasets.
     # (2) the prior model has extra model_ids that are not present in the current datasets.
     # In both cases, the extra features/model_ids need to be carried over to the current models.
     # This is needed especially when incremental learning is implemented; it is not needed when the
     # prior model and the current model share the same features/model_ids.
     # Revisit this when we start working on more advanced warm start.
     model_weights.update(results)
     logger.info(
         f"{len(model_weights)} models in total after training/refreshing.")
     # Dump results to model output directory.
     self._save_model(output_model_file,
                      model_coefficients=model_weights,
                      num_features=num_features,
                      feature_file=self.feature_file)
     return model_weights
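
A tiny illustration of the merge semantics described in the comment above: entity IDs retrained in this round overwrite their prior coefficients, while prior-only entries (not present in the current data) are carried forward unchanged. The string values are placeholders for coefficient arrays.

model_weights = {"entity_a": "prior_coeffs_a", "entity_b": "prior_coeffs_b"}
results = {"entity_a": "new_coeffs_a"}    # only entity_a appeared in this round
model_weights.update(results)
assert model_weights == {"entity_a": "new_coeffs_a", "entity_b": "prior_coeffs_b"}
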
Example 13
    def _predict(self, inference_dataset, model_coefficients, metadata,
                 tensor_metadata, output_file, prediction_params):

        # Create LR trainer object for inference
        lr_trainer = BinaryLogisticRegressionTrainer(
            regularize_bias=True,
            lambda_l2=self.model_params[constants.L2_REG_WEIGHT])

        # Create PhotonMLWriter object
        prediction_params.update(self.model_params)
        inference_runner = PhotonMLWriter(schema_params=prediction_params)

        # Delegate inference to PhotonMLWriter object
        inference_runner.run_custom_scipy_re_inference(
            inference_dataset=inference_dataset,
            model_coefficients=model_coefficients,
            lr_model=lr_trainer,
            metadata=metadata,
            tensor_metadata=tensor_metadata,
            output_file=output_file)
Example 14
    def _predict(self, pool, input_path, metadata, tensor_metadata,
                 output_file, schema_params, num_features, metadata_file,
                 model_weights):
        logger.info(f"Start inference for {input_path}.")
        # Create LR model object for inference
        lr_model = BinaryLogisticRegressionTrainer(
            regularize_bias=True,
            lambda_l2=self.model_params.l2_reg_weight,
            has_intercept=self.has_intercept)
        consumer = InferenceJobConsumer(lr_model,
                                        num_features,
                                        schema_params,
                                        name=input_path,
                                        job_queue=self.job_queue)
        # Make sure the queue is empty
        assert (self.job_queue.empty())
        # Prediction does not use local indexing since it can work on sparse coefficients directly.
        results = self._pooled_action(pool,
                                      consumer,
                                      input_path,
                                      schema_params,
                                      model_weights,
                                      num_features,
                                      metadata_file,
                                      enable_local_indexing=False)

        # Set up output schema
        output_schema = fastavro.parse_schema(
            get_inference_output_avro_schema(
                metadata,
                has_logits_per_coordinate=True,  # Always true for custom scipy-based LR
                schema_params=schema_params,
                has_weight=any(schema_params.weight_column_name == feature.name
                               for feature in tensor_metadata.get_features())))
        batched_write_avro(itertools.chain.from_iterable(results), output_file,
                           output_schema)
        logger.info(f"Inference complete: {input_path}.")
class TestBinaryLogisticRegressionTrainer(tf.test.TestCase):
    """
    Test binary logistic regression trainer
    """

    def setUp(self):
        # Since grid machines may or may not have internet access,
        # we use a pickled copy of the popular open-source breast cancer dataset for testing
        sample_dataset = pickle.load(open(sample_dataset_path + "/sklearn_data.p", "rb"))
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(sample_dataset.data,
                                                                                sample_dataset.target,
                                                                                test_size=0.25,
                                                                                random_state=0)

        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        self.custom_weights = self.binary_lr_trainer.fit(X=self.x_train,
                                                         y=self.y_train,
                                                         weights=None,
                                                         offsets=None)[0]

    def test_on_dense_dataset(self):
        """
        Test training on a dense dataset
        """
        # Train on sample data
        self.binary_lr_trainer.fit(X=self.x_train,
                                   y=self.y_train,
                                   weights=None,
                                   offsets=None)

        # Get predictions and metrics on the training data
        training_pred = self.binary_lr_trainer.predict_proba(X=self.x_train,
                                                             offsets=None)
        training_metrics = self.binary_lr_trainer.compute_metrics(X=self.x_train,
                                                                  y=self.y_train,
                                                                  offsets=None)
        # Assert prediction shape matches expectation, and training metrics are within expected range
        assert (0.0 <= training_metrics['auc'] <= 1.0)
        assert (training_pred.shape[0] == self.x_train.shape[0])

    def test_on_sparse_dataset(self):
        """
        Test training on a sparse dataset
        """
        # Train on sparsified sample data
        self.binary_lr_trainer.fit(X=sparse.csr_matrix(self.x_train),
                                   y=self.y_train,
                                   weights=None,
                                   offsets=None)

        # Get predictions and metrics on the training data
        training_pred = self.binary_lr_trainer.predict_proba(X=sparse.csr_matrix(self.x_train),
                                                             offsets=None)
        training_metrics = self.binary_lr_trainer.compute_metrics(X=sparse.csr_matrix(self.x_train),
                                                                  y=self.y_train,
                                                                  offsets=None)
        # Assert prediction shape matches expectation, and training metrics are within expected range
        assert (0.0 <= training_metrics['auc'] <= 1.0)
        assert (training_pred.shape[0] == self.x_train.shape[0])

    def test_scoring_on_validation_data(self):
        """
        Test inference and metrics computation
        """
        # Train on sample data
        self.binary_lr_trainer.fit(X=sparse.csr_matrix(self.x_train),
                                   y=self.y_train,
                                   weights=None,
                                   offsets=None)

        # Get predictions and metrics on the test data
        validation_pred = self.binary_lr_trainer.predict_proba(X=self.x_test,
                                                               offsets=None)
        validation_metrics = self.binary_lr_trainer.compute_metrics(X=self.x_test,
                                                                    y=self.y_test,
                                                                    offsets=None)

        # Assert prediction shape matches expectation, and training metrics are within expected range
        assert (0.0 <= validation_metrics['auc'] <= 1.0)
        assert (validation_pred.shape[0] == self.x_test.shape[0])

    def test_scoring_should_fail_if_not_trained(self):
        """
        Inference should fail on untrained model
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        with self.assertRaises(Exception):
            self.binary_lr_trainer.predict_proba(X=self.x_test,
                                                 offsets=None)

    def test_scoring_should_fail_if_custom_weights_not_of_known_type(self):
        """
        Inference should fail if custom weights are neither a NumPy ndarray nor a SciPy sparse matrix
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        # Run inference using a Python list, which is neither a numpy ndarray nor a scipy matrix
        with self.assertRaises(Exception):
            self.binary_lr_trainer.predict_proba(X=self.x_test,
                                                 offsets=None,
                                                 custom_theta=self.custom_weights.tolist())

    def test_metrics_computation_should_fail_if_model_not_trained(self):
        """
        Metrics computation should fail on untrained model
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        with self.assertRaises(Exception):
            self.binary_lr_trainer.compute_metrics(X=self.x_test,
                                                   y=self.y_test,
                                                   offsets=None)

    def test_scoring_should_succeed_if_custom_weights_provided(self):
        """
        Inference should succeed on untrained model if custom weights provided
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        validation_pred = self.binary_lr_trainer.predict_proba(X=self.x_test,
                                                               offsets=None,
                                                               custom_theta=self.custom_weights)
        assert (validation_pred.shape[0] == self.x_test.shape[0])

    def test_metrics_computation_should_succeed_if_custom_weights_provided(self):
        """
        Metrics computation should succeed on untrained model if custom weights provided
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        validation_metrics = self.binary_lr_trainer.compute_metrics(X=self.x_test,
                                                                    y=self.y_test,
                                                                    offsets=None,
                                                                    custom_theta=self.custom_weights)
        assert (0.0 <= validation_metrics['auc'] <= 1.0)
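
Throughout these tests, offsets is passed explicitly (here always None). Presumably it plays the usual GLM offset role: a fixed per-example score, e.g. from a previously trained fixed-effect model, that shifts the log-odds but is not re-learned. A minimal sketch of that interpretation (an assumption; the trainer's internals are not shown here):

import numpy as np

def predict_proba_with_offsets(X, theta, offsets=None):
    # Offsets shift the log-odds additively; they are data, not parameters.
    logits = X @ theta
    if offsets is not None:
        logits = logits + offsets
    return 1.0 / (1.0 + np.exp(-logits))
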
class TestBinaryLogisticRegressionTrainer(tf.test.TestCase):
    """
    Test binary logistic regression trainer
    """
    def setUp(self):
        # Since grid machines may or may not have internet access,
        # we use a pickled copy of the popular open-source breast cancer dataset for testing
        sample_dataset = pickle.load(
            open(sample_dataset_path + "/sklearn_data.p", "rb"))
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            sample_dataset.data,
            sample_dataset.target,
            test_size=0.25,
            random_state=0)

        self.binary_lr_trainer = BinaryLogisticRegressionTrainer(max_iter=500)
        self.custom_weights = self.binary_lr_trainer.fit(X=self.x_train,
                                                         y=self.y_train,
                                                         weights=None,
                                                         offsets=None)[0]

    def test_on_dense_dataset(self):
        """
        Test training on a dense dataset
        """
        # Train on sample data
        self.binary_lr_trainer.fit(X=self.x_train,
                                   y=self.y_train,
                                   weights=None,
                                   offsets=None)

        # Get predictions and metrics on the training data
        training_pred = self.binary_lr_trainer.predict_proba(X=self.x_train,
                                                             offsets=None)
        training_metrics = self.binary_lr_trainer.compute_metrics(
            X=self.x_train, y=self.y_train, offsets=None)
        # Assert prediction shape matches expectation, and training metrics are within expected range
        assert (0.0 <= training_metrics['auc'] <= 1.0)
        assert (training_pred.shape[0] == self.x_train.shape[0])

    def test_on_sparse_dataset(self):
        """
        Test training on a sparse dataset
        """
        # Train on sparsified sample data
        self.binary_lr_trainer.fit(X=sparse.csr_matrix(self.x_train),
                                   y=self.y_train,
                                   weights=None,
                                   offsets=None)

        # Get predictions and metrics on the training data
        training_pred = self.binary_lr_trainer.predict_proba(
            X=sparse.csr_matrix(self.x_train), offsets=None)
        training_metrics = self.binary_lr_trainer.compute_metrics(
            X=sparse.csr_matrix(self.x_train), y=self.y_train, offsets=None)
        # Assert prediction shape matches expectation, and training metrics are within expected range
        assert (0.0 <= training_metrics['auc'] <= 1.0)
        assert (training_pred.shape[0] == self.x_train.shape[0])

    def test_scoring_on_validation_data(self):
        """
        Test inference and metrics computation
        """
        # Train on sample data
        self.binary_lr_trainer.fit(X=sparse.csr_matrix(self.x_train),
                                   y=self.y_train,
                                   weights=None,
                                   offsets=None)

        # Get predictions and metrics on the test data
        validation_pred = self.binary_lr_trainer.predict_proba(X=self.x_test,
                                                               offsets=None)
        validation_metrics = self.binary_lr_trainer.compute_metrics(
            X=self.x_test, y=self.y_test, offsets=None)

        # Assert prediction shape matches expectation, and training metrics are within expected range
        assert (0.0 <= validation_metrics['auc'] <= 1.0)
        assert (validation_pred.shape[0] == self.x_test.shape[0])

    def test_scoring_should_fail_if_not_trained(self):
        """
        Inference should fail on untrained model
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        with self.assertRaises(Exception):
            self.binary_lr_trainer.predict_proba(X=self.x_test, offsets=None)

    def test_scoring_should_fail_if_custom_weights_not_of_known_type(self):
        """
        Inference should fail if custom weights are neither a NumPy ndarray nor a SciPy sparse matrix
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        # Run inference using a Python list, which is neither a numpy ndarray nor a scipy matrix
        with self.assertRaises(Exception):
            self.binary_lr_trainer.predict_proba(
                X=self.x_test,
                offsets=None,
                custom_theta=self.custom_weights.tolist())

    def test_metrics_computation_should_fail_if_model_not_trained(self):
        """
        Metrics computation should fail on untrained model
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        with self.assertRaises(Exception):
            self.binary_lr_trainer.compute_metrics(X=self.x_test,
                                                   y=self.y_test,
                                                   offsets=None)

    def test_scoring_should_succeed_if_custom_weights_provided(self):
        """
        Inference should succeed on untrained model if custom weights provided
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        validation_pred = self.binary_lr_trainer.predict_proba(
            X=self.x_test, offsets=None, custom_theta=self.custom_weights)
        assert (validation_pred.shape[0] == self.x_test.shape[0])

    def test_metrics_computation_should_succeed_if_custom_weights_provided(
            self):
        """
        Metrics computation should succeed on untrained model if custom weights provided
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        validation_metrics = self.binary_lr_trainer.compute_metrics(
            X=self.x_test,
            y=self.y_test,
            offsets=None,
            custom_theta=self.custom_weights)
        assert (0.0 <= validation_metrics['auc'] <= 1.0)

    def test_training_with_warm_start(self):
        """
        Training with a user provided model for warm start.
        """
        # Get trainer object, but only train 1 L-BFGS step.
        binary_lr_trainer = BinaryLogisticRegressionTrainer(lambda_l2=0.0,
                                                            max_iter=1)
        coefficients_warm_start = binary_lr_trainer.fit(
            X=self.x_train,
            y=self.y_train,
            weights=None,
            offsets=None,
            theta_initial=self.custom_weights)[0]
        # Warm start.
        # The trained model should be close to initial value
        # since the solution should have already converged.
        self.assertAllClose(coefficients_warm_start,
                            self.custom_weights,
                            rtol=_TOLERANCE,
                            atol=_TOLERANCE,
                            msg='models mismatch')

        coefficients_cold_start = binary_lr_trainer.fit(X=self.x_train,
                                                        y=self.y_train,
                                                        weights=None,
                                                        offsets=None,
                                                        theta_initial=None)[0]
        # Cold start.
        # The trained model should be far from the initial value since we only train for 1 step,
        # while the initial model was trained to convergence (max_iter=500).
        self.assertNotAllClose(coefficients_cold_start,
                               self.custom_weights,
                               msg='models are too close')
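
An analogous warm-start illustration using scikit-learn's LogisticRegression rather than the trainer under test (so it makes no claims about BinaryLogisticRegressionTrainer's internals): one additional L-BFGS step from a converged solution should barely move the coefficients, while a single step from zeros should not.

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

X, y = load_breast_cancer(return_X_y=True)
X = StandardScaler().fit_transform(X)          # scale so L-BFGS converges quickly

converged = LogisticRegression(max_iter=1000).fit(X, y)

warm = LogisticRegression(max_iter=1, warm_start=True)
warm.coef_ = converged.coef_.copy()            # seed with the converged solution
warm.intercept_ = converged.intercept_.copy()
warm.fit(X, y)                                 # one step from the optimum

cold = LogisticRegression(max_iter=1).fit(X, y)  # one step from scratch

print(np.linalg.norm(warm.coef_ - converged.coef_))   # expected: small
print(np.linalg.norm(cold.coef_ - converged.coef_))   # expected: comparatively large
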
class TestBinaryLogisticRegressionTrainer(tf.test.TestCase):
    """
    Test binary logistic regression trainer
    """
    def setUp(self):
        # Since grid machines may or may not have internet access,
        # we use a pickled copy of the popular open-source breast cancer dataset for testing
        sample_dataset = pickle.load(
            open(sample_dataset_path + "/sklearn_data.p", "rb"))
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            sample_dataset.data,
            sample_dataset.target,
            test_size=0.25,
            random_state=0)

        self.binary_lr_trainer = BinaryLogisticRegressionTrainer(max_iter=1000)
        self.binary_lr_trainer_without_bias = BinaryLogisticRegressionTrainer(
            max_iter=1000, has_intercept=False)
        self.custom_weights = self.binary_lr_trainer.fit(X=self.x_train,
                                                         y=self.y_train,
                                                         weights=None,
                                                         offsets=None)[0][0]

    def test_on_dense_dataset(self):
        """
        Test training on a dense dataset
        """
        # Train on sample data
        self.binary_lr_trainer.fit(X=self.x_train,
                                   y=self.y_train,
                                   weights=None,
                                   offsets=None)

        # Get predictions and metrics on the training data
        training_pred = self.binary_lr_trainer.predict_proba(X=self.x_train,
                                                             offsets=None)
        training_metrics = self.binary_lr_trainer.compute_metrics(
            X=self.x_train, y=self.y_train, offsets=None)
        # Assert prediction shape matches expectation, and training metrics are within expected range
        assert (0.0 <= training_metrics['auc'] <= 1.0)
        assert (training_pred.shape[0] == self.x_train.shape[0])

    def test_on_sparse_dataset(self):
        """
        Test training on a sparse dataset
        """
        # Train on sparsified sample data
        self.binary_lr_trainer.fit(X=sparse.csr_matrix(self.x_train),
                                   y=self.y_train,
                                   weights=None,
                                   offsets=None)

        # Get predictions and metrics on the training data
        training_pred = self.binary_lr_trainer.predict_proba(
            X=sparse.csr_matrix(self.x_train), offsets=None)
        training_metrics = self.binary_lr_trainer.compute_metrics(
            X=sparse.csr_matrix(self.x_train), y=self.y_train, offsets=None)
        # Assert prediction shape matches expectation, and training metrics are within expected range
        assert (0.0 <= training_metrics['auc'] <= 1.0)
        assert (training_pred.shape[0] == self.x_train.shape[0])

    def test_on_sparse_dataset_without_bias(self):
        """
        Test training on a sparse dataset
        """
        # Train on sparsified sample data
        self.binary_lr_trainer_without_bias.fit(
            X=sparse.csr_matrix(self.x_train),
            y=self.y_train,
            weights=None,
            offsets=None)

        # Get predictions and metrics on the training data
        training_pred = self.binary_lr_trainer_without_bias.predict_proba(
            X=sparse.csr_matrix(self.x_train), offsets=None)
        training_metrics = self.binary_lr_trainer_without_bias.compute_metrics(
            X=sparse.csr_matrix(self.x_train), y=self.y_train, offsets=None)
        # Assert prediction shape matches expectation, and training metrics are within expected range
        assert (0.0 <= training_metrics['auc'] <= 1.0)
        assert (training_pred.shape[0] == self.x_train.shape[0])

    def test_scoring_on_validation_data(self):
        """
        Test inference and metrics computation
        """
        # Train on sample data
        self.binary_lr_trainer.fit(X=sparse.csr_matrix(self.x_train),
                                   y=self.y_train,
                                   weights=None,
                                   offsets=None)

        # Get predictions and metrics on the test data
        validation_pred = self.binary_lr_trainer.predict_proba(X=self.x_test,
                                                               offsets=None)
        validation_metrics = self.binary_lr_trainer.compute_metrics(
            X=self.x_test, y=self.y_test, offsets=None)

        # Assert prediction shape matches expectation, and training metrics are within expected range
        assert (0.0 <= validation_metrics['auc'] <= 1.0)
        assert (validation_pred.shape[0] == self.x_test.shape[0])

    def test_scoring_should_fail_if_not_trained(self):
        """
        Inference should fail on untrained model
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        with self.assertRaises(Exception):
            self.binary_lr_trainer.predict_proba(X=self.x_test, offsets=None)

    def test_scoring_should_fail_if_custom_weights_not_of_known_type(self):
        """
        Inference should fail if custom weights are neither a NumPy ndarray nor a SciPy sparse matrix
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        # Run inference using a Python list, which is neither a numpy ndarray nor a scipy matrix
        with self.assertRaises(Exception):
            self.binary_lr_trainer.predict_proba(
                X=self.x_test,
                offsets=None,
                custom_theta=self.custom_weights.tolist())

    def test_metrics_computation_should_fail_if_model_not_trained(self):
        """
        Metrics computation should fail on untrained model
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        with self.assertRaises(Exception):
            self.binary_lr_trainer.compute_metrics(X=self.x_test,
                                                   y=self.y_test,
                                                   offsets=None)

    def test_scoring_should_succeed_if_custom_weights_provided(self):
        """
        Inference should succeed on untrained model if custom weights provided
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        validation_pred = self.binary_lr_trainer.predict_proba(
            X=self.x_test, offsets=None, custom_theta=self.custom_weights)
        assert (validation_pred.shape[0] == self.x_test.shape[0])

    def test_metrics_computation_should_succeed_if_custom_weights_provided(
            self):
        """
        Metrics computation should succeed on untrained model if custom weights provided
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        validation_metrics = self.binary_lr_trainer.compute_metrics(
            X=self.x_test,
            y=self.y_test,
            offsets=None,
            custom_theta=self.custom_weights)
        assert (0.0 <= validation_metrics['auc'] <= 1.0)

    def test_training_with_warm_start(self):
        """
        Training with a user provided model for warm start.
        """
        # Get trainer object, but only train 1 L-BFGS step.
        binary_lr_trainer = BinaryLogisticRegressionTrainer(lambda_l2=0.0,
                                                            max_iter=1)
        coefficients_warm_start = binary_lr_trainer.fit(
            X=self.x_train,
            y=self.y_train,
            weights=None,
            offsets=None,
            theta_initial=self.custom_weights)[0][0]
        # Warm start.
        # The trained model should be close to initial value
        # since the solution should have already converged.
        self.assertAllClose(coefficients_warm_start,
                            self.custom_weights,
                            rtol=_TOLERANCE,
                            atol=_TOLERANCE,
                            msg='models mismatch')

        coefficients_cold_start = binary_lr_trainer.fit(
            X=self.x_train,
            y=self.y_train,
            weights=None,
            offsets=None,
            theta_initial=None)[0][0]
        # Cold start
        # The trained model should be far from the initial value since we only train for 1 step,
        # while the initial model was trained to convergence (max_iter=1000).
        self.assertNotAllClose(coefficients_cold_start,
                               self.custom_weights,
                               msg='models are too close')

    def test_fit_with_variance_computation(self):
        """
        Test fit when the variance computation is required
        """
        # Generate the dataset
        num_features = 10
        num_samples = 100
        X = np.random.randn(num_samples, num_features)
        y = np.random.randint(2, size=num_samples)
        weights = np.random.rand(num_samples)
        offsets = np.random.randn(num_samples)
        lambda_l2 = 0.0
        binary_lr_trainer = BinaryLogisticRegressionTrainer(
            lambda_l2=lambda_l2, max_iter=1000, regularize_bias=True)
        expected_simple = compute_coefficients_and_variance(
            X=X,
            y=y,
            weights=weights,
            offsets=offsets,
            variance_mode=constants.SIMPLE,
            lambda_l2=lambda_l2)

        expected_full = compute_coefficients_and_variance(
            X=X,
            y=y,
            weights=weights,
            offsets=offsets,
            variance_mode=constants.FULL,
            lambda_l2=lambda_l2)

        actual_simple = binary_lr_trainer.fit(X=sparse.csr_matrix(X),
                                              y=y,
                                              weights=weights,
                                              offsets=offsets,
                                              variance_mode=constants.SIMPLE)

        actual_full = binary_lr_trainer.fit(X=sparse.csr_matrix(X),
                                            y=y,
                                            weights=weights,
                                            offsets=offsets,
                                            variance_mode=constants.FULL)
        self.assertAllClose(expected_simple[0],
                            actual_simple[0][0],
                            rtol=1e-02,
                            atol=1e-02,
                            msg='simple mean mismatch')
        self.assertAllClose(expected_simple[1],
                            actual_simple[1],
                            rtol=1e-02,
                            atol=1e-02,
                            msg='simple variance mismatch')
        self.assertAllClose(expected_full[0],
                            actual_full[0][0],
                            rtol=1e-02,
                            atol=1e-02,
                            msg='full mean mismatch')
        self.assertAllClose(expected_full[1],
                            actual_full[1],
                            rtol=1e-02,
                            atol=1e-02,
                            msg='full variance mismatch')

    def test_fit_with_variance_computation_without_intercept(self):
        """
        Test fit when the variance computation is required but no intercept is used
        """
        # Generate the dataset
        num_features = 10
        num_samples = 100
        X = np.random.randn(num_samples, num_features)
        y = np.random.randint(2, size=num_samples)
        weights = np.random.rand(num_samples)
        offsets = np.random.randn(num_samples)
        lambda_l2 = 0.0
        binary_lr_trainer = BinaryLogisticRegressionTrainer(
            lambda_l2=lambda_l2,
            max_iter=1000,
            regularize_bias=True,
            has_intercept=False)
        expected_simple = compute_coefficients_and_variance(
            X=X,
            y=y,
            weights=weights,
            offsets=offsets,
            variance_mode=constants.SIMPLE,
            lambda_l2=lambda_l2,
            has_intercept=False)

        expected_full = compute_coefficients_and_variance(
            X=X,
            y=y,
            weights=weights,
            offsets=offsets,
            variance_mode=constants.FULL,
            lambda_l2=lambda_l2,
            has_intercept=False)

        actual_simple = binary_lr_trainer.fit(X=sparse.csr_matrix(X),
                                              y=y,
                                              weights=weights,
                                              offsets=offsets,
                                              variance_mode=constants.SIMPLE)

        actual_full = binary_lr_trainer.fit(X=sparse.csr_matrix(X),
                                            y=y,
                                            weights=weights,
                                            offsets=offsets,
                                            variance_mode=constants.FULL)
        self.assertAllClose(expected_simple[0],
                            actual_simple[0][0],
                            rtol=1e-02,
                            atol=1e-02,
                            msg='simple mean mismatch')
        self.assertAllClose(expected_simple[1],
                            actual_simple[1],
                            rtol=1e-02,
                            atol=1e-02,
                            msg='simple variance mismatch')
        self.assertAllClose(expected_full[0],
                            actual_full[0][0],
                            rtol=1e-02,
                            atol=1e-02,
                            msg='full mean mismatch')
        self.assertAllClose(expected_full[1],
                            actual_full[1],
                            rtol=1e-02,
                            atol=1e-02,
                            msg='full variance mismatch')
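
A hedged sketch of the kind of reference computation these tests compare against. It assumes SIMPLE variance means the reciprocal of the Hessian diagonal and FULL variance the diagonal of the inverse Hessian of the weighted, offset, L2-regularized logistic loss; those conventions, and the omission of an intercept column, are assumptions rather than details taken from compute_coefficients_and_variance itself.

import numpy as np

def logistic_coefficient_variance(X, theta, weights, offsets, lambda_l2, mode="FULL"):
    # Predicted probabilities at the fitted coefficients.
    p = 1.0 / (1.0 + np.exp(-(X @ theta + offsets)))
    # Hessian of the weighted negative log-likelihood plus the L2 term.
    d = weights * p * (1.0 - p)
    hessian = X.T @ (X * d[:, None]) + lambda_l2 * np.eye(X.shape[1])
    if mode == "SIMPLE":
        return 1.0 / np.diag(hessian)          # diagonal approximation
    return np.diag(np.linalg.inv(hessian))     # full inverse, then its diagonal
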