Beispiel #1
0
    def test_acquisition_date_greater_max_date(self):
        """
        GIVEN a sampler and data for a customer whose acquisition date lies after
            the sampler's `max_date`
        WHEN the sampler is applied to that data
        THEN the result is an empty dataframe
        """
        # `min_date` is passed for completeness; per the original note it is
        # overwritten by the (later) acquisition date.
        purchases = self.generate_data_for_one_customer(
            1,
            min_date="2016-01-01",
            max_date="2020-08-01",
            acquisition_date="2020-01-01",
            n_orders=12,
        )

        # The sampling window closes one day before the customer is acquired.
        sampler_kwargs = dict(
            min_date="2016-01-01",
            max_date="2019-12-31",
            lead_time="28d",
            prediction_period="180d",
            lookback="180d",
            samples_per_lookback=1,
        )
        result = BinnedUniformSampler(**sampler_kwargs).generate_samples(
            purchases)

        assert result.empty
Beispiel #2
0
 def test_instantiation_from_config(self, config: Dict[str, Any]):
     """
     GIVEN a valid config for a BinnedUniformSampler
     WHEN the classmethod `from_config` is called with this config
     THEN a BinnedUniformSampler is instantiated.

     :param config: The config-fixture for the sampler.
     :type config: Dict[str,Any]
     """
     instance = BinnedUniformSampler.from_config(config)
     # Pin the THEN clause explicitly rather than relying on the mere absence
     # of an exception.
     assert isinstance(instance, BinnedUniformSampler)
Beispiel #3
0
    def test_y_data_but_no_x_data(self):
        """
        GIVEN a sampler and dummy data for a customer with no purchases falling
            into the lookback period
        WHEN samples are generated from that customer's data
        THEN there are no orderpositions marked with `x_include` in the result.
        """
        lead_time = pd.to_timedelta("1d")
        # Spelled out in days (2 * 365): the year unit ("y") is ambiguous as a
        # timedelta and is not accepted by current pandas versions.
        lookback = pd.to_timedelta("730d")
        prediction_period = pd.to_timedelta("180d")
        max_date = pd.to_datetime("2020-01-01")
        min_date = (max_date - prediction_period - lead_time - lookback -
                    pd.to_timedelta("1d"))

        # a sampler with max date greater than the maximum order date and
        # params such that exactly one sample is created for the customer
        sampler = BinnedUniformSampler(
            min_date=min_date,
            max_date=max_date,
            lead_time=lead_time,
            prediction_period=prediction_period,
            samples_per_lookback=1,
            lookback=lookback,
        )

        # purchases that cover one prediction period, but no lookback
        customer_data = self.generate_data_for_one_customer(
            1, max_date - prediction_period - lead_time, max_date, n_orders=12)

        samples = sampler.generate_samples(customer_data)

        assert samples.index.get_level_values("sample_id").nunique() == 1
        assert samples.x_include.sum() == 0
        assert samples.y_include.sum() >= 1
Beispiel #4
0
 def test_equals_w_equal_instances(self, config: Dict[str, Any]):
     """
     GIVEN one set of keyword arguments for a BinnedUniformSampler
     WHEN two samplers are built independently from those same arguments
     THEN the two samplers compare equal.
     """
     left, right = (BinnedUniformSampler(**config) for _ in range(2))
     assert left == right
Beispiel #5
0
 def test_instantiation_from_to_dict_output(self,
                                            sampler: BinnedUniformSampler):
     """
     GIVEN the dictionary produced by a sampler's `to_dict` method
     WHEN that dictionary is passed to the `from_config` classmethod
     THEN the rebuilt sampler equals the sampler it was derived from.
     """
     rebuilt = BinnedUniformSampler.from_config(sampler.to_dict())
     assert rebuilt == sampler
Beispiel #6
0
 def test_equals_wo_equal_instance(self, sampler: BinnedUniformSampler,
                                   config: Dict[str, Any]):
     """
     GIVEN a valid set of keyword arguments for a BinnedUniformSampler
     WHEN one sampler is built from these arguments and a second one from the
         same arguments with a halved `lookback`
     THEN the two samplers do not compare equal.
     """
     baseline = BinnedUniformSampler(**config)

     # Overwrite the lookback in place with half of the fixture sampler's value.
     halved_lookback = sampler.lookback / 2
     config["lookback"] = str(halved_lookback)
     altered = BinnedUniformSampler(**config)

     assert baseline != altered
Beispiel #7
0
    def test_result_is_reproduceable_within_sampler(self, config, raw_data,
                                                    seed):
        """
        GIVEN a sampler instantiated with a random seed
        WHEN that one sampler is applied to the same data two times in a row
        THEN both runs yield equal dataframes
        """
        seeded_sampler = BinnedUniformSampler(random_seed=seed, **config)

        # Two consecutive runs of the very same instance on identical input.
        run_one, run_two = (seeded_sampler.generate_samples(raw_data)
                            for _ in range(2))

        pd.testing.assert_frame_equal(run_one, run_two)
Beispiel #8
0
 def test_to_dict_is_yaml_serializable(self, sampler: BinnedUniformSampler):
     """
     GIVEN an instantiated BinnedUniformSampler
     WHEN the output of the instance's `to_dict` method is dumped to YAML
     THEN no exception is raised.
     """
     as_dict = sampler.to_dict()
     yaml.dump(as_dict)
Beispiel #9
0
    def test_samples_for_new_customers(
        self,
        min_date,
        max_date,
        lookback,
        prediction_period,
        lead_time,
        samples_per_lookback,
    ):
        """
        GIVEN a set of parameters for a sampler
        WHEN samples are generated from a customer's purchase history that does
            not cover a full lookback
        THEN the number of samples generated for this customer is still
            proportional to the time period covered by the customer's purchase
            history and there is at least one sample regardless of the
            available purchase history.
        """
        min_date = pd.to_datetime(min_date)
        max_date = pd.to_datetime(max_date)
        lookback = pd.to_timedelta(lookback)
        prediction_period = pd.to_timedelta(prediction_period)
        lead_time = pd.to_timedelta(lead_time)
        # NOTE: `samples_per_lookback` is used exactly as supplied. It used to
        # be overwritten with a constant here, which left the supplied value
        # untested.

        # Acquire the customer a random fraction of one lookback before the
        # last point in time that still leaves room for lead time and
        # prediction period.
        acquisition_date = (max_date - (lookback * np.random.uniform()) -
                            prediction_period - lead_time)
        new_customer_data = self.generate_data_for_one_customer(
            1, min_date, max_date, acquisition_date)

        sampler = BinnedUniformSampler(
            min_date=min_date,
            max_date=max_date,
            lead_time=lead_time,
            prediction_period=prediction_period,
            lookback=lookback,
            samples_per_lookback=samples_per_lookback,
        )
        samples = sampler.generate_samples(new_customer_data)

        n_samples = samples.index.get_level_values("sample_id").nunique()

        # Fraction of one lookback that the customer's history covers.
        lookbacks_covered = ((max_date - prediction_period - lead_time) -
                             acquisition_date) / lookback
        # Proportional share of samples, but never fewer than one.
        expected_n_samples = max(
            np.floor(lookbacks_covered * samples_per_lookback), 1)
        assert expected_n_samples == n_samples
        assert n_samples > 0
Beispiel #10
0
    def sampler(self, config: Dict[str, Any]) -> BinnedUniformSampler:
        """Fixture yielding a sampler built from the ``config`` fixture.

        :param config: The config-fixture for the sampler.
        :type config: Dict[str,Any]
        :return: A sampler created via ``BinnedUniformSampler.from_config``
        :rtype: BinnedUniformSampler
        """
        instance = BinnedUniformSampler.from_config(config)
        return instance
Beispiel #11
0
 def test_instantiation_with_missing_field_fails(self, missing_field: str,
                                                 config: Dict[str, Any]):
     """
     GIVEN a config from which the specification of one parameter was removed
     WHEN a BinnedUniformSampler is instantiated without that keyword argument
     THEN a TypeError is raised
     """
     # Drop the entry in place; a key that is absent raises KeyError here.
     config.pop(missing_field)
     with pytest.raises(TypeError):
         BinnedUniformSampler(**config)
Beispiel #12
0
 def test_no_prediction_time_outside_min_and_max_date(
         self, sampler: BinnedUniformSampler, raw_data: pd.DataFrame):
     """
     GIVEN a sampler instance and some dummy purchase data for multiple customers
     WHEN samples are created from this data using the sampler
     THEN the prediction time of every sample lies strictly between
         `sampler.min_date` and `sampler.max_date`
     """
     prediction_times = sampler.generate_samples(raw_data).prediction_time
     lower_bound, upper_bound = sampler.min_date, sampler.max_date

     # Both comparisons are strict: the boundary dates themselves are rejected.
     assert (prediction_times > lower_bound).all()
     assert (prediction_times < upper_bound).all()
Beispiel #13
0
    def test_instantiation_with_wrong_data_type_fails(self, param: str,
                                                      value_wrong_type: Any,
                                                      config: Dict[str, Any]):
        """
        GIVEN a value of the wrong data type for one keyword argument
        WHEN a BinnedUniformSampler is instantiated with that value
        THEN a ValueError is raised.
        """
        # Swap the offending value into the otherwise valid config (in place).
        config.update({param: value_wrong_type})

        with pytest.raises(ValueError):
            BinnedUniformSampler(**config)
Beispiel #14
0
 def test_no_data_in_lead_time(self, sampler: BinnedUniformSampler,
                               raw_data: pd.DataFrame):
     """
     GIVEN a sampler with a lead time and some dummy purchase data
     WHEN samples are generated from this purchase data
     THEN no order position flagged via `y_include` falls into the lead time
     """
     sampled = sampler.generate_samples(raw_data)
     targets = sampled[sampled.y_include]

     # Flagged rows must be dated strictly after prediction time + lead time.
     earliest_allowed = sampler.lead_time + targets.prediction_time
     assert (targets.order_date > earliest_allowed).all()
Beispiel #15
0
 def test_sampled_data_columns(
     self,
     sampler: BinnedUniformSampler,
     expected_column: str,
     raw_data: pd.DataFrame,
 ):
     """
     GIVEN a sampler and dummy purchase data for multiple customers
     WHEN samples are created from this dummy data using the sampler
     THEN the expected column is present in the sampler's output, either as a
         regular column or as an index level.
     """
     result = sampler.generate_samples(raw_data)
     # Index level names count as columns for the purpose of this check.
     available = [*result.columns, *result.index.names]
     assert expected_column in available
Beispiel #16
0
    def test_result_is_reproduceable_between_samplers(self, config, raw_data,
                                                      seed):
        """
        GIVEN two samplers with the same random seed and the same other parameters
        WHEN both samplers are applied to the same raw data
        THEN the result is the same
        """
        # Each pass builds a fresh sampler and runs it right away, so the
        # sequence is: build, sample, build, sample.
        outcomes = [
            BinnedUniformSampler(random_seed=seed,
                                 **config).generate_samples(raw_data)
            for _ in range(2)
        ]

        pd.testing.assert_frame_equal(outcomes[0], outcomes[1])
Beispiel #17
0
    def test_different_seeds_produce_different_outcomes(
            self, config, raw_data, seed):
        """
        GIVEN two samplers with the same parameters but different random seeds
        WHEN both samplers are applied to the same raw data
        THEN the result is different
        """
        first_sampler = BinnedUniformSampler(random_seed=seed, **config)
        second_sampler = BinnedUniformSampler(random_seed=seed + 1, **config)

        first_result = first_sampler.generate_samples(raw_data)
        second_result = second_sampler.generate_samples(raw_data)

        # `assert_frame_equal` reports a mismatch by raising AssertionError, so
        # the two frames differ exactly when that exception is raised. If the
        # frames are equal, `pytest.raises` fails the test.
        with pytest.raises(AssertionError):
            pd.testing.assert_frame_equal(first_result, second_result)
Beispiel #18
0
 def test_instantiation_with_all_params(self, config):
     """
     GIVEN the config fixture for a BinnedUniformSampler
     WHEN the sampler is instantiated with the config's entries as keyword
         arguments
     THEN no exception is raised.
     """
     _ = BinnedUniformSampler(**config)
Beispiel #19
0
    def test_samples_within_bins(
        self,
        min_date,
        max_date,
        lead_time,
        prediction_period,
        lookback,
        samples_per_lookback,
        random_seed,
    ):
        """
        GIVEN a set of parameters for a sampler
        WHEN samples are generated from one customer's data using a sampler
            instantiated with these parameters
        THEN the number of samples matches the expectation and every sample
            falls into its own equally sized bin of the sampling range.
        """
        np.random.seed(random_seed)

        # Purchase history spanning the full [min_date, max_date] range.
        customer_data = self.generate_data_for_one_customer(
            1, min_date, max_date)

        # The sampler receives the raw parameter values; the pandas
        # conversions below serve only the expectation arithmetic.
        sampler = BinnedUniformSampler(
            min_date=min_date,
            max_date=max_date,
            lead_time=lead_time,
            prediction_period=prediction_period,
            lookback=lookback,
            samples_per_lookback=samples_per_lookback,
        )
        min_date, max_date = pd.to_datetime(min_date), pd.to_datetime(max_date)
        lead_time = pd.to_timedelta(lead_time)
        prediction_period = pd.to_timedelta(prediction_period)
        lookback = pd.to_timedelta(lookback)

        samples = sampler.generate_samples(customer_data)

        # Boundaries of the range from which prediction times are drawn.
        range_end = max_date - prediction_period - lead_time
        range_start = max(min_date, customer_data.acquisition_date.max())

        # Expected sample count: proportional to the number of lookbacks in the
        # range, at least one if the range is non-empty, otherwise zero.
        lookbacks_covered = (range_end - range_start) / lookback
        proportional_count = np.floor(
            lookbacks_covered * samples_per_lookback).astype(int)
        if range_end > range_start:
            n_samples_expected = max(proportional_count, 1)
        else:
            n_samples_expected = 0

        # Customers with enough history get a full lookback before the first
        # prediction time.
        if range_start < (range_end - lookback):
            range_start = range_start + lookback
        range_start = max(range_start, min_date + lookback)

        # Width of one sampling bin.
        # NOTE(review): divides by zero when no samples are expected - confirm
        # the parametrisation never produces that case.
        bin_width = (range_end - range_start) / n_samples_expected

        # Map the prediction time of every sample onto its bin number.
        prediction_times = samples.groupby("sample_id").prediction_time.first()
        bin_numbers = np.floor((prediction_times - range_start) / bin_width)

        # The expected number of samples was actually generated ...
        n_samples = samples.index.get_level_values("sample_id").nunique()
        assert n_samples == n_samples_expected
        # ... and no two samples share a bin.
        assert bin_numbers.nunique() == n_samples_expected