Example #1
    def test_valid_input(self):
        expected_result = [
            datetime.datetime(2015, 3, 1, 0, 0),
            datetime.datetime(2015, 6, 1, 0, 0),
            datetime.datetime(2015, 9, 1, 0, 0),
            datetime.datetime(2015, 12, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0),
            datetime.datetime(2016, 6, 1, 0, 0),
        ]
        chopper = Timechop(
            feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
            feature_end_time=datetime.datetime(2017, 1, 1, 0, 0),
            label_start_time=datetime.datetime(2015, 1, 1, 0, 0),
            label_end_time=datetime.datetime(2017, 1, 1, 0, 0),
            model_update_frequency="3 months",
            training_as_of_date_frequencies=["1 day"],
            test_as_of_date_frequencies=["1 day"],
            max_training_histories=["1 year"],
            test_durations=["6 months"],
            test_label_timespans=["1 months"],
            training_label_timespans=["3 days"],
        )

        # with this configuration the method should return the split times
        # listed in expected_result above
        result = chopper.calculate_train_test_split_times(
            training_label_timespan=convert_str_to_relativedelta("3 days"),
            test_duration="6 months",
            test_label_timespan=convert_str_to_relativedelta("1 month"),
        )

        assert result == expected_result
Example #2
    def chop_time(self):
        """ Given the attributes of the object, define all train/test splits
        for all combinations of the temporal parameters.

        :return: a list of dictionaries defining train/test splits
        :rtype: list
        """
        matrix_set_definitions = []
        # in our example, we just have one value for each of these: 6month, 6month, and 3month
        for training_label_timespan, test_label_timespan, test_duration in itertools.product(
                self.training_label_timespans, self.test_label_timespans,
                self.test_durations):
            # calculating the train-test split times starts from the end and walks backwards
            # e.g., train_test_split_times for our example with a 1 year model_update_frequency
            # will be every Oct. 1 from 2012 to 2016 (see comments in the method for details
            # on the calculation):
            # train_test_split_times = [2012-10-01, 2013-10-01, 2014-10-01, 2015-10-01, 2016-10-01]
            logging.info(
                'Calculating train/test split times for training prediction span {}, '
                'test prediction span {}, test span {}'.format(
                    training_label_timespan, test_label_timespan,
                    test_duration))
            train_test_split_times = self.calculate_train_test_split_times(
                training_label_timespan=convert_str_to_relativedelta(
                    training_label_timespan),
                test_label_timespan=convert_str_to_relativedelta(
                    test_label_timespan),
                test_duration=test_duration)
            logging.info(
                'Train/test split times: {}'.format(train_test_split_times))

            # handle each training_as_of_date_frequency and max_training_history separately
            # to create matrices for each train_test_split_time.
            # in our example, we only have one value for each: 1day and 2year
            for training_as_of_date_frequency, max_training_history in itertools.product(
                    self.training_as_of_date_frequencies,
                    self.max_training_histories):
                logging.info(
                    'Generating matrix definitions for training_as_of_date_frequency {}, '
                    'max_training_history {}'.format(
                        training_as_of_date_frequency, max_training_history))
                for train_test_split_time in train_test_split_times:
                    logging.info(
                        'Generating matrix definitions for split {}'.format(
                            train_test_split_time))
                    matrix_set_definitions.append(
                        self.generate_matrix_definitions(
                            train_test_split_time=train_test_split_time,
                            training_as_of_date_frequency=training_as_of_date_frequency,
                            max_training_history=max_training_history,
                            test_duration=test_duration,
                            training_label_timespan=training_label_timespan,
                            test_label_timespan=test_label_timespan))
        return matrix_set_definitions
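
A minimal usage sketch (not part of the example above): build a Timechop the way the tests in this collection do and inspect the split dictionaries that chop_time() returns. The import path is assumed from the triage.component.timechop reference in the plotting example further down.

import datetime
from triage.component.timechop import Timechop  # assumed import path

chopper = Timechop(
    feature_start_time=datetime.datetime(2010, 1, 1),
    feature_end_time=datetime.datetime(2017, 1, 1),
    label_start_time=datetime.datetime(2015, 1, 1),
    label_end_time=datetime.datetime(2017, 1, 1),
    model_update_frequency="3 months",
    training_as_of_date_frequencies=["1 day"],
    test_as_of_date_frequencies=["1 day"],
    max_training_histories=["1 year"],
    test_durations=["6 months"],
    test_label_timespans=["1 month"],
    training_label_timespans=["3 days"],
)

# each split carries a train_matrix dict and a list of test_matrices dicts
for split in chopper.chop_time():
    train = split["train_matrix"]
    print(train["first_as_of_time"], train["last_as_of_time"], len(train["as_of_times"]))
    for test in split["test_matrices"]:
        print("  test:", test["first_as_of_time"], test["last_as_of_time"])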
Example #3
 def _validate_time_intervals(self, intervals):
     logging.info('Validating time intervals')
     for interval in intervals:
         if interval != 'all':
             # this function, used elsewhere to break up time intervals,
             # will throw an error if the interval can't be converted to a
             # relativedelta
             try:
                 convert_str_to_relativedelta(interval)
             except Exception as e:
                 raise ValueError(dedent('''
                 Section: feature_aggregations -
                 Time interval is invalid.
                 interval: "{}"
                 Full error: {}'''.format(interval, e)))
Example #4
def test_calculate_as_of_times_three_day_freq():
    expected_result = [
        datetime.datetime(2011, 1, 1, 0, 0),
        datetime.datetime(2011, 1, 4, 0, 0),
        datetime.datetime(2011, 1, 7, 0, 0),
        datetime.datetime(2011, 1, 10, 0, 0),
    ]
    chopper = Timechop(
        feature_start_time=datetime.datetime(1990, 1, 1, 0, 0),
        feature_end_time=datetime.datetime(2012, 1, 1, 0, 0),
        label_start_time=datetime.datetime(2010, 1, 1, 0, 0),
        label_end_time=datetime.datetime(2012, 1, 1, 0, 0),
        model_update_frequency='1 year',
        training_as_of_date_frequencies=['1 days'],
        test_as_of_date_frequencies=['7 days'],
        max_training_histories=['10 days', '1 year'],
        test_durations=['1 month'],
        test_label_timespans=['1 day'],
        training_label_timespans=['3 months']
    )
    result = chopper.calculate_as_of_times(
        as_of_start_limit=datetime.datetime(2011, 1, 1, 0, 0),
        as_of_end_limit=datetime.datetime(2011, 1, 11, 0, 0),
        data_frequency=convert_str_to_relativedelta('3 days'),
        forward=True
    )
    assert result == expected_result
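
For reference, a minimal sketch (not the library implementation) of the stepping that this test and the next one exercise: start at the lower limit and advance by the data frequency until the inclusive upper limit is passed.

import datetime
from dateutil.relativedelta import relativedelta

def step_by_frequency(start, end, delta):
    """Collect every time from start up to and including end, stepping by delta."""
    times = []
    current = start
    while current <= end:
        times.append(current)
        current += delta
    return times

# reproduces the expected_result of the three-day-frequency test above
print(step_by_frequency(
    datetime.datetime(2011, 1, 1),
    datetime.datetime(2011, 1, 11),
    relativedelta(days=3),
))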
Example #5
def test_calculate_as_of_times_one_day_freq():
    expected_result = [
        datetime.datetime(2011, 1, 1, 0, 0),
        datetime.datetime(2011, 1, 2, 0, 0),
        datetime.datetime(2011, 1, 3, 0, 0),
        datetime.datetime(2011, 1, 4, 0, 0),
        datetime.datetime(2011, 1, 5, 0, 0),
        datetime.datetime(2011, 1, 6, 0, 0),
        datetime.datetime(2011, 1, 7, 0, 0),
        datetime.datetime(2011, 1, 8, 0, 0),
        datetime.datetime(2011, 1, 9, 0, 0),
        datetime.datetime(2011, 1, 10, 0, 0),
        datetime.datetime(2011, 1, 11, 0, 0),
    ]
    chopper = Timechop(
        feature_start_time=datetime.datetime(1990, 1, 1, 0, 0),
        feature_end_time=datetime.datetime(2012, 1, 1, 0, 0),
        label_start_time=datetime.datetime(2010, 1, 1, 0, 0),
        label_end_time=datetime.datetime(2012, 1, 1, 0, 0),
        model_update_frequency="1 year",
        training_as_of_date_frequencies=["1 days"],
        test_as_of_date_frequencies=["7 days"],
        max_training_histories=["10 days", "1 year"],
        test_durations=["1 month"],
        test_label_timespans=["1 day"],
        training_label_timespans=["3 months"],
    )
    result = chopper.calculate_as_of_times(
        as_of_start_limit=datetime.datetime(2011, 1, 1, 0, 0),
        as_of_end_limit=datetime.datetime(2011, 1, 11, 0, 0),
        data_frequency=convert_str_to_relativedelta("1 days"),
    )
    assert result == expected_result
Example #6
 def _validate_time_intervals(self, intervals):
     logger.spam("Validating time intervals")
     for interval in intervals:
         if interval != "all":
             # this function, used elsewhere to break up time intervals,
             # will throw an error if the interval can't be converted to a
             # relativedelta
             try:
                 convert_str_to_relativedelta(interval)
                 logger.debug("Validation of time intervals was successful")
             except Exception as e:
                 raise ValueError(
                     dedent("""
                 Section: feature_aggregations -
                 Time interval is invalid.
                 interval: "{}"
                 Full error: {}""".format(interval, e)))
Example #7
    def __init__(
        self,
        feature_start_time,
        feature_end_time,
        label_start_time,
        label_end_time,
        model_update_frequency,
        training_as_of_date_frequencies,
        max_training_histories,
        training_label_timespans,
        test_as_of_date_frequencies,
        test_durations,
        test_label_timespans,
    ):
        self.feature_start_time = dt_from_str(
            feature_start_time)  # earliest time included in any feature
        self.feature_end_time = dt_from_str(
            feature_end_time)  # all data included in features are < this time
        if self.feature_start_time > self.feature_end_time:
            raise ValueError("Feature start time after feature end time.")

        self.label_start_time = dt_from_str(
            label_start_time)  # earliest time included in any label
        self.label_end_time = dt_from_str(
            label_end_time)  # all data in any label are < this time
        if self.label_start_time > self.label_end_time:
            raise ValueError("Label start time after label end time.")

        # how frequently to retrain models
        self.model_update_frequency = convert_str_to_relativedelta(
            model_update_frequency)

        # time between rows for same entity in train matrix
        self.training_as_of_date_frequencies = utils.convert_to_list(
            training_as_of_date_frequencies)

        # time between rows for same entity in test matrix
        self.test_as_of_date_frequencies = utils.convert_to_list(
            test_as_of_date_frequencies)

        # how much history for each entity to train on
        self.max_training_histories = utils.convert_to_list(
            max_training_histories)

        # how long into the future to make predictions for each entity
        self.test_durations = utils.convert_to_list(test_durations)

        # how much time is included in a label in the train matrix
        self.training_label_timespans = utils.convert_to_list(
            training_label_timespans)

        # how much time is included in a label in the test matrix
        self.test_label_timespans = utils.convert_to_list(test_label_timespans)
Example #8
    def test_labels_after_features(self):
        chopper = Timechop(
            feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
            feature_end_time=datetime.datetime(2016, 1, 1, 0, 0),
            label_start_time=datetime.datetime(2015, 1, 1, 0, 0),
            label_end_time=datetime.datetime(2017, 1, 1, 0, 0),
            model_update_frequency="3 months",
            training_as_of_date_frequencies=["1 day"],
            test_as_of_date_frequencies=["1 day"],
            max_training_histories=["1 year"],
            test_durations=["6 months"],
            test_label_timespans=["1 months"],
            training_label_timespans=["3 days"],
        )

        # this should throw an exception because last possible label date is after
        # end of feature time
        with self.assertRaises(ValueError):
            chopper.calculate_train_test_split_times(
                training_label_timespan=convert_str_to_relativedelta("3 days"),
                test_duration="6 months",
                test_label_timespan=convert_str_to_relativedelta("1 month"),
            )
Example #9
    def test_no_valid_label_dates(self):
        chopper = Timechop(
            feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
            feature_end_time=datetime.datetime(2016, 1, 1, 0, 0),
            label_start_time=datetime.datetime(2015, 1, 1, 0, 0),
            label_end_time=datetime.datetime(2015, 2, 1, 0, 0),
            model_update_frequency="3 months",
            training_as_of_date_frequencies=["1 day"],
            test_as_of_date_frequencies=["1 day"],
            max_training_histories=["1 year"],
            test_durations=["6 months"],
            test_label_timespans=["1 months"],
            training_label_timespans=["3 days"],
        )

        # this should raise an error because there are no valid label dates in
        # the labeling time (label span is longer than labeling time)
        with self.assertRaises(ValueError):
            chopper.calculate_train_test_split_times(
                training_label_timespan=convert_str_to_relativedelta("3 days"),
                test_duration="6 months",
                test_label_timespan=convert_str_to_relativedelta("1 month"),
            )
Example #10
    def get_temporal_config_for_retrain(self, prediction_date):
        temporal_config = self.experiment_config['temporal_config'].copy()
        temporal_config['feature_end_time'] = datetime.strftime(
            prediction_date, "%Y-%m-%d")
        temporal_config['label_end_time'] = datetime.strftime(
            prediction_date +
            convert_str_to_relativedelta(self.test_label_timespan), "%Y-%m-%d")
        # just needs to be bigger than the gap between the label start and end times
        # to ensure we only get one time split for the retraining
        temporal_config['model_update_frequency'] = '%syears' % (
            dt_from_str(temporal_config['label_end_time']).year -
            dt_from_str(temporal_config['label_start_time']).year + 10)

        return temporal_config
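
A hedged illustration of the model_update_frequency arithmetic above, using hypothetical label dates: the year gap between label start and end plus a 10-year pad yields an interval long enough that only one time split can fit.

from datetime import datetime

# hypothetical label boundaries, for illustration only
label_start = datetime.strptime("2015-01-01", "%Y-%m-%d")
label_end = datetime.strptime("2021-03-01", "%Y-%m-%d")
model_update_frequency = "%syears" % (label_end.year - label_start.year + 10)
print(model_update_frequency)  # "16years"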
Example #11
    def _run(self, temporal_config):
        def dt_from_str(dt_str):
            return datetime.strptime(dt_str, "%Y-%m-%d")

        splits = []
        try:
            chopper = Timechop(
                feature_start_time=dt_from_str(
                    temporal_config["feature_start_time"]),
                feature_end_time=dt_from_str(
                    temporal_config["feature_end_time"]),
                label_start_time=dt_from_str(
                    temporal_config["label_start_time"]),
                label_end_time=dt_from_str(temporal_config["label_end_time"]),
                model_update_frequency=temporal_config[
                    "model_update_frequency"],
                training_label_timespans=temporal_config[
                    "training_label_timespans"],
                test_label_timespans=temporal_config["test_label_timespans"],
                training_as_of_date_frequencies=temporal_config[
                    "training_as_of_date_frequencies"],
                test_as_of_date_frequencies=temporal_config[
                    "test_as_of_date_frequencies"],
                max_training_histories=temporal_config[
                    "max_training_histories"],
                test_durations=temporal_config["test_durations"],
            )
            splits = chopper.chop_time()
        except Exception as e:
            raise ValueError(
                dedent("""
            Section: temporal_config -
            Timechop could not produce temporal splits from config {}.
            Error: {}
            """.format(temporal_config, e)))
        for split_num, split in enumerate(splits):
            if len(split["train_matrix"]["as_of_times"]) == 0:
                raise ValueError(
                    dedent("""
                Section: temporal_config -
                Computed split {} has a train matrix with no as_of_times.
                """.format(split)))

            # timechop computes the last time available to train data
            # and stores it in the matrix as 'matrix_info_end_time'
            # but to be more sure, let's double-check by comparing as_of_times
            # in the train and all associated test matrices
            train_max_data_time = max(
                split["train_matrix"]["as_of_times"]
            ) + convert_str_to_relativedelta(
                split["train_matrix"]["training_label_timespan"]
            )

            for test_matrix in split["test_matrices"]:
                if len(test_matrix["as_of_times"]) == 0:
                    raise ValueError(
                        dedent("""
                    Section: temporal_config -
                    Computed split {} has a test matrix with no as_of_times.
                    """.format(split)))
                overlapping_times = [
                    as_of_time for as_of_time in test_matrix["as_of_times"]
                    if as_of_time < train_max_data_time
                ]
                if overlapping_times:
                    raise ValueError(
                        dedent("""
                    Section: temporal_config -
                    Computed split index {} has a test matrix with as_of_times {}
                    < the maximum train as_of_time + train label timespan.
                    ({}). This is likely an error in timechop. See the
                    experiment's split_definitions[{}] for more information""".
                               format(
                                   split_num,
                                   overlapping_times,
                                   train_max_data_time,
                                   split_num,
                               )))
Example #12
    def define_test_matrices(self, train_test_split_time, test_duration,
                             test_label_timespan):
        """ Given a train/test split time and a set of testing parameters,
        generate the metadata and as of times for the test matrices in a split.

        :param train_test_split_time: the limit of the last label in the matrix
        :type train_test_split_time: datetime.datetime
        :param test_duration: how far forward from split do test as_of_times go
        :type test_duration: str
        :param test_label_timespan: how much time is covered by test labels
        :type test_label_timespan: str

        :return: list of dictionaries defining the test matrices for a split
        :rtype: list
        """

        # for our example, this will be called with:
        #   train_test_split_time = 2016-10-01
        #   test_duration = 3month
        #   test_label_timespan = 6month

        # the as_of_time_limit is simply the split time plus the test_duration and we
        # can avoid checking here for any issues with the label_end_time or
        # feature_end_time since we've guaranteed that those limits would be
        # satisfied when we calculated the train_test_split_times initially
        #
        # for the example, as_of_time_limit = 2016-10-01 + 3month = 2017-01-01
        # (note as well that this will be treated as an _exclusive_ limit)
        logging.info(
            "Generating test matrix definitions for train/test split %s",
            train_test_split_time)
        test_definitions = []
        test_delta = convert_str_to_relativedelta(test_duration)
        as_of_time_limit = train_test_split_time + test_delta
        logging.info("All test as of times before %s", as_of_time_limit)

        # calculate the as_of_times associated with each test data frequency
        # for our example, we just have one, 1month
        for test_as_of_date_frequency in self.test_as_of_date_frequencies:
            logging.info(
                "Generating test matrix definitions for test data frequency %s",
                test_as_of_date_frequency)

            # for test as_of_times we step _forwards_ from the train_test_split_time
            # to ensure that we always have a prediction set made immediately after
            # training is done (so, the freshest possible predictions) even if the
            # frequency doesn't divide the test_duration evenly so there's a gap before
            # the as_of_time_limit
            #
            # for our example, this will give three as_of_dates:
            #   [2016-10-01, 2016-11-01, 2016-12-01]
            # since we start at the train_test_split_time (2016-10-01) and walk forward by
            # the test_as_of_date_frequency (1 month) until we've exhausted the test_duration
            # (3 months), exclusive (see comments in the method for details)
            test_as_of_times = self.calculate_as_of_times(
                as_of_start_limit=train_test_split_time,
                as_of_end_limit=as_of_time_limit,
                data_frequency=convert_str_to_relativedelta(
                    test_as_of_date_frequency),
                forward=True,
            )
            logging.info("test as of times: %s", test_as_of_times)
            test_definition = {
                "first_as_of_time": train_test_split_time,
                "last_as_of_time": max(test_as_of_times),
                "matrix_info_end_time": max(test_as_of_times)
                + convert_str_to_relativedelta(test_label_timespan),
                "as_of_times": AsOfTimeList(test_as_of_times),
                "test_label_timespan": test_label_timespan,
                "test_as_of_date_frequency": test_as_of_date_frequency,
                "test_duration": test_duration,
            }
            test_definitions.append(test_definition)
        return test_definitions
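
A worked sketch of the example described in the comments above, with the values taken from those comments: step forward from the 2016-10-01 split time by the 1 month test_as_of_date_frequency until the exclusive as_of_time_limit of 2017-01-01.

import datetime
from dateutil.relativedelta import relativedelta

split_time = datetime.datetime(2016, 10, 1)
as_of_time_limit = split_time + relativedelta(months=3)   # 2017-01-01, exclusive
test_as_of_times = []
current = split_time
while current < as_of_time_limit:
    test_as_of_times.append(current)
    current += relativedelta(months=1)
print(test_as_of_times)  # [2016-10-01, 2016-11-01, 2016-12-01]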
Example #13
def show_timechop(chopper,
                  show_as_of_times=True,
                  show_boundaries=True,
                  file_name=None):

    plt.close('all')

    chops = chopper.chop_time()

    chops.reverse()

    fig, ax = plt.subplots(len(chops),
                           sharex=True,
                           sharey=True,
                           figsize=FIG_SIZE)

    for idx, chop in enumerate(chops):
        train_as_of_times = chop['train_matrix']['as_of_times']
        test_as_of_times = chop['test_matrices'][0]['as_of_times']

        max_training_history = chop['train_matrix']['max_training_history']
        test_label_timespan = chop['test_matrices'][0]['test_label_timespan']
        training_label_timespan = chop['train_matrix'][
            'training_label_timespan']

        color_rgb = np.random.random(3)

        if show_as_of_times:
            # Train matrix (as_of_times)
            ax[idx].hlines(
                [x for x in range(len(train_as_of_times))],
                [x.date() for x in train_as_of_times], [
                    x.date() +
                    convert_str_to_relativedelta(training_label_timespan)
                    for x in train_as_of_times
                ],
                linewidth=3,
                color=color_rgb,
                label=f"train_{idx}")

            # Test matrix
            ax[idx].hlines([x for x in range(len(test_as_of_times))], [
                x.date() for x in test_as_of_times
            ], [
                x.date() + convert_str_to_relativedelta(test_label_timespan)
                for x in test_as_of_times
            ],
                           linewidth=3,
                           color=color_rgb,
                           label=f"test_{idx}")

        if show_boundaries:
            # Limits: train
            ax[idx].axvspan(chop['train_matrix']['first_as_of_time'],
                            chop['train_matrix']['last_as_of_time'],
                            color=color_rgb,
                            alpha=0.3)

            ax[idx].axvline(chop['train_matrix']['matrix_info_end_time'],
                            color='k',
                            linestyle='--')

            # Limits: test
            ax[idx].axvspan(chop['test_matrices'][0]['first_as_of_time'],
                            chop['test_matrices'][0]['last_as_of_time'],
                            color=color_rgb,
                            alpha=0.3)

            ax[idx].axvline(chop['feature_start_time'],
                            color='k',
                            linestyle='--',
                            alpha=0.2)
            ax[idx].axvline(chop['feature_end_time'],
                            color='k',
                            linestyle='--',
                            alpha=0.2)
            ax[idx].axvline(chop['label_start_time'],
                            color='k',
                            linestyle='--',
                            alpha=0.2)
            ax[idx].axvline(chop['label_end_time'],
                            color='k',
                            linestyle='--',
                            alpha=0.2)

            ax[idx].axvline(chop['test_matrices'][0]['matrix_info_end_time'],
                            color='k',
                            linestyle='--')

        ax[idx].yaxis.set_major_locator(plt.NullLocator())
        ax[idx].yaxis.set_label_position("right")
        ax[idx].set_ylabel(f"Block {idx}", rotation='horizontal', labelpad=30)

        ax[idx].xaxis.set_major_formatter(md.DateFormatter('%Y'))
        ax[idx].xaxis.set_major_locator(md.YearLocator())
        ax[idx].xaxis.set_minor_locator(md.MonthLocator())

    ax[0].set_title('Timechop: Temporal cross-validation blocks')
    fig.subplots_adjust(hspace=0)
    plt.setp([a.get_xticklabels() for a in fig.axes[:-1]], visible=False)

    file_name = os.path.join(TRIAGE_OUTPUT_PATH, "images", file_name)
    fig.savefig(file_name)

    plt.show()

    return file_name
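
A hedged usage sketch: render and save the chops for an already-configured Timechop instance. It assumes chopper is a Timechop built as in the other examples, that TRIAGE_OUTPUT_PATH points at an existing directory containing an "images" subdirectory, and that a file name (hypothetical here) is always supplied, since the function joins and saves to that path.

saved_path = show_timechop(chopper,
                           show_as_of_times=True,
                           show_boundaries=True,
                           file_name="timechop_blocks.png")
print(saved_path)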
Example #14
 def test_valid_input(self):
     date = datetime.datetime(2016, 1, 1, 0, 0)
     for test in (
         {
             'interval': '1 year',
             'addition_result': datetime.datetime(2017, 1, 1, 0, 0),
             'subtraction_result': datetime.datetime(2015, 1, 1, 0, 0)
         },
         {
             'interval': '2 months',
             'addition_result': datetime.datetime(2016, 3, 1, 0, 0),
             'subtraction_result': datetime.datetime(2015, 11, 1, 0, 0)
         },
         {
             'interval': '3 days',
             'addition_result': datetime.datetime(2016, 1, 4, 0, 0),
             'subtraction_result': datetime.datetime(2015, 12, 29, 0, 0)
         },
         {
             'interval': '3 seconds',
             'addition_result': datetime.datetime(2016, 1, 1, 0, 0, 3),
             'subtraction_result':
             datetime.datetime(2015, 12, 31, 23, 59, 57)
         },
         {
             'interval': '2year',
             'addition_result': datetime.datetime(2018, 1, 1, 0, 0),
             'subtraction_result': datetime.datetime(2014, 1, 1, 0, 0)
         },
         {
             'interval': '4 weeks',
             'addition_result': datetime.datetime(2016, 1, 29, 0, 0),
             'subtraction_result': datetime.datetime(2015, 12, 4, 0, 0)
         },
         {
             'interval': '5 hours',
             'addition_result': datetime.datetime(2016, 1, 1, 5, 0),
             'subtraction_result': datetime.datetime(2015, 12, 31, 19, 0)
         },
         {
             'interval': '10minutes',
             'addition_result': datetime.datetime(2016, 1, 1, 0, 10),
             'subtraction_result': datetime.datetime(2015, 12, 31, 23, 50)
         },
         {
             'interval':
             '1microsecond',
             'addition_result':
             datetime.datetime(2016, 1, 1, 0, 0, 0, 1),
             'subtraction_result':
             datetime.datetime(2015, 12, 31, 23, 59, 59, 999999)
         },
         {
             'interval': '5m',
             'addition_result': datetime.datetime(2016, 1, 1, 0, 5),
             'subtraction_result': datetime.datetime(2015, 12, 31, 23, 55),
         },
     ):
         delta = convert_str_to_relativedelta(test['interval'])
         assert date + delta == test['addition_result']
         assert date - delta == test['subtraction_result']
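
For reference, a direct use of the helper this test exercises; it assumes convert_str_to_relativedelta is imported the same way these tests import it (its module path is not shown in this collection).

import datetime

delta = convert_str_to_relativedelta("2 months")
print(datetime.datetime(2016, 1, 1) + delta)  # 2016-03-01 00:00:00
print(datetime.datetime(2016, 1, 1) - delta)  # 2015-11-01 00:00:00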
Example #15
    def define_train_matrix(
        self,
        train_test_split_time,
        training_label_timespan,
        max_training_history,
        training_as_of_date_frequency,
    ):
        """ Given a split time and the parameters of a training matrix, generate
        the as of times and metadata for a train matrix.

        :param train_test_split_time: the limit of the last label in the matrix
        :type train_test_split_time: datetime.datetime
        :param training_label_timespan: how much time is covered by the labels
        :type training_label_timespan: str
        :param max_training_history: how far back from split do as_of_times go
        :type max_training_history: str
        :param training_as_of_date_frequency: how much time between rows for an entity
        :type training_as_of_date_frequency: str

        :return: dictionary containing the temporal parameters and as of times
                 for a train matrix
        :rtype: dict
        """

        # for our example, this will be called with:
        #   train_test_split_time = 2016-10-01
        #   training_label_timespan = 6month
        #   max_training_history = 2year
        #   training_as_of_date_frequency = 1day

        # last as of time in the matrix is 1 label span before split to provide
        # enough of a buffer for the label data to avoid spilling into the test
        # matrix and causing a leakage problem.
        #
        # e.g., last_train_as_of_time = 2016-10-01 - 6month = 2016-04-01
        training_prediction_delta = convert_str_to_relativedelta(
            training_label_timespan)
        last_train_as_of_time = train_test_split_time - training_prediction_delta

        # earliest time in matrix can't be farther back than the latest of the beginning
        # of label time or the beginning of feature time -- whichever is latest is the
        # limit if the amount of history we want to take would go further back.
        #
        # e.g., 2016-04-01 - 2year = 2014-04-01, which is later than both our
        # label_start_time (2012-01-01) and our feature_start_time (1995-01-01), so we
        # can use earliest_possible_train_as_of_time = 2014-04-01
        max_training_delta = convert_str_to_relativedelta(max_training_history)
        earliest_possible_train_as_of_time = last_train_as_of_time - max_training_delta
        experiment_as_of_time_limit = max(self.label_start_time,
                                          self.feature_start_time)
        if earliest_possible_train_as_of_time < experiment_as_of_time_limit:
            earliest_possible_train_as_of_time = experiment_as_of_time_limit
        logging.info("earliest possible train as of time: %s",
                     earliest_possible_train_as_of_time)

        # with the last as of time and the earliest possible time known,
        # calculate all the as of times for the matrix, stepping backwards
        # from the last as of time (to ensure that we use the latest possible
        # training data even if there's a gap and things don't line up
        # exactly) by the training_as_of_date_frequency
        #
        # for our example, this will give us a list of every day from 2014-04-01
        # through 2016-04-01, including _both_ endpoints
        train_as_of_times = self.calculate_as_of_times(
            as_of_start_limit=earliest_possible_train_as_of_time,
            as_of_end_limit=last_train_as_of_time,
            data_frequency=convert_str_to_relativedelta(
                training_as_of_date_frequency),
        )
        logging.info("train as of times: %s", train_as_of_times)

        # create a dict of the matrix metadata
        matrix_definition = {
            "first_as_of_time": min(train_as_of_times),
            "last_as_of_time": max(train_as_of_times),
            "matrix_info_end_time": train_test_split_time,
            "as_of_times": AsOfTimeList(train_as_of_times),
            "training_label_timespan": training_label_timespan,
            "training_as_of_date_frequency": training_as_of_date_frequency,
            "max_training_history": max_training_history,
        }

        return matrix_definition
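
A worked sketch of the buffer arithmetic from the comments above, with the values taken from those comments and dateutil's relativedelta used directly: the last training as_of_time sits one training label timespan before the split, and the earliest one sits at most max_training_history before that.

import datetime
from dateutil.relativedelta import relativedelta

train_test_split_time = datetime.datetime(2016, 10, 1)
last_train_as_of_time = train_test_split_time - relativedelta(months=6)      # 2016-04-01
earliest_train_as_of_time = last_train_as_of_time - relativedelta(years=2)   # 2014-04-01
print(last_train_as_of_time, earliest_train_as_of_time)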
Example #16
    def calculate_train_test_split_times(self, training_label_timespan,
                                         test_label_timespan, test_duration):
        """ Calculate the split times between train and test matrices. All
        label spans in train matrices will end at this time, and this will be
        the first as of time in the respective test matrix.

        :param training_label_timespan: how much time is included in training
                                         labels
        :type training_label_timespan: dateutil.relativedelta.relativedelta
        :param test_label_timespan: how much time is included in test labels
        :type test_label_timespan: dateutil.relativedelta.relativedelta
        :param test_duration: for how long after the end of a training matrix are
                          test predictions made
        :type test_duration: str

        :return: all split times for the temporal parameters
        :rtype: list

        :raises: ValueError if there are no valid split times in the temporal
                 config
        """

        # we always want to be sure we're using the most recent data, so for the splits,
        # we start from the very end of time for which we have labels and walk backwards,
        # ensuring we leave enough of a buffer for the test_label_timespan to get a full
        # set of labels for our last testing as_of_date
        #
        # in our example, last_test_label_time = 2017-07-01 - 6month = 2017-01-01
        last_test_label_time = self.label_end_time - test_label_timespan

        # final label must be able to have feature data associated with it
        if last_test_label_time > self.feature_end_time:
            raise ValueError(
                "Final test label date cannot be after end of feature time.")
        logging.info("Final label as of date: {}".format(last_test_label_time))

        # all split times have to allow at least one training label before them
        # e.g., earliest_possible_split_time = max(1995-01-01, 2012-01-01) + 6month = 2012-01-01
        earliest_possible_split_time = training_label_timespan + max(
            self.feature_start_time, self.label_start_time)
        logging.info("Earliest possible train/test split time: {}".format(
            earliest_possible_split_time))

        # last split is the first as of time in the final test matrix
        # that is, starting from the label_end_time, we've walked back by the test_label_timespan
        # (above) to allow a buffer for labels and now we walk back further by the test_duration to
        # ensure we have a full set of test data in the latest test matrix.
        #
        # e.g., last_split_time = 2017-01-01 - 3month = 2016-10-01
        test_delta = convert_str_to_relativedelta(test_duration)
        last_split_time = last_test_label_time - test_delta
        logging.info("Final split time: {}".format(last_split_time))
        if last_split_time < earliest_possible_split_time:
            raise ValueError(
                "No valid train/test split times in temporal config.")

        train_test_split_times = []
        train_test_split_time = last_split_time

        # finally, starting from our last_split_time, simply step backwards by the
        # model_update_frequency until we hit the earliest allowable time to
        # yield the set of train_test_split_times
        #
        # e.g., train_test_split_times for our example with a 1 year model_update_frequency
        # will be every Oct. 1 from 2012 to 2016:
        # train_test_split_times = [2012-10-01, 2013-10-01, 2014-10-01, 2015-10-01, 2016-10-01]
        while train_test_split_time >= earliest_possible_split_time:
            train_test_split_times.insert(0, train_test_split_time)
            train_test_split_time -= self.model_update_frequency

        return train_test_split_times
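
A worked sketch of the backward stepping described in the comments above, with the values taken from those comments: walk back from the last split time by a 1 year model_update_frequency until the earliest possible split time is passed.

import datetime
from dateutil.relativedelta import relativedelta

last_split_time = datetime.datetime(2016, 10, 1)
earliest_possible_split_time = datetime.datetime(2012, 1, 1)
train_test_split_times = []
current = last_split_time
while current >= earliest_possible_split_time:
    train_test_split_times.insert(0, current)
    current -= relativedelta(years=1)
print(train_test_split_times)  # Oct. 1 of every year from 2012 through 2016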
Example #17
 def test_bad_input(self):
     for bad_delta_string in ('4 tacos', '3'):
         with self.assertRaises(ValueError):
             convert_str_to_relativedelta(bad_delta_string)
Example #18
    def __init__(
        self,
        feature_start_time,
        feature_end_time,
        label_start_time,
        label_end_time,
        model_update_frequency,
        training_as_of_date_frequencies,
        max_training_histories,
        training_label_timespans,
        test_as_of_date_frequencies,
        test_durations,
        test_label_timespans,
    ):

        '''
        Date strings should follow the format `YYYY-MM-DD`. Date intervals
        should be strings of the Postgres interval input format.

        This class is often used within the Triage experiment pipeline, and
        initialized using parameters from a Triage [experiment config](../../../experiments/experiment-config/#time-splitting)

        Arguments:
            feature_start_time (str): Earliest date included in any feature
            feature_end_time (str): Day after last feature date (all data
                included in features are before this date)
            label_start_time (str): Earliest date for which labels are available
            label_end_time (str): Day AFTER last label date (all data in any
                label are before this date)
            model_update_frequency (str): how frequently to retrain models
            training_as_of_date_frequencies (str): time between rows for same
                entity in train matrix
            max_training_histories (str): Interval specifying how much history
                for each entity to train on
            training_label_timespans (str): how much time is included in a label
                in the train matrix
            test_as_of_date_frequencies (str): time between rows for same entity
                in test matrix
            test_durations (str): How long into the future to make predictions
                for each entity. Controls the length of time included in a test
                matrix
            test_label_timespans (str): How much time is included in a label
                in the test matrix.
        '''
        self.feature_start_time = dt_from_str(
            feature_start_time
        )
        self.feature_end_time = dt_from_str(
            feature_end_time
        )
        if self.feature_start_time > self.feature_end_time:
            raise ValueError("Feature start time after feature end time.")

        self.label_start_time = dt_from_str(
            label_start_time
        )
        self.label_end_time = dt_from_str(
            label_end_time
        )
        if self.label_start_time > self.label_end_time:
            raise ValueError("Label start time after label end time.")

        self.model_update_frequency = convert_str_to_relativedelta(
            model_update_frequency
        )

        self.training_as_of_date_frequencies = utils.convert_to_list(
            training_as_of_date_frequencies
        )

        self.test_as_of_date_frequencies = utils.convert_to_list(
            test_as_of_date_frequencies
        )

        self.max_training_histories = utils.convert_to_list(max_training_histories)

        self.test_durations = utils.convert_to_list(test_durations)

        self.training_label_timespans = utils.convert_to_list(training_label_timespans)

        self.test_label_timespans = utils.convert_to_list(test_label_timespans)
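
A hedged construction sketch using the string formats this docstring describes (YYYY-MM-DD dates, Postgres-style intervals). The parameter values mirror the running example in the other methods' comments (the feature_end_time is chosen here simply to satisfy the feature/label constraints), and the import path is assumed from the triage.component.timechop reference in the plotting example.

from triage.component.timechop import Timechop  # assumed import path

chopper = Timechop(
    feature_start_time="1995-01-01",
    feature_end_time="2017-07-01",
    label_start_time="2012-01-01",
    label_end_time="2017-07-01",
    model_update_frequency="1 year",
    training_as_of_date_frequencies=["1 day"],
    max_training_histories=["2 years"],
    training_label_timespans=["6 months"],
    test_as_of_date_frequencies=["1 month"],
    test_durations=["3 months"],
    test_label_timespans=["6 months"],
)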
Example #19
 def test_valid_input(self):
     date = datetime.datetime(2016, 1, 1, 0, 0)
     for test in (
         {
             "interval": "1 year",
             "addition_result": datetime.datetime(2017, 1, 1, 0, 0),
             "subtraction_result": datetime.datetime(2015, 1, 1, 0, 0),
         },
         {
             "interval": "2 months",
             "addition_result": datetime.datetime(2016, 3, 1, 0, 0),
             "subtraction_result": datetime.datetime(2015, 11, 1, 0, 0),
         },
         {
             "interval": "3 days",
             "addition_result": datetime.datetime(2016, 1, 4, 0, 0),
             "subtraction_result": datetime.datetime(2015, 12, 29, 0, 0),
         },
         {
             "interval": "3 seconds",
             "addition_result": datetime.datetime(2016, 1, 1, 0, 0, 3),
             "subtraction_result":
             datetime.datetime(2015, 12, 31, 23, 59, 57),
         },
         {
             "interval": "2year",
             "addition_result": datetime.datetime(2018, 1, 1, 0, 0),
             "subtraction_result": datetime.datetime(2014, 1, 1, 0, 0),
         },
         {
             "interval": "4 weeks",
             "addition_result": datetime.datetime(2016, 1, 29, 0, 0),
             "subtraction_result": datetime.datetime(2015, 12, 4, 0, 0),
         },
         {
             "interval": "5 hours",
             "addition_result": datetime.datetime(2016, 1, 1, 5, 0),
             "subtraction_result": datetime.datetime(2015, 12, 31, 19, 0),
         },
         {
             "interval": "10minutes",
             "addition_result": datetime.datetime(2016, 1, 1, 0, 10),
             "subtraction_result": datetime.datetime(2015, 12, 31, 23, 50),
         },
         {
             "interval":
             "1microsecond",
             "addition_result":
             datetime.datetime(2016, 1, 1, 0, 0, 0, 1),
             "subtraction_result":
             datetime.datetime(2015, 12, 31, 23, 59, 59, 999999),
         },
         {
             "interval": "5m",
             "addition_result": datetime.datetime(2016, 1, 1, 0, 5),
             "subtraction_result": datetime.datetime(2015, 12, 31, 23, 55),
         },
     ):
         delta = convert_str_to_relativedelta(test["interval"])
         assert date + delta == test["addition_result"]
         assert date - delta == test["subtraction_result"]
Example #20
 def _validate_time_intervals(self, intervals):
     logging.info("Validating time intervals")
     for interval in intervals:
         if interval != "all":
             convert_str_to_relativedelta(interval)
Example #21
def visualize_chops(chopper, show_as_of_times=True, show_boundaries=True, save_target=None):
    """Visualize time chops of a given Timechop object using matplotlib

    Args:
        chopper (triage.component.timechop.Timechop): A fully-configured Timechop object
        show_as_of_times (bool): Whether or not to draw horizontal lines
            for as-of-times
        show_boundaries (bool): Whether or not to show a rectangle around matrices
            and dashed lines around feature/label boundaries
        save_target (str or file-like object): A save target for matplotlib to save
            the figure to. Defaults to None, which won't save anything
    """
    chops = chopper.chop_time()

    chops.reverse()

    fig, ax = plt.subplots(nrows=len(chops), sharex=True, sharey=True, squeeze=False, figsize=FIG_SIZE)

    for idx, chop in enumerate(chops):
        train_as_of_times = chop["train_matrix"]["as_of_times"]
        test_as_of_times = chop["test_matrices"][0]["as_of_times"]

        test_label_timespan = chop["test_matrices"][0]["test_label_timespan"]
        training_label_timespan = chop["train_matrix"]["training_label_timespan"]

        color_rgb = np.random.random(3)

        if show_as_of_times:
            # Train matrix (as_of_times)
            ax[idx][0].hlines(
                [x for x in range(len(train_as_of_times))],
                [x.date() for x in train_as_of_times],
                [
                    x.date() + convert_str_to_relativedelta(training_label_timespan)
                    for x in train_as_of_times
                ],
                linewidth=3,
                color=color_rgb,
                label=f"train_{idx}",
            )

            # Test matrix
            ax[idx][0].hlines(
                [x for x in range(len(test_as_of_times))],
                [x.date() for x in test_as_of_times],
                [
                    x.date() + convert_str_to_relativedelta(test_label_timespan)
                    for x in test_as_of_times
                ],
                linewidth=3,
                color=color_rgb,
                label=f"test_{idx}",
            )

        if show_boundaries:
            # Limits: train
            ax[idx][0].axvspan(
                chop["train_matrix"]["first_as_of_time"],
                chop["train_matrix"]["last_as_of_time"],
                color=color_rgb,
                alpha=0.3,
            )

            ax[idx][0].axvline(
                chop["train_matrix"]["matrix_info_end_time"], color="k", linestyle="--"
            )

            # Limits: test
            ax[idx][0].axvspan(
                chop["test_matrices"][0]["first_as_of_time"],
                chop["test_matrices"][0]["last_as_of_time"],
                color=color_rgb,
                alpha=0.3,
            )

            ax[idx][0].axvline(
                chop["feature_start_time"], color="k", linestyle="--", alpha=0.2
            )
            ax[idx][0].axvline(
                chop["feature_end_time"], color="k", linestyle="--", alpha=0.2
            )
            ax[idx][0].axvline(
                chop["label_start_time"], color="k", linestyle="--", alpha=0.2
            )
            ax[idx][0].axvline(
                chop["label_end_time"], color="k", linestyle="--", alpha=0.2
            )

            ax[idx][0].axvline(
                chop["test_matrices"][0]["matrix_info_end_time"],
                color="k",
                linestyle="--",
            )

        ax[idx][0].yaxis.set_major_locator(plt.NullLocator())
        ax[idx][0].yaxis.set_label_position("right")
        ax[idx][0].set_ylabel(f'Label timespan \n {test_label_timespan} (test), {training_label_timespan} (training)',
                              rotation="vertical", labelpad=30)

        ax[idx][0].xaxis.set_major_formatter(md.DateFormatter("%Y"))
        ax[idx][0].xaxis.set_major_locator(md.YearLocator())
        ax[idx][0].xaxis.set_minor_locator(md.MonthLocator())

    ax[0][0].set_title("Timechop: Temporal cross-validation blocks")
    fig.subplots_adjust(hspace=0)
    plt.setp([a.get_xticklabels() for a in fig.axes[:-1]], visible=False)
    if save_target:
        plt.savefig(save_target)
    plt.show()
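
A hedged usage sketch, assuming chopper is a Timechop configured as in the other examples; the save_target file name is hypothetical and can be omitted to skip saving.

visualize_chops(chopper,
                show_as_of_times=True,
                show_boundaries=True,
                save_target="timechop_blocks.png")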
Example #22
def visualize_chops(chopper, show_as_of_times=True, show_boundaries=True):
    """Visualize time chops of a given Timechop object using matplotlib

    Args:
        chopper (triage.component.timechop.Timechop) A fully-configured Timechop object
        show_as_of_times (bool, default True) Whether or not to draw horizontal lines for as-of-times
        show_boundaries (bool, default True) Whether or not to show a rectangle around matrices
            and dashed lines around feature/label boundaries
    """
    chops = chopper.chop_time()

    chops.reverse()

    fig, ax = plt.subplots(len(chops),
                           sharex=True,
                           sharey=True,
                           figsize=FIG_SIZE)

    for idx, chop in enumerate(chops):
        train_as_of_times = chop['train_matrix']['as_of_times']
        test_as_of_times = chop['test_matrices'][0]['as_of_times']

        test_label_timespan = chop['test_matrices'][0]['test_label_timespan']
        training_label_timespan = chop['train_matrix'][
            'training_label_timespan']

        color_rgb = np.random.random(3)

        if show_as_of_times:
            # Train matrix (as_of_times)
            ax[idx].hlines(
                [x for x in range(len(train_as_of_times))],
                [x.date() for x in train_as_of_times], [
                    x.date() +
                    convert_str_to_relativedelta(training_label_timespan)
                    for x in train_as_of_times
                ],
                linewidth=3,
                color=color_rgb,
                label=f"train_{idx}")

            # Test matrix
            ax[idx].hlines([x for x in range(len(test_as_of_times))], [
                x.date() for x in test_as_of_times
            ], [
                x.date() + convert_str_to_relativedelta(test_label_timespan)
                for x in test_as_of_times
            ],
                           linewidth=3,
                           color=color_rgb,
                           label=f"test_{idx}")

        if show_boundaries:
            # Limits: train
            ax[idx].axvspan(chop['train_matrix']['first_as_of_time'],
                            chop['train_matrix']['last_as_of_time'],
                            color=color_rgb,
                            alpha=0.3)

            ax[idx].axvline(chop['train_matrix']['matrix_info_end_time'],
                            color='k',
                            linestyle='--')

            # Limits: test
            ax[idx].axvspan(chop['test_matrices'][0]['first_as_of_time'],
                            chop['test_matrices'][0]['last_as_of_time'],
                            color=color_rgb,
                            alpha=0.3)

            ax[idx].axvline(chop['feature_start_time'],
                            color='k',
                            linestyle='--',
                            alpha=0.2)
            ax[idx].axvline(chop['feature_end_time'],
                            color='k',
                            linestyle='--',
                            alpha=0.2)
            ax[idx].axvline(chop['label_start_time'],
                            color='k',
                            linestyle='--',
                            alpha=0.2)
            ax[idx].axvline(chop['label_end_time'],
                            color='k',
                            linestyle='--',
                            alpha=0.2)

            ax[idx].axvline(chop['test_matrices'][0]['matrix_info_end_time'],
                            color='k',
                            linestyle='--')

        ax[idx].yaxis.set_major_locator(plt.NullLocator())
        ax[idx].yaxis.set_label_position("right")
        ax[idx].set_ylabel(f"Block {idx}", rotation='horizontal', labelpad=30)

        ax[idx].xaxis.set_major_formatter(md.DateFormatter('%Y'))
        ax[idx].xaxis.set_major_locator(md.YearLocator())
        ax[idx].xaxis.set_minor_locator(md.MonthLocator())

    ax[0].set_title('Timechop: Temporal cross-validation blocks')
    fig.subplots_adjust(hspace=0)
    plt.setp([a.get_xticklabels() for a in fig.axes[:-1]], visible=False)
    plt.show()