Example #1
def test_calculate_as_of_times_one_day_freq():
    expected_result = [
        datetime.datetime(2011, 1, 1, 0, 0),
        datetime.datetime(2011, 1, 2, 0, 0),
        datetime.datetime(2011, 1, 3, 0, 0),
        datetime.datetime(2011, 1, 4, 0, 0),
        datetime.datetime(2011, 1, 5, 0, 0),
        datetime.datetime(2011, 1, 6, 0, 0),
        datetime.datetime(2011, 1, 7, 0, 0),
        datetime.datetime(2011, 1, 8, 0, 0),
        datetime.datetime(2011, 1, 9, 0, 0),
        datetime.datetime(2011, 1, 10, 0, 0),
        datetime.datetime(2011, 1, 11, 0, 0),
    ]
    chopper = Timechop(
        feature_start_time=datetime.datetime(1990, 1, 1, 0, 0),
        feature_end_time=datetime.datetime(2012, 1, 1, 0, 0),
        label_start_time=datetime.datetime(2010, 1, 1, 0, 0),
        label_end_time=datetime.datetime(2012, 1, 1, 0, 0),
        model_update_frequency="1 year",
        training_as_of_date_frequencies=["1 days"],
        test_as_of_date_frequencies=["7 days"],
        max_training_histories=["10 days", "1 year"],
        test_durations=["1 month"],
        test_label_timespans=["1 day"],
        training_label_timespans=["3 months"],
    )
    result = chopper.calculate_as_of_times(
        as_of_start_limit=datetime.datetime(2011, 1, 1, 0, 0),
        as_of_end_limit=datetime.datetime(2011, 1, 11, 0, 0),
        data_frequency=convert_str_to_relativedelta("1 days"),
    )
    assert result == expected_result
Example #2
def test_calculate_as_of_times_three_day_freq():
    expected_result = [
        datetime.datetime(2011, 1, 1, 0, 0),
        datetime.datetime(2011, 1, 4, 0, 0),
        datetime.datetime(2011, 1, 7, 0, 0),
        datetime.datetime(2011, 1, 10, 0, 0),
    ]
    chopper = Timechop(
        feature_start_time=datetime.datetime(1990, 1, 1, 0, 0),
        feature_end_time=datetime.datetime(2012, 1, 1, 0, 0),
        label_start_time=datetime.datetime(2010, 1, 1, 0, 0),
        label_end_time=datetime.datetime(2012, 1, 1, 0, 0),
        model_update_frequency='1 year',
        training_as_of_date_frequencies=['1 days'],
        test_as_of_date_frequencies=['7 days'],
        max_training_histories=['10 days', '1 year'],
        test_durations=['1 month'],
        test_label_timespans=['1 day'],
        training_label_timespans=['3 months']
    )
    result = chopper.calculate_as_of_times(
        as_of_start_limit=datetime.datetime(2011, 1, 1, 0, 0),
        as_of_end_limit=datetime.datetime(2011, 1, 11, 0, 0),
        data_frequency=convert_str_to_relativedelta('3 days'),
        forward=True
    )
    assert result == expected_result
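
The two tests above pin down the stepping behavior of calculate_as_of_times. As a minimal standalone sketch (one consistent reading of the expected values, not Timechop's actual implementation): the result is just the start limit advanced by the data frequency until the end limit is passed.

import datetime
from dateutil.relativedelta import relativedelta

def as_of_times_sketch(start, end, step):
    # step forward from `start` in increments of `step`, inclusive of `end`
    times = []
    current = start
    while current <= end:
        times.append(current)
        current += step
    return times

assert as_of_times_sketch(
    datetime.datetime(2011, 1, 1),
    datetime.datetime(2011, 1, 11),
    relativedelta(days=3),
) == [
    datetime.datetime(2011, 1, 1),
    datetime.datetime(2011, 1, 4),
    datetime.datetime(2011, 1, 7),
    datetime.datetime(2011, 1, 10),
]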
Example #3
    def test_valid_input(self):
        expected_result = [
            datetime.datetime(2015, 3, 1, 0, 0),
            datetime.datetime(2015, 6, 1, 0, 0),
            datetime.datetime(2015, 9, 1, 0, 0),
            datetime.datetime(2015, 12, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0),
            datetime.datetime(2016, 6, 1, 0, 0),
        ]
        chopper = Timechop(
            feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
            feature_end_time=datetime.datetime(2017, 1, 1, 0, 0),
            label_start_time=datetime.datetime(2015, 1, 1, 0, 0),
            label_end_time=datetime.datetime(2017, 1, 1, 0, 0),
            model_update_frequency="3 months",
            training_as_of_date_frequencies=["1 day"],
            test_as_of_date_frequencies=["1 day"],
            max_training_histories=["1 year"],
            test_durations=["6 months"],
            test_label_timespans=["1 months"],
            training_label_timespans=["3 days"],
        )

        # the configuration is valid, so the split times should be
        # computed without raising an exception
        result = chopper.calculate_train_test_split_times(
            training_label_timespan=convert_str_to_relativedelta("3 days"),
            test_duration="6 months",
            test_label_timespan=convert_str_to_relativedelta("1 month"),
        )

        assert result == expected_result
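
The expected split times above follow a backward-stepping pattern: the last split must leave room for test_duration plus test_label_timespan before label_end_time, and earlier splits step back by model_update_frequency while they still fall inside label time. A hedged sketch of that arithmetic (one consistent reading of the expected values, not Timechop's actual code path):

import datetime
from dateutil.relativedelta import relativedelta

label_start = datetime.datetime(2015, 1, 1)
label_end = datetime.datetime(2017, 1, 1)

# last split: label_end - test_duration (6 months) - test_label_timespan (1 month)
current = label_end - relativedelta(months=6) - relativedelta(months=1)

splits = []
while current >= label_start:
    splits.append(current)
    current -= relativedelta(months=3)  # model_update_frequency

assert sorted(splits) == [
    datetime.datetime(2015, 3, 1),
    datetime.datetime(2015, 6, 1),
    datetime.datetime(2015, 9, 1),
    datetime.datetime(2015, 12, 1),
    datetime.datetime(2016, 3, 1),
    datetime.datetime(2016, 6, 1),
]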
Example #4
 def test_look_back_time_equal_modeling_start(self):
     # TODO: rework this test since the test label window of 3 months
     # cannot be satisfied by the 10 day difference between modeling
     # start and end times, so it's not a very realistic case
     expected_result = {
         'feature_start_time': datetime.datetime(1990, 1, 1, 0, 0),
         'label_start_time': datetime.datetime(2010, 1, 1, 0, 0),
         'feature_end_time': datetime.datetime(2010, 1, 11, 0, 0),
         'label_end_time': datetime.datetime(2010, 1, 11, 0, 0),
         'train_matrix': {
             'first_as_of_time': datetime.datetime(2010, 1, 1, 0, 0),
             'last_as_of_time': datetime.datetime(2010, 1, 5, 0, 0),
             'matrix_info_end_time': datetime.datetime(2010, 1, 6, 0, 0),
             'as_of_times': [
                 datetime.datetime(2010, 1, 1, 0, 0),
                 datetime.datetime(2010, 1, 2, 0, 0),
                 datetime.datetime(2010, 1, 3, 0, 0),
                 datetime.datetime(2010, 1, 4, 0, 0),
                 datetime.datetime(2010, 1, 5, 0, 0)
             ],
             'training_label_timespan': '1 day',
             'training_as_of_date_frequency': '1 days',
             'max_training_history': '5 days'
         },
         'test_matrices': [{
             'first_as_of_time': datetime.datetime(2010, 1, 6, 0, 0),
             'last_as_of_time': datetime.datetime(2010, 1, 9, 0, 0),
             'matrix_info_end_time': datetime.datetime(2010, 1, 10, 0, 0),
             'as_of_times': [
                 datetime.datetime(2010, 1, 6, 0, 0),
                 datetime.datetime(2010, 1, 9, 0, 0)
             ],
             'test_label_timespan': '1 day',
             'test_as_of_date_frequency': '3 days',
             'test_duration': '5 days'
         }]
     }
     chopper = Timechop(
         feature_start_time=datetime.datetime(1990, 1, 1, 0, 0),
         feature_end_time=datetime.datetime(2010, 1, 11, 0, 0),
         label_start_time=datetime.datetime(2010, 1, 1, 0, 0),
         label_end_time=datetime.datetime(2010, 1, 11, 0, 0),
         model_update_frequency='5 days',
         training_as_of_date_frequencies=['1 days'],
         test_as_of_date_frequencies=['3 days'],
         max_training_histories=['5 days'],
         test_durations=['5 days'],
         test_label_timespans=['1 day'],
         training_label_timespans=['1 day']
     )
     result = chopper.generate_matrix_definitions(
         train_test_split_time=datetime.datetime(2010, 1, 6, 0, 0),
         training_as_of_date_frequency='1 days',
         max_training_history='5 days',
         test_duration='5 days',
         test_label_timespan='1 day',
         training_label_timespan='1 day'
     )
     assert result == expected_result
Example #5
 def __call__(self, args):
     experiment_config = yaml.safe_load(args.config)
     if 'temporal_config' not in experiment_config:
         raise ValueError('Passed configuration must have `temporal_config` key '
                          'in order to visualize time chops')
     chopper = Timechop(**(experiment_config['temporal_config']))
     logging.info('Visualizing time chops')
     visualize_chops(chopper)
Example #6
 def __call__(self, args):
     experiment_config = yaml.safe_load(args.config)
     if "temporal_config" not in experiment_config:
         raise ValueError(
             "Passed configuration must have `temporal_config` key "
             "in order to visualize time chops")
     chopper = Timechop(**(experiment_config["temporal_config"]))
     logging.info("Visualizing time chops")
     visualize_chops(chopper)
Example #7
 def chopper(self):
     # create a valid Timechop chopper
     # least brittle current way of doing this is by loading the
     # example_experiment_config.yaml file, because that is a
     # diligently updated file. If Timechop config changes, the
     # example config should change too
     with open("example_experiment_config.yaml") as fd:
          experiment_config = yaml.safe_load(fd)
     return Timechop(**(experiment_config["temporal_config"]))
Example #8
 def test_training_label_timespan_longer_than_1_day(self):
     expected_result = [
         {
             "feature_start_time": datetime.datetime(1990, 1, 1, 0, 0),
             "label_start_time": datetime.datetime(2010, 1, 1, 0, 0),
             "feature_end_time": datetime.datetime(2010, 1, 19, 0, 0),
             "label_end_time": datetime.datetime(2010, 1, 19, 0, 0),
             "train_matrix": {
                 "first_as_of_time": datetime.datetime(2010, 1, 1, 0, 0),
                 "last_as_of_time": datetime.datetime(2010, 1, 4, 0, 0),
                 "matrix_info_end_time": datetime.datetime(2010, 1, 9, 0, 0),
                 "as_of_times": [
                     datetime.datetime(2010, 1, 1, 0, 0),
                     datetime.datetime(2010, 1, 2, 0, 0),
                     datetime.datetime(2010, 1, 3, 0, 0),
                     datetime.datetime(2010, 1, 4, 0, 0),
                 ],
                 "training_label_timespan": "5 days",
                 "training_as_of_date_frequency": "1 days",
                 "max_training_history": "5 days",
             },
             "test_matrices": [
                 {
                     "first_as_of_time": datetime.datetime(2010, 1, 9, 0, 0),
                     "last_as_of_time": datetime.datetime(2010, 1, 13, 0, 0),
                     "matrix_info_end_time": datetime.datetime(2010, 1, 18, 0, 0),
                     "as_of_times": [
                         datetime.datetime(2010, 1, 9, 0, 0),
                         datetime.datetime(2010, 1, 10, 0, 0),
                         datetime.datetime(2010, 1, 11, 0, 0),
                         datetime.datetime(2010, 1, 12, 0, 0),
                         datetime.datetime(2010, 1, 13, 0, 0),
                     ],
                     "test_label_timespan": "5 days",
                     "test_as_of_date_frequency": "1 days",
                     "test_duration": "5 days",
                 }
             ],
         }
     ]
     chopper = Timechop(
         feature_start_time=datetime.datetime(1990, 1, 1, 0, 0),
         feature_end_time=datetime.datetime(2010, 1, 19, 0, 0),
         label_start_time=datetime.datetime(2010, 1, 1, 0, 0),
         label_end_time=datetime.datetime(2010, 1, 19, 0, 0),
         model_update_frequency="5 days",
         training_as_of_date_frequencies=["1 days"],
         test_as_of_date_frequencies=["1 days"],
         max_training_histories=["5 days"],
         test_durations=["5 days"],
         test_label_timespans=["5 days"],
         training_label_timespans=["5 days"],
     )
     result = chopper.chop_time()
     assert result == expected_result
Example #9
 def test_training_label_timespan_longer_than_1_day(self):
     expected_result = [
         {
             'feature_start_time': datetime.datetime(1990, 1, 1, 0, 0),
             'label_start_time': datetime.datetime(2010, 1, 1, 0, 0),
             'feature_end_time': datetime.datetime(2010, 1, 19, 0, 0),
             'label_end_time': datetime.datetime(2010, 1, 19, 0, 0),
             'train_matrix': {
                 'first_as_of_time': datetime.datetime(2010, 1, 1, 0, 0),
                 'last_as_of_time': datetime.datetime(2010, 1, 4, 0, 0),
                 'matrix_info_end_time': datetime.datetime(2010, 1, 9, 0, 0),
                 'as_of_times': [
                     datetime.datetime(2010, 1, 1, 0, 0),
                     datetime.datetime(2010, 1, 2, 0, 0),
                     datetime.datetime(2010, 1, 3, 0, 0),
                     datetime.datetime(2010, 1, 4, 0, 0)
                 ],
                 'training_label_timespan': '5 days',
                 'training_as_of_date_frequency': '1 days',
                 'max_training_history': '5 days'
             },
             'test_matrices': [{
                 'first_as_of_time': datetime.datetime(2010, 1, 9, 0, 0),
                 'last_as_of_time': datetime.datetime(2010, 1, 13, 0, 0),
                 'matrix_info_end_time': datetime.datetime(2010, 1, 18, 0, 0),
                 'as_of_times': [
                     datetime.datetime(2010, 1, 9, 0, 0),
                     datetime.datetime(2010, 1, 10, 0, 0),
                     datetime.datetime(2010, 1, 11, 0, 0),
                     datetime.datetime(2010, 1, 12, 0, 0),
                     datetime.datetime(2010, 1, 13, 0, 0)
                 ],
                 'test_label_timespan': '5 days',
                 'test_as_of_date_frequency': '1 days',
                 'test_duration': '5 days'
             }]
         }
     ]
     chopper = Timechop(
         feature_start_time=datetime.datetime(1990, 1, 1, 0, 0),
         feature_end_time=datetime.datetime(2010, 1, 19, 0, 0),
         label_start_time=datetime.datetime(2010, 1, 1, 0, 0),
         label_end_time=datetime.datetime(2010, 1, 19, 0, 0),
         model_update_frequency='5 days',
         training_as_of_date_frequencies=['1 days'],
         test_as_of_date_frequencies=['1 days'],
         max_training_histories=['5 days'],
         test_durations=['5 days'],
         test_label_timespans=['5 days'],
         training_label_timespans=['5 days']
     )
     result = chopper.chop_time()
      assert result == expected_result
Example #10
 def test_bad_feature_start_time(self):
     with self.assertRaises(ValueError):
         Timechop(
             feature_start_time=datetime.datetime(2011, 1, 1, 0, 0),
             feature_end_time=datetime.datetime(2010, 1, 16, 0, 0),
             label_start_time=datetime.datetime(2010, 1, 3, 0, 0),
             label_end_time=datetime.datetime(2010, 1, 16, 0, 0),
             model_update_frequency="5 days",
             training_as_of_date_frequencies=["1 days"],
             test_as_of_date_frequencies=["1 days"],
             max_training_histories=["5 days"],
             test_durations=["5 days"],
             test_label_timespans=["1 day"],
             training_label_timespans=["1 day"],
         )
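
The constructor validation this test exercises reduces to an ordering check: feature data must be available no later than the first label date. A minimal hedged sketch of the invariant (illustrative, not Timechop's actual validation code):

import datetime

def validate_feature_start(feature_start_time, label_start_time):
    # features must exist before (or at) the start of label time
    if feature_start_time > label_start_time:
        raise ValueError("feature_start_time is after label_start_time")

# the test's configuration: feature data starting a year after label time
try:
    validate_feature_start(
        datetime.datetime(2011, 1, 1), datetime.datetime(2010, 1, 3))
except ValueError as error:
    print(error)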
Example #11
    def test_no_valid_label_dates(self):
        chopper = Timechop(
            feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
            feature_end_time=datetime.datetime(2016, 1, 1, 0, 0),
            label_start_time=datetime.datetime(2015, 1, 1, 0, 0),
            label_end_time=datetime.datetime(2015, 2, 1, 0, 0),
            model_update_frequency="3 months",
            training_as_of_date_frequencies=["1 day"],
            test_as_of_date_frequencies=["1 day"],
            max_training_histories=["1 year"],
            test_durations=["6 months"],
            test_label_timespans=["1 months"],
            training_label_timespans=["3 days"],
        )

        # this should raise an error because there are no valid label dates in
        # the labeling time (label span is longer than labeling time)
        with self.assertRaises(ValueError):
            chopper.calculate_train_test_split_times(
                training_label_timespan=convert_str_to_relativedelta("3 days"),
                test_duration="6 months",
                test_label_timespan=convert_str_to_relativedelta("1 month"),
            )
Example #12
    def test_labels_after_features(self):
        chopper = Timechop(
            feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
            feature_end_time=datetime.datetime(2016, 1, 1, 0, 0),
            label_start_time=datetime.datetime(2015, 1, 1, 0, 0),
            label_end_time=datetime.datetime(2017, 1, 1, 0, 0),
            model_update_frequency="3 months",
            training_as_of_date_frequencies=["1 day"],
            test_as_of_date_frequencies=["1 day"],
            max_training_histories=["1 year"],
            test_durations=["6 months"],
            test_label_timespans=["1 months"],
            training_label_timespans=["3 days"],
        )

        # this should throw an exception because last possible label date is after
        # end of feature time
        with self.assertRaises(ValueError):
            chopper.calculate_train_test_split_times(
                training_label_timespan=convert_str_to_relativedelta("3 days"),
                test_duration="6 months",
                test_label_timespan=convert_str_to_relativedelta("1 month"),
            )
Example #13
def predict_forward_with_existed_model(db_engine, project_path, model_id,
                                       as_of_date):
    """Predict forward given model_id and as_of_date and store the prediction in database

    Args:
        db_engine (sqlalchemy.engine.Engine)
        project_path (string) path to the project storage
        model_id (int) The id of a given model in the database
        as_of_date (string) a date string like "YYYY-MM-DD"
    """
    logger.spam("In PREDICT LIST................")
    upgrade_db(db_engine=db_engine)
    project_storage = ProjectStorage(project_path)
    matrix_storage_engine = project_storage.matrix_storage_engine()
    # 1. Get feature and cohort config from database
    (train_matrix_uuid,
     matrix_metadata) = train_matrix_info_from_model_id(db_engine, model_id)
    experiment_config = experiment_config_from_model_id(db_engine, model_id)

    # 2. Generate cohort
    cohort_table_name = f"triage_production.cohort_{experiment_config['cohort_config']['name']}"
    cohort_table_generator = EntityDateTableGenerator(
        db_engine=db_engine,
        query=experiment_config['cohort_config']['query'],
        entity_date_table_name=cohort_table_name)
    cohort_table_generator.generate_entity_date_table(
        as_of_dates=[dt_from_str(as_of_date)])

    # 3. Generate feature aggregations
    feature_generator = FeatureGenerator(
        db_engine=db_engine,
        features_schema_name="triage_production",
        feature_start_time=experiment_config['temporal_config']['feature_start_time'],
    )
    collate_aggregations = feature_generator.aggregations(
        feature_aggregation_config=experiment_config['feature_aggregations'],
        feature_dates=[as_of_date],
        state_table=cohort_table_name)
    feature_generator.process_table_tasks(
        feature_generator.generate_all_table_tasks(collate_aggregations,
                                                   task_type='aggregation'))

    # 4. Reconstruct feature dictionary from feature_names and generate imputation

    reconstructed_feature_dict = FeatureGroup()
    imputation_table_tasks = OrderedDict()

    for aggregation in collate_aggregations:
        feature_group, feature_names = get_feature_names(
            aggregation, matrix_metadata)
        reconstructed_feature_dict[feature_group] = feature_names

        # Make sure that features imputed in training are also imputed in production

        features_imputed_in_train = get_feature_needs_imputation_in_train(
            aggregation, feature_names)

        features_imputed_in_production = get_feature_needs_imputation_in_production(
            aggregation, db_engine)

        total_impute_cols = set(features_imputed_in_production) | set(
            features_imputed_in_train)
        total_nonimpute_cols = set(f for f in set(feature_names)
                                   if '_imp' not in f) - total_impute_cols

        task_generator = feature_generator._generate_imp_table_tasks_for

        imputation_table_tasks.update(
            task_generator(aggregation,
                           impute_cols=list(total_impute_cols),
                           nonimpute_cols=list(total_nonimpute_cols)))
    feature_generator.process_table_tasks(imputation_table_tasks)

    # 5. Build matrix
    db_config = {
        "features_schema_name": "triage_production",
        "labels_schema_name": "public",
        "cohort_table_name": cohort_table_name,
    }

    matrix_builder = MatrixBuilder(
        db_config=db_config,
        matrix_storage_engine=matrix_storage_engine,
        engine=db_engine,
        experiment_hash=None,
        replace=True,
    )

    feature_start_time = experiment_config['temporal_config']['feature_start_time']
    label_name = experiment_config['label_config']['name']
    label_type = 'binary'
    cohort_name = experiment_config['cohort_config']['name']
    user_metadata = experiment_config['user_metadata']

    # Use timechop to get the time definition for production
    temporal_config = experiment_config["temporal_config"]
    temporal_config.update(
        temporal_params_from_matrix_metadata(db_engine, model_id))
    timechopper = Timechop(**temporal_config)
    prod_definitions = timechopper.define_test_matrices(
        train_test_split_time=dt_from_str(as_of_date),
        test_duration=temporal_config['test_durations'][0],
        test_label_timespan=temporal_config['test_label_timespans'][0])

    matrix_metadata = Planner.make_metadata(
        prod_definitions[-1],
        reconstructed_feature_dict,
        label_name,
        label_type,
        cohort_name,
        'production',
        feature_start_time,
        user_metadata,
    )

    matrix_metadata['matrix_id'] = f'{as_of_date}_model_id_{model_id}_risklist'

    matrix_uuid = filename_friendly_hash(matrix_metadata)

    matrix_builder.build_matrix(
        as_of_times=[as_of_date],
        label_name=label_name,
        label_type=label_type,
        feature_dictionary=reconstructed_feature_dict,
        matrix_metadata=matrix_metadata,
        matrix_uuid=matrix_uuid,
        matrix_type="production",
    )

    # 6. Predict the risk score for production
    predictor = Predictor(
        model_storage_engine=project_storage.model_storage_engine(),
        db_engine=db_engine,
        rank_order='best')

    predictor.predict(
        model_id=model_id,
        matrix_store=matrix_storage_engine.get_store(matrix_uuid),
        misc_db_parameters={},
        train_matrix_columns=matrix_storage_engine.get_store(
            train_matrix_uuid).columns())
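
A hypothetical invocation, assuming a configured SQLAlchemy engine, a project path from a prior triage run, and an existing model id (the connection string, path, and ids below are illustrative):

from sqlalchemy import create_engine

db_engine = create_engine("postgresql://user:pass@localhost:5432/triage_db")
predict_forward_with_existed_model(
    db_engine=db_engine,
    project_path="/path/to/project_storage",
    model_id=42,
    as_of_date="2021-06-01",
)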
Example #14
    def predict(self, prediction_date):
        """Predict forward by creating a matrix using as_of_date = prediction_date and applying the retrain model on it

        Args:
            prediction_date(str)
        """
        cohort_table_name = f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_predict"

        # 1. Generate cohort
        self.generate_entity_date_table(prediction_date, cohort_table_name)

        # 2. Generate feature aggregations
        collate_aggregations = self.get_collate_aggregations(
            prediction_date, cohort_table_name)
        self.feature_generator.process_table_tasks(
            self.feature_generator.generate_all_table_tasks(
                collate_aggregations, task_type='aggregation'))
        # 3. Reconstruct feature dictionary from feature_names and generate imputation
        reconstructed_feature_dict, imputation_table_tasks = self.get_feature_dict_and_imputation_task(
            collate_aggregations, self.retrain_model_id)
        self.feature_generator.process_table_tasks(imputation_table_tasks)

        # 4. Build matrix
        db_config = {
            "features_schema_name": "triage_production",
            "labels_schema_name": "public",
            "cohort_table_name": cohort_table_name,
        }

        matrix_builder = MatrixBuilder(
            db_config=db_config,
            matrix_storage_engine=self.matrix_storage_engine,
            engine=self.db_engine,
            experiment_hash=None,
            replace=True,
        )
        # Use timechop to get the time definition for production
        temporal_config = self.get_temporal_config_for_retrain(
            dt_from_str(prediction_date))
        timechopper = Timechop(**temporal_config)

        retrain_config = get_retrain_config_from_model_id(
            self.db_engine, self.retrain_model_id)

        prod_definitions = timechopper.define_test_matrices(
            train_test_split_time=dt_from_str(prediction_date),
            test_duration=retrain_config['test_duration'],
            test_label_timespan=retrain_config['test_label_timespan'])
        last_split_definition = prod_definitions[-1]
        matrix_metadata = Planner.make_metadata(
            matrix_definition=last_split_definition,
            feature_dictionary=reconstructed_feature_dict,
            label_name=self.label_name,
            label_type='binary',
            cohort_name=self.cohort_name,
            matrix_type='production',
            feature_start_time=self.feature_start_time,
            user_metadata=self.user_metadata,
        )

        matrix_metadata['matrix_id'] = f'{prediction_date}_model_id_{self.retrain_model_id}_risklist'

        matrix_uuid = filename_friendly_hash(matrix_metadata)

        matrix_builder.build_matrix(
            as_of_times=[prediction_date],
            label_name=self.label_name,
            label_type='binary',
            feature_dictionary=reconstructed_feature_dict,
            matrix_metadata=matrix_metadata,
            matrix_uuid=matrix_uuid,
            matrix_type="production",
        )

        # 5. Predict the risk score for production
        predictor = Predictor(
            model_storage_engine=self.project_storage.model_storage_engine(),
            db_engine=self.db_engine,
            rank_order='best')

        predictor.predict(
            model_id=self.retrain_model_id,
            matrix_store=self.matrix_storage_engine.get_store(matrix_uuid),
            misc_db_parameters={},
            train_matrix_columns=self.matrix_storage_engine.get_store(
                self.retrain_matrix_uuid).columns(),
        )
        self.predict_matrix_uuid = matrix_uuid
Example #15
def basic_integration_test(state_filters, feature_group_create_rules,
                           feature_group_mix_rules,
                           expected_matrix_multiplier):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        Base.metadata.create_all(db_engine)
        populate_source_data(db_engine)

        with TemporaryDirectory() as temp_dir:
            chopper = Timechop(
                feature_start_time=datetime(2010, 1, 1),
                feature_end_time=datetime(2014, 1, 1),
                label_start_time=datetime(2011, 1, 1),
                label_end_time=datetime(2014, 1, 1),
                model_update_frequency='1year',
                training_label_timespans=['6months'],
                test_label_timespans=['6months'],
                training_as_of_date_frequencies='1day',
                test_as_of_date_frequencies='3months',
                max_training_histories=['1months'],
                test_durations=['1months'],
            )

            state_table_generator = StateTableGenerator(
                db_engine=db_engine,
                experiment_hash='abcd',
                dense_state_table='states',
            )

            label_generator = BinaryLabelGenerator(db_engine=db_engine,
                                                   events_table='events')

            feature_generator = FeatureGenerator(
                db_engine=db_engine,
                features_schema_name='features',
                replace=True,
            )

            feature_dictionary_creator = FeatureDictionaryCreator(
                db_engine=db_engine, features_schema_name='features')

            feature_group_creator = FeatureGroupCreator(
                feature_group_create_rules)

            feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules)

            planner = Planner(
                engine=db_engine,
                feature_start_time=datetime(2010, 1, 1),
                label_names=['outcome'],
                label_types=['binary'],
                db_config={
                    'features_schema_name': 'features',
                    'labels_schema_name': 'public',
                    'labels_table_name': 'labels',
                    'sparse_state_table_name': 'tmp_sparse_states_abcd',
                },
                matrix_directory=os.path.join(temp_dir, 'matrices'),
                states=state_filters,
                user_metadata={},
                replace=True,
            )

            # chop time
            split_definitions = chopper.chop_time()
            num_split_matrices = sum(1 + len(split['test_matrices'])
                                     for split in split_definitions)

            # generate as_of_times for feature/label/state generation
            all_as_of_times = []
            for split in split_definitions:
                all_as_of_times.extend(split['train_matrix']['as_of_times'])
                for test_matrix in split['test_matrices']:
                    all_as_of_times.extend(test_matrix['as_of_times'])
            all_as_of_times = list(set(all_as_of_times))

            feature_aggregation_config = [{
                'prefix': 'cat',
                'from_obj': 'cat_complaints',
                'knowledge_date_column': 'as_of_date',
                'aggregates': [{
                    'quantity': 'cat_sightings',
                    'metrics': ['count', 'avg'],
                    'imputation': {'all': {'type': 'mean'}},
                }],
                'intervals': ['1y'],
                'groups': ['entity_id'],
            }, {
                'prefix': 'dog',
                'from_obj': 'dog_complaints',
                'knowledge_date_column': 'as_of_date',
                'aggregates_imputation': {
                    'count': {'type': 'constant', 'value': 7},
                    'sum': {'type': 'mean'},
                    'avg': {'type': 'zero'},
                },
                'aggregates': [{
                    'quantity': 'dog_sightings',
                    'metrics': ['count', 'avg'],
                }],
                'intervals': ['1y'],
                'groups': ['entity_id'],
            }]

            state_table_generator.validate()
            label_generator.validate()
            feature_generator.validate(feature_aggregation_config)
            feature_group_creator.validate()
            planner.validate()

            # generate sparse state table
            state_table_generator.generate_sparse_table(
                as_of_dates=all_as_of_times)

            # create labels table
            label_generator.generate_all_labels(labels_table='labels',
                                                as_of_dates=all_as_of_times,
                                                label_timespans=['6months'])

            # create feature table tasks
            # we would use FeatureGenerator#create_all_tables but want to use
            # the tasks dict directly to create a feature dict
            aggregations = feature_generator.aggregations(
                feature_aggregation_config=feature_aggregation_config,
                feature_dates=all_as_of_times,
                state_table=state_table_generator.sparse_table_name)
            feature_table_agg_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type='aggregation')

            # create feature aggregation tables
            feature_generator.process_table_tasks(feature_table_agg_tasks)

            feature_table_imp_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type='imputation')

            # create feature imputation tables
            feature_generator.process_table_tasks(feature_table_imp_tasks)

            # build feature dictionaries from feature tables and
            # subsetting config
            master_feature_dict = feature_dictionary_creator.feature_dictionary(
                feature_table_names=feature_table_imp_tasks.keys(),
                index_column_lookup=feature_generator.index_column_lookup(
                    aggregations))

            feature_dicts = feature_group_mixer.generate(
                feature_group_creator.subsets(master_feature_dict))

            # figure out what matrices need to be built
            _, matrix_build_tasks = planner.generate_plans(
                split_definitions, feature_dicts)

            # go and build the matrices
            planner.build_all_matrices(matrix_build_tasks)

            # super basic assertion: did matrices we expect get created?
            matrix_directory = os.path.join(temp_dir, 'matrices')
            matrices = [
                path for path in os.listdir(matrix_directory) if '.csv' in path
            ]
            metadatas = [
                path for path in os.listdir(matrix_directory)
                if '.yaml' in path
            ]
            assert len(matrices) == num_split_matrices * expected_matrix_multiplier
            assert len(metadatas) == num_split_matrices * expected_matrix_multiplier
Example #16
    def _run(self, temporal_config):
        def dt_from_str(dt_str):
            return datetime.strptime(dt_str, "%Y-%m-%d")

        splits = []
        try:
            chopper = Timechop(
                feature_start_time=dt_from_str(
                    temporal_config["feature_start_time"]),
                feature_end_time=dt_from_str(
                    temporal_config["feature_end_time"]),
                label_start_time=dt_from_str(
                    temporal_config["label_start_time"]),
                label_end_time=dt_from_str(temporal_config["label_end_time"]),
                model_update_frequency=temporal_config[
                    "model_update_frequency"],
                training_label_timespans=temporal_config[
                    "training_label_timespans"],
                test_label_timespans=temporal_config["test_label_timespans"],
                training_as_of_date_frequencies=temporal_config[
                    "training_as_of_date_frequencies"],
                test_as_of_date_frequencies=temporal_config[
                    "test_as_of_date_frequencies"],
                max_training_histories=temporal_config[
                    "max_training_histories"],
                test_durations=temporal_config["test_durations"],
            )
            splits = chopper.chop_time()
        except Exception as e:
            raise ValueError(
                dedent("""
            Section: temporal_config -
            Timechop could not produce temporal splits from config {}.
            Error: {}
            """.format(temporal_config, e)))
        for split_num, split in enumerate(splits):
            if len(split["train_matrix"]["as_of_times"]) == 0:
                raise ValueError(
                    dedent("""
                Section: temporal_config -
                Computed split {} has a train matrix with no as_of_times.
                """.format(split)))

            # timechop computes the last time available to train data
            # and stores it in the matrix as 'matrix_info_end_time'
            # but to be more sure, let's double-check by comparing as_of_times
            # in the train and all associated test matrices
            train_max_data_time = (
                max(split["train_matrix"]["as_of_times"])
                + convert_str_to_relativedelta(
                    split["train_matrix"]["training_label_timespan"]))

            for test_matrix in split["test_matrices"]:
                if len(test_matrix["as_of_times"]) == 0:
                    raise ValueError(
                        dedent("""
                    Section: temporal_config -
                    Computed split {} has a test matrix with no as_of_times.
                    """.format(split)))
                overlapping_times = [
                    as_of_time for as_of_time in test_matrix["as_of_times"]
                    if as_of_time < train_max_data_time
                ]
                if overlapping_times:
                    raise ValueError(
                        dedent("""
                    Section: temporal_config -
                    Computed split index {} has a test matrix with as_of_times {}
                    < the maximum train as_of_time + train label timespan.
                    ({}). This is likely an error in timechop. See the
                    experiment's split_definitions[{}] for more information""".
                               format(
                                   split_num,
                                   overlapping_times,
                                   train_max_data_time,
                                   split_num,
                               )))
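
The leakage guard at the end of _run boils down to one comparison: no test as_of_time may precede the latest train as_of_time plus the training label timespan. A standalone sketch with illustrative values:

import datetime
from dateutil.relativedelta import relativedelta

train_as_of_times = [datetime.datetime(2010, 1, 1), datetime.datetime(2010, 1, 5)]
train_max_data_time = max(train_as_of_times) + relativedelta(days=1)

test_as_of_times = [datetime.datetime(2010, 1, 6), datetime.datetime(2010, 1, 9)]
overlapping = [t for t in test_as_of_times if t < train_max_data_time]
assert not overlapping  # 2010-01-06 equals train_max_data_time, so no leakage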
Example #17
    def initialize_components(self):
        split_config = self.config["temporal_config"]

        self.chopper = Timechop(**split_config)

        cohort_config = self.config.get("cohort_config", {})
        if "query" in cohort_config:
            self.cohort_table_generator = CohortTableGenerator(
                cohort_table_name=self.cohort_table_name,
                db_engine=self.db_engine,
                query=cohort_config["query"],
                replace=self.replace
            )
        else:
            logging.warning(
                "cohort_config missing or unrecognized. Without a cohort, "
                "you will not be able to make matrices or perform feature imputation."
            )
            self.cohort_table_generator = CohortTableGeneratorNoOp()

        if "label_config" in self.config:
            self.label_generator = LabelGenerator(
                label_name=self.config["label_config"].get("name", None),
                query=self.config["label_config"]["query"],
                replace=self.replace,
                db_engine=self.db_engine,
            )
        else:
            self.label_generator = LabelGeneratorNoOp()
            logging.warning(
                "label_config missing or unrecognized. Without labels, "
                "you will not be able to make matrices."
            )

        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name=self.features_schema_name, db_engine=self.db_engine
        )

        self.feature_generator = FeatureGenerator(
            features_schema_name=self.features_schema_name,
            replace=self.replace,
            db_engine=self.db_engine,
            feature_start_time=split_config["feature_start_time"],
            materialize_subquery_fromobjs=self.materialize_subquery_fromobjs
        )

        self.feature_group_creator = FeatureGroupCreator(
            self.config.get("feature_group_definition", {"all": [True]})
        )

        self.feature_group_mixer = FeatureGroupMixer(
            self.config.get("feature_group_strategies", ["all"])
        )

        self.planner = Planner(
            feature_start_time=dt_from_str(split_config["feature_start_time"]),
            label_names=[
                self.config.get("label_config", {}).get("name", DEFAULT_LABEL_NAME)
            ],
            label_types=["binary"],
            cohort_names=[self.config.get("cohort_config", {}).get("name", None)],
            user_metadata=self.config.get("user_metadata", {}),
        )

        self.matrix_builder = MatrixBuilder(
            db_config={
                "features_schema_name": self.features_schema_name,
                "labels_schema_name": "public",
                "labels_table_name": self.labels_table_name,
                "cohort_table_name": self.cohort_table_name,
            },
            matrix_storage_engine=self.matrix_storage_engine,
            experiment_hash=self.experiment_hash,
            include_missing_labels_in_train_as=self.config.get("label_config", {}).get(
                "include_missing_labels_in_train_as", None
            ),
            engine=self.db_engine,
            replace=self.replace,
        )

        self.trainer = ModelTrainer(
            experiment_hash=self.experiment_hash,
            model_storage_engine=self.model_storage_engine,
            model_grouper=ModelGrouper(self.config.get("model_group_keys", [])),
            db_engine=self.db_engine,
            replace=self.replace,
        )

        self.tester = ModelTester(
            model_storage_engine=self.model_storage_engine,
            matrix_storage_engine=self.matrix_storage_engine,
            replace=self.replace,
            db_engine=self.db_engine,
            individual_importance_config=self.config.get("individual_importance", {}),
            evaluator_config=self.config.get("scoring", {}),
        )
Example #18
# coding: utf-8

from triage.component.timechop.plotting import visualize_chops
from triage.component.timechop import Timechop

import yaml


if __name__ == '__main__':
    with open('simple_config.yaml') as f:
        experiment_config = yaml.safe_load(f)

    chopper = Timechop(**(experiment_config["temporal_config"]))
    visualize_chops(chopper)
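
visualize_chops also accepts a save_target argument for writing the chop plot to a file, as Example #20 below demonstrates; a minimal variant of this script's last line:

    visualize_chops(chopper, save_target='timechop.png')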
Example #19
 def test_look_back_time_before_modeling_start(self):
     expected_result = {
         'feature_start_time': datetime.datetime(1990, 1, 1, 0, 0),
         'label_start_time': datetime.datetime(2010, 1, 1, 0, 0),
         'feature_end_time': datetime.datetime(2010, 1, 11, 0, 0),
         'label_end_time': datetime.datetime(2010, 1, 11, 0, 0),
         'train_matrix': {
             'first_as_of_time': datetime.datetime(2010, 1, 1, 0, 0),
             'last_as_of_time': datetime.datetime(2010, 1, 5, 0, 0),
             'matrix_info_end_time': datetime.datetime(2010, 1, 6, 0, 0),
             'as_of_times': [
                 datetime.datetime(2010, 1, 1, 0, 0),
                 datetime.datetime(2010, 1, 2, 0, 0),
                 datetime.datetime(2010, 1, 3, 0, 0),
                 datetime.datetime(2010, 1, 4, 0, 0),
                 datetime.datetime(2010, 1, 5, 0, 0)
             ],
             'training_label_timespan': '1 day',
             'training_as_of_date_frequency': '1 days',
             'max_training_history': '10 days'
         },
         'test_matrices': [
             {
                 'first_as_of_time': datetime.datetime(2010, 1, 6, 0, 0),
                 'last_as_of_time': datetime.datetime(2010, 1, 9, 0, 0),
                 'matrix_info_end_time': datetime.datetime(2010, 1, 10, 0, 0),
                 'as_of_times': [
                     datetime.datetime(2010, 1, 6, 0, 0),
                     datetime.datetime(2010, 1, 9, 0, 0)
                 ],
                 'test_label_timespan': '1 day',
                 'test_as_of_date_frequency': '3 days',
                 'test_duration': '5 days'
             },
             {
                 'first_as_of_time': datetime.datetime(2010, 1, 6, 0, 0),
                 'last_as_of_time': datetime.datetime(2010, 1, 6, 0, 0),
                 'matrix_info_end_time': datetime.datetime(2010, 1, 7, 0, 0),
                 'as_of_times': [
                     datetime.datetime(2010, 1, 6, 0, 0),
                 ],
                 'test_label_timespan': '1 day',
                 'test_as_of_date_frequency': '6 days',
                 'test_duration': '5 days'
             }
         ]
     }
     chopper = Timechop(
         feature_start_time=datetime.datetime(1990, 1, 1, 0, 0),
         feature_end_time=datetime.datetime(2010, 1, 11, 0, 0),
         label_start_time=datetime.datetime(2010, 1, 1, 0, 0),
         label_end_time=datetime.datetime(2010, 1, 11, 0, 0),
         model_update_frequency='5 days',
         training_as_of_date_frequencies=['1 days'],
         test_as_of_date_frequencies=['3 days', '6 days'],
         max_training_histories=['10 days'],
         test_durations=['5 days'],
         test_label_timespans=['1 day'],
         training_label_timespans=['1 day']
     )
     result = chopper.generate_matrix_definitions(
         train_test_split_time=datetime.datetime(2010, 1, 6, 0, 0),
         training_as_of_date_frequency='1 days',
         max_training_history='10 days',
         test_duration='5 days',
         test_label_timespan='1 day',
         training_label_timespan='1 day'
     )
     assert result == expected_result
Example #20
    host=dbconfig['host'],
    username=dbconfig['user'],
    database=dbconfig['db'],
    password=dbconfig['pass'],
    port=dbconfig['port'],
)

db_engine = create_engine(db_url)

# loading config file

with open('donors-choose-config.yaml', 'r') as fin:
    config = yaml.safe_load(fin)

# generating temporal config plot
chopper = Timechop(**config['temporal_config'])

# We aren't interested in seeing the entire feature_start_time represented
# in our timechop plot. That would hide the interesting information. So we
# set it to equal label_start_time for the plot.

chopper.feature_start_time = chopper.label_start_time

visualize_chops(chopper, save_target='triage_output/timechop.png')

# creating experiment object

experiment = MultiCoreExperiment(
    config=config,
    db_engine=db_engine,
    project_path='s3://dsapp-education-migrated/donors-choose',
Example #21
    def initialize_components(self):
        split_config = self.config['temporal_config']

        self.chopper = Timechop(
            feature_start_time=dt_from_str(split_config['feature_start_time']),
            feature_end_time=dt_from_str(split_config['feature_end_time']),
            label_start_time=dt_from_str(split_config['label_start_time']),
            label_end_time=dt_from_str(split_config['label_end_time']),
            model_update_frequency=split_config['model_update_frequency'],
            training_label_timespans=split_config['training_label_timespans'],
            test_label_timespans=split_config['test_label_timespans'],
            training_as_of_date_frequencies=split_config[
                'training_as_of_date_frequencies'],
            test_as_of_date_frequencies=split_config[
                'test_as_of_date_frequencies'],
            max_training_histories=split_config['max_training_histories'],
            test_durations=split_config['test_durations'],
        )

        cohort_config = self.config.get('cohort_config', {})
        if 'query' in cohort_config:
            self.state_table_generator = StateTableGeneratorFromQuery(
                experiment_hash=self.experiment_hash,
                db_engine=self.db_engine,
                query=cohort_config['query'])
        elif 'entities_table' in cohort_config:
            self.state_table_generator = StateTableGeneratorFromEntities(
                experiment_hash=self.experiment_hash,
                db_engine=self.db_engine,
                entities_table=cohort_config['entities_table'])
        elif 'dense_states' in cohort_config:
            self.state_table_generator = StateTableGeneratorFromDense(
                experiment_hash=self.experiment_hash,
                db_engine=self.db_engine,
                dense_state_table=cohort_config['dense_states']['table_name'])
        else:
            raise ValueError('Cohort config missing or unrecognized')

        self.label_generator = LabelGenerator(
            label_name=self.config['label_config'].get('name', None),
            query=self.config['label_config']['query'],
            db_engine=self.db_engine,
        )

        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name=self.features_schema_name,
            db_engine=self.db_engine,
        )

        self.feature_generator = FeatureGenerator(
            features_schema_name=self.features_schema_name,
            replace=self.replace,
            db_engine=self.db_engine,
            feature_start_time=split_config['feature_start_time'])

        self.feature_group_creator = FeatureGroupCreator(
            self.config.get('feature_group_definition', {'all': [True]}))

        self.feature_group_mixer = FeatureGroupMixer(
            self.config.get('feature_group_strategies', ['all']))

        self.planner = Planner(
            feature_start_time=dt_from_str(split_config['feature_start_time']),
            label_names=[
                self.config.get('label_config',
                                {}).get('name', DEFAULT_LABEL_NAME)
            ],
            label_types=['binary'],
            matrix_directory=self.matrices_directory,
            cohort_name=self.config.get('cohort_config', {}).get('name', None),
            states=self.config.get('cohort_config',
                                   {}).get('dense_states',
                                           {}).get('state_filters', []),
            user_metadata=self.config.get('user_metadata', {}),
        )

        self.matrix_builder = HighMemoryCSVBuilder(
            db_config={
                'features_schema_name': self.features_schema_name,
                'labels_schema_name': 'public',
                'labels_table_name': self.labels_table_name,
                # TODO: have planner/builder take state table later on, so we
                # can grab it from the StateTableGenerator instead of
                # duplicating it here
                'sparse_state_table_name':
                    'tmp_sparse_states_{}'.format(self.experiment_hash),
            },
            matrix_directory=self.matrices_directory,
            include_missing_labels_in_train_as=self.config['label_config'].get(
                'include_missing_labels_in_train_as', None),
            engine=self.db_engine,
            replace=self.replace)

        self.trainer = ModelTrainer(
            project_path=self.project_path,
            experiment_hash=self.experiment_hash,
            model_storage_engine=self.model_storage_engine,
            model_grouper=ModelGrouper(self.config.get('model_group_keys',
                                                       [])),
            db_engine=self.db_engine,
            replace=self.replace)

        self.tester = ModelTester(
            model_storage_engine=self.model_storage_engine,
            project_path=self.project_path,
            replace=self.replace,
            db_engine=self.db_engine,
            individual_importance_config=self.config.get(
                'individual_importance', {}),
            evaluator_config=self.config.get('scoring', {}))
Example #22
 def test_look_back_time_equal_modeling_start(self):
     # TODO: rework this test since the test label window of 3 months
     # cannot be satisfied by the 10 day difference between modeling
     # start and end times, so it's not a very realistic case
     expected_result = {
         "feature_start_time": datetime.datetime(1990, 1, 1, 0, 0),
         "label_start_time": datetime.datetime(2010, 1, 1, 0, 0),
         "feature_end_time": datetime.datetime(2010, 1, 11, 0, 0),
         "label_end_time": datetime.datetime(2010, 1, 11, 0, 0),
         "train_matrix": {
             "first_as_of_time": datetime.datetime(2010, 1, 1, 0, 0),
             "last_as_of_time": datetime.datetime(2010, 1, 5, 0, 0),
             "matrix_info_end_time": datetime.datetime(2010, 1, 6, 0, 0),
             "as_of_times": [
                 datetime.datetime(2010, 1, 1, 0, 0),
                 datetime.datetime(2010, 1, 2, 0, 0),
                 datetime.datetime(2010, 1, 3, 0, 0),
                 datetime.datetime(2010, 1, 4, 0, 0),
                 datetime.datetime(2010, 1, 5, 0, 0),
             ],
             "training_label_timespan": "1 day",
             "training_as_of_date_frequency": "1 days",
             "max_training_history": "5 days",
         },
         "test_matrices": [
             {
                 "first_as_of_time": datetime.datetime(2010, 1, 6, 0, 0),
                 "last_as_of_time": datetime.datetime(2010, 1, 9, 0, 0),
                 "matrix_info_end_time": datetime.datetime(2010, 1, 10, 0, 0),
                 "as_of_times": [
                     datetime.datetime(2010, 1, 6, 0, 0),
                     datetime.datetime(2010, 1, 9, 0, 0),
                 ],
                 "test_label_timespan": "1 day",
                 "test_as_of_date_frequency": "3 days",
                 "test_duration": "5 days",
             }
         ],
     }
     chopper = Timechop(
         feature_start_time=datetime.datetime(1990, 1, 1, 0, 0),
         feature_end_time=datetime.datetime(2010, 1, 11, 0, 0),
         label_start_time=datetime.datetime(2010, 1, 1, 0, 0),
         label_end_time=datetime.datetime(2010, 1, 11, 0, 0),
         model_update_frequency="5 days",
         training_as_of_date_frequencies=["1 days"],
         test_as_of_date_frequencies=["3 days"],
         max_training_histories=["5 days"],
         test_durations=["5 days"],
         test_label_timespans=["1 day"],
         training_label_timespans=["1 day"],
     )
     result = chopper.generate_matrix_definitions(
         train_test_split_time=datetime.datetime(2010, 1, 6, 0, 0),
         training_as_of_date_frequency="1 days",
         max_training_history="5 days",
         test_duration="5 days",
         test_label_timespan="1 day",
         training_label_timespan="1 day",
     )
     assert result == expected_result
Example #23
    def initialize_components(self):
        split_config = self.config['temporal_config']

        self.chopper = Timechop(**split_config)

        cohort_config = self.config.get('cohort_config', {})
        if 'query' in cohort_config:
            self.state_table_generator = StateTableGeneratorFromQuery(
                experiment_hash=self.experiment_hash,
                db_engine=self.db_engine,
                query=cohort_config['query']
            )
        elif 'entities_table' in cohort_config:
            self.state_table_generator = StateTableGeneratorFromEntities(
                experiment_hash=self.experiment_hash,
                db_engine=self.db_engine,
                entities_table=cohort_config['entities_table']
            )
        elif 'dense_states' in cohort_config:
            self.state_table_generator = StateTableGeneratorFromDense(
                experiment_hash=self.experiment_hash,
                db_engine=self.db_engine,
                dense_state_table=cohort_config['dense_states']['table_name']
            )
        else:
            logging.warning('cohort_config missing or unrecognized. Without a cohort, you will not be able to make matrices or perform feature imputation.')
            self.state_table_generator = StateTableGeneratorNoOp()

        if 'label_config' in self.config:
            self.label_generator = LabelGenerator(
                label_name=self.config['label_config'].get('name', None),
                query=self.config['label_config']['query'],
                db_engine=self.db_engine,
            )
        else:
            self.label_generator = LabelGeneratorNoOp()
            logging.warning(
                'label_config missing or unrecognized. Without labels, '
                'you will not be able to make matrices.'
            )

        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name=self.features_schema_name,
            db_engine=self.db_engine,
        )

        self.feature_generator = FeatureGenerator(
            features_schema_name=self.features_schema_name,
            replace=self.replace,
            db_engine=self.db_engine,
            feature_start_time=split_config['feature_start_time']
        )

        self.feature_group_creator = FeatureGroupCreator(
            self.config.get('feature_group_definition', {'all': [True]})
        )

        self.feature_group_mixer = FeatureGroupMixer(
            self.config.get('feature_group_strategies', ['all'])
        )

        self.planner = Planner(
            feature_start_time=dt_from_str(split_config['feature_start_time']),
            label_names=[self.config.get('label_config', {}).get('name', DEFAULT_LABEL_NAME)],
            label_types=['binary'],
            cohort_name=self.config.get('cohort_config', {}).get('name', None),
            states=self.config.get('cohort_config', {}).get('dense_states', {})
            .get('state_filters', []),
            user_metadata=self.config.get('user_metadata', {}),
        )

        self.matrix_builder = MatrixBuilder(
            db_config={
                'features_schema_name': self.features_schema_name,
                'labels_schema_name': 'public',
                'labels_table_name': self.labels_table_name,
                # TODO: have planner/builder take state table later on, so we
                # can grab it from the StateTableGenerator instead of
                # duplicating it here
                'sparse_state_table_name': self.sparse_states_table_name,
            },
            matrix_storage_engine=self.matrix_storage_engine,
            include_missing_labels_in_train_as=self.config.get('label_config', {})
            .get('include_missing_labels_in_train_as', None),
            engine=self.db_engine,
            replace=self.replace
        )

        self.trainer = ModelTrainer(
            experiment_hash=self.experiment_hash,
            model_storage_engine=self.model_storage_engine,
            model_grouper=ModelGrouper(self.config.get('model_group_keys', [])),
            db_engine=self.db_engine,
            replace=self.replace
        )

        self.tester = ModelTester(
            model_storage_engine=self.model_storage_engine,
            matrix_storage_engine=self.matrix_storage_engine,
            replace=self.replace,
            db_engine=self.db_engine,
            individual_importance_config=self.config.get('individual_importance', {}),
            evaluator_config=self.config.get('scoring', {})
        )
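The fallback branches above follow a single NoOp pattern: when a config section is missing, a stand-in object with the same interface as the real component is installed, so downstream steps run unconditionally and simply do nothing. A minimal sketch of that pattern (illustrative only; the actual NoOp classes ship with triage and differ in detail):

import logging


class LabelGeneratorNoOp:
    """Stand-in with the real LabelGenerator's interface that creates no labels."""

    def generate_all_labels(self, labels_table, as_of_dates, label_timespans):
        # same call signature the pipeline uses, but nothing is written
        logging.warning("No label configuration, so no labels will be created")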
Example #24
0
 def test_look_back_time_before_modeling_start(self):
     expected_result = {
         "feature_start_time": datetime.datetime(1990, 1, 1, 0, 0),
         "label_start_time": datetime.datetime(2010, 1, 1, 0, 0),
         "feature_end_time": datetime.datetime(2010, 1, 11, 0, 0),
         "label_end_time": datetime.datetime(2010, 1, 11, 0, 0),
         "train_matrix": {
             "first_as_of_time": datetime.datetime(2010, 1, 1, 0, 0),
             "last_as_of_time": datetime.datetime(2010, 1, 5, 0, 0),
             "matrix_info_end_time": datetime.datetime(2010, 1, 6, 0, 0),
             "as_of_times": [
                 datetime.datetime(2010, 1, 1, 0, 0),
                 datetime.datetime(2010, 1, 2, 0, 0),
                 datetime.datetime(2010, 1, 3, 0, 0),
                 datetime.datetime(2010, 1, 4, 0, 0),
                 datetime.datetime(2010, 1, 5, 0, 0),
             ],
             "training_label_timespan": "1 day",
             "training_as_of_date_frequency": "1 days",
             "max_training_history": "10 days",
         },
         "test_matrices": [
             {
                 "first_as_of_time": datetime.datetime(2010, 1, 6, 0, 0),
                 "last_as_of_time": datetime.datetime(2010, 1, 9, 0, 0),
                 "matrix_info_end_time": datetime.datetime(2010, 1, 10, 0, 0),
                 "as_of_times": [
                     datetime.datetime(2010, 1, 6, 0, 0),
                     datetime.datetime(2010, 1, 9, 0, 0),
                 ],
                 "test_label_timespan": "1 day",
                 "test_as_of_date_frequency": "3 days",
                 "test_duration": "5 days",
             },
             {
                 "first_as_of_time": datetime.datetime(2010, 1, 6, 0, 0),
                 "last_as_of_time": datetime.datetime(2010, 1, 6, 0, 0),
                 "matrix_info_end_time": datetime.datetime(2010, 1, 7, 0, 0),
                 "as_of_times": [datetime.datetime(2010, 1, 6, 0, 0)],
                 "test_label_timespan": "1 day",
                 "test_as_of_date_frequency": "6 days",
                 "test_duration": "5 days",
             },
         ],
     }
     chopper = Timechop(
         feature_start_time=datetime.datetime(1990, 1, 1, 0, 0),
         feature_end_time=datetime.datetime(2010, 1, 11, 0, 0),
         label_start_time=datetime.datetime(2010, 1, 1, 0, 0),
         label_end_time=datetime.datetime(2010, 1, 11, 0, 0),
         model_update_frequency="5 days",
         training_as_of_date_frequencies=["1 days"],
         test_as_of_date_frequencies=["3 days", "6 days"],
         max_training_histories=["10 days"],
         test_durations=["5 days"],
         test_label_timespans=["1 day"],
         training_label_timespans=["1 day"],
     )
     result = chopper.generate_matrix_definitions(
         train_test_split_time=datetime.datetime(2010, 1, 6, 0, 0),
         training_as_of_date_frequency="1 days",
         max_training_history="10 days",
         test_duration="5 days",
         test_label_timespan="1 day",
         training_label_timespan="1 day",
     )
     assert result == expected_result
Example #25
0
def basic_integration_test(
    cohort_names,
    feature_group_create_rules,
    feature_group_mix_rules,
    expected_matrix_multiplier,
    expected_group_lists,
):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        Base.metadata.create_all(db_engine)
        populate_source_data(db_engine)

        with TemporaryDirectory() as temp_dir:
            chopper = Timechop(
                feature_start_time=datetime(2010, 1, 1),
                feature_end_time=datetime(2014, 1, 1),
                label_start_time=datetime(2011, 1, 1),
                label_end_time=datetime(2014, 1, 1),
                model_update_frequency="1year",
                training_label_timespans=["6months"],
                test_label_timespans=["6months"],
                training_as_of_date_frequencies="1day",
                test_as_of_date_frequencies="3months",
                max_training_histories=["1months"],
                test_durations=["1months"],
            )

            entity_date_table_generator = EntityDateTableGenerator(
                db_engine=db_engine,
                entity_date_table_name="cohort_abcd",
                query="select distinct(entity_id) from events")

            label_generator = LabelGenerator(
                db_engine=db_engine,
                query=sample_config()["label_config"]["query"])

            feature_generator = FeatureGenerator(
                db_engine=db_engine,
                features_schema_name="features",
                replace=True)

            feature_dictionary_creator = FeatureDictionaryCreator(
                db_engine=db_engine, features_schema_name="features")

            feature_group_creator = FeatureGroupCreator(
                feature_group_create_rules)

            feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules)
            project_storage = ProjectStorage(temp_dir)
            planner = Planner(
                feature_start_time=datetime(2010, 1, 1),
                label_names=["outcome"],
                label_types=["binary"],
                cohort_names=cohort_names,
                user_metadata={},
            )

            builder = MatrixBuilder(
                engine=db_engine,
                db_config={
                    "features_schema_name": "features",
                    "labels_schema_name": "public",
                    "labels_table_name": "labels",
                    "cohort_table_name": "cohort_abcd",
                },
                experiment_hash=None,
                matrix_storage_engine=project_storage.matrix_storage_engine(),
                replace=True,
            )

            # chop time
            split_definitions = chopper.chop_time()
            num_split_matrices = sum(1 + len(split["test_matrices"])
                                     for split in split_definitions)

            # generate as_of_times for feature/label/state generation
            all_as_of_times = []
            for split in split_definitions:
                all_as_of_times.extend(split["train_matrix"]["as_of_times"])
                for test_matrix in split["test_matrices"]:
                    all_as_of_times.extend(test_matrix["as_of_times"])
            all_as_of_times = list(set(all_as_of_times))

            # generate entity_date state table
            entity_date_table_generator.generate_entity_date_table(
                as_of_dates=all_as_of_times)

            # create labels table
            label_generator.generate_all_labels(
                labels_table="labels",
                as_of_dates=all_as_of_times,
                label_timespans=["6months"],
            )

            # create feature table tasks
            # we would use FeatureGenerator#create_all_tables but want to use
            # the tasks dict directly to create a feature dict
            aggregations = feature_generator.aggregations(
                feature_aggregation_config=[
                    {
                        "prefix": "cat",
                        "from_obj": "cat_complaints",
                        "knowledge_date_column": "as_of_date",
                        "aggregates": [{
                            "quantity": "cat_sightings",
                            "metrics": ["count", "avg"],
                            "imputation": {"all": {"type": "mean"}},
                        }],
                        "intervals": ["1y"],
                        "groups": ["entity_id"],
                    },
                    {
                        "prefix": "dog",
                        "from_obj": "dog_complaints",
                        "knowledge_date_column": "as_of_date",
                        "aggregates_imputation": {
                            "count": {"type": "constant", "value": 7},
                            "sum": {"type": "mean"},
                            "avg": {"type": "zero"},
                        },
                        "aggregates": [{
                            "quantity": "dog_sightings",
                            "metrics": ["count", "avg"],
                        }],
                        "intervals": ["1y"],
                        "groups": ["entity_id"],
                    },
                ],
                feature_dates=all_as_of_times,
                state_table=entity_date_table_generator.entity_date_table_name,
            )
            feature_table_agg_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type="aggregation")

            # create feature aggregation tables
            feature_generator.process_table_tasks(feature_table_agg_tasks)

            feature_table_imp_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type="imputation")

            # create feature imputation tables
            feature_generator.process_table_tasks(feature_table_imp_tasks)

            # build feature dictionaries from feature tables and
            # subsetting config
            master_feature_dict = feature_dictionary_creator.feature_dictionary(
                feature_table_names=feature_table_imp_tasks.keys(),
                index_column_lookup=feature_generator.index_column_lookup(
                    aggregations),
            )

            feature_dicts = feature_group_mixer.generate(
                feature_group_creator.subsets(master_feature_dict))

            # figure out what matrices need to be built
            _, matrix_build_tasks = planner.generate_plans(
                split_definitions, feature_dicts)

            # go and build the matrices
            builder.build_all_matrices(matrix_build_tasks)

            # super basic assertion: did matrices we expect get created?
            matrices_records = list(
                db_engine.execute(
                    """select matrix_uuid, num_observations, matrix_type
                    from triage_metadata.matrices
                    """))
            matrix_directory = os.path.join(temp_dir, "matrices")
            matrices = [
                path for path in os.listdir(matrix_directory) if ".csv" in path
            ]
            metadatas = [
                path for path in os.listdir(matrix_directory)
                if ".yaml" in path
            ]
            assert len(matrices) == num_split_matrices * expected_matrix_multiplier
            assert len(metadatas) == num_split_matrices * expected_matrix_multiplier
            assert len(matrices) == len(matrices_records)
            feature_group_name_lists = []
            for metadata_path in metadatas:
                with open(os.path.join(matrix_directory, metadata_path)) as f:
                    metadata = yaml.full_load(f)
                    feature_group_name_lists.append(metadata["feature_groups"])

            for matrix_uuid, num_observations, matrix_type in matrices_records:
                assert matrix_uuid in matrix_build_tasks  # the hashes of the matrices
                assert type(num_observations) is int
                assert matrix_type == matrix_build_tasks[matrix_uuid]["matrix_type"]

            def deep_unique_tuple(lists):
                return {tuple(i) for i in lists}

            assert deep_unique_tuple(
                feature_group_name_lists) == deep_unique_tuple(
                    expected_group_lists)
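A hypothetical invocation of basic_integration_test (all argument values here are invented for illustration): one feature group per table prefix, mixed only with the "all" strategy, so each split definition should yield exactly one matrix.

basic_integration_test(
    cohort_names=["mycohort"],
    feature_group_create_rules={"prefix": ["cat", "dog"]},
    feature_group_mix_rules=["all"],
    expected_matrix_multiplier=1,
    expected_group_lists=[["prefix: cat", "prefix: dog"]],
)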
Example #26
0
 def test_unevenly_divisible_lookback_duration(self):
     expected_result = [
         {
             "feature_start_time": datetime.datetime(1990, 1, 1, 0, 0),
             "label_start_time": datetime.datetime(2010, 1, 1, 0, 0),
             "feature_end_time": datetime.datetime(2010, 1, 16, 0, 0),
             "label_end_time": datetime.datetime(2010, 1, 16, 0, 0),
             "train_matrix": {
                 "first_as_of_time": datetime.datetime(2010, 1, 1, 0, 0),
                 "last_as_of_time": datetime.datetime(2010, 1, 4, 0, 0),
                 "matrix_info_end_time": datetime.datetime(2010, 1, 5, 0, 0),
                 "as_of_times": [
                     datetime.datetime(2010, 1, 1, 0, 0),
                     datetime.datetime(2010, 1, 2, 0, 0),
                     datetime.datetime(2010, 1, 3, 0, 0),
                     datetime.datetime(2010, 1, 4, 0, 0),
                 ],
                 "training_label_timespan": "1 day",
                 "training_as_of_date_frequency": "1 days",
                 "max_training_history": "7 days",
             },
             "test_matrices": [
                 {
                     "first_as_of_time": datetime.datetime(2010, 1, 5, 0, 0),
                     "last_as_of_time": datetime.datetime(2010, 1, 9, 0, 0),
                     "matrix_info_end_time": datetime.datetime(2010, 1, 10, 0, 0),
                     "as_of_times": [
                         datetime.datetime(2010, 1, 5, 0, 0),
                         datetime.datetime(2010, 1, 6, 0, 0),
                         datetime.datetime(2010, 1, 7, 0, 0),
                         datetime.datetime(2010, 1, 8, 0, 0),
                         datetime.datetime(2010, 1, 9, 0, 0),
                     ],
                     "test_label_timespan": "1 day",
                     "test_as_of_date_frequency": "1 days",
                     "test_duration": "5 days",
                 }
             ],
         },
         {
             "feature_start_time": datetime.datetime(1990, 1, 1, 0, 0),
             "label_start_time": datetime.datetime(2010, 1, 1, 0, 0),
             "feature_end_time": datetime.datetime(2010, 1, 16, 0, 0),
             "label_end_time": datetime.datetime(2010, 1, 16, 0, 0),
             "train_matrix": {
                 "first_as_of_time": datetime.datetime(2010, 1, 2, 0, 0),
                 "last_as_of_time": datetime.datetime(2010, 1, 9, 0, 0),
                 "matrix_info_end_time": datetime.datetime(2010, 1, 10, 0, 0),
                 "as_of_times": [
                     datetime.datetime(2010, 1, 2, 0, 0),
                     datetime.datetime(2010, 1, 3, 0, 0),
                     datetime.datetime(2010, 1, 4, 0, 0),
                     datetime.datetime(2010, 1, 5, 0, 0),
                     datetime.datetime(2010, 1, 6, 0, 0),
                     datetime.datetime(2010, 1, 7, 0, 0),
                     datetime.datetime(2010, 1, 8, 0, 0),
                     datetime.datetime(2010, 1, 9, 0, 0),
                 ],
                 "training_label_timespan": "1 day",
                 "training_as_of_date_frequency": "1 days",
                 "max_training_history": "7 days",
             },
             "test_matrices": [
                 {
                     "first_as_of_time": datetime.datetime(2010, 1, 10, 0, 0),
                     "last_as_of_time": datetime.datetime(2010, 1, 14, 0, 0),
                     "matrix_info_end_time": datetime.datetime(2010, 1, 15, 0, 0),
                     "as_of_times": [
                         datetime.datetime(2010, 1, 10, 0, 0),
                         datetime.datetime(2010, 1, 11, 0, 0),
                         datetime.datetime(2010, 1, 12, 0, 0),
                         datetime.datetime(2010, 1, 13, 0, 0),
                         datetime.datetime(2010, 1, 14, 0, 0),
                     ],
                     "test_label_timespan": "1 day",
                     "test_as_of_date_frequency": "1 days",
                     "test_duration": "5 days",
                 }
             ],
         },
     ]
     chopper = Timechop(
         feature_start_time=datetime.datetime(1990, 1, 1, 0, 0),
         feature_end_time=datetime.datetime(2010, 1, 16, 0, 0),
         label_start_time=datetime.datetime(2010, 1, 1, 0, 0),
         label_end_time=datetime.datetime(2010, 1, 16, 0, 0),
         model_update_frequency="5 days",
         training_as_of_date_frequencies=["1 days"],
         test_as_of_date_frequencies=["1 days"],
         max_training_histories=["7 days"],
         test_durations=["5 days"],
         test_label_timespans=["1 day"],
         training_label_timespans=["1 day"],
     )
     result = chopper.chop_time()
     assert result == expected_result
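One property worth noting in the expected splits above: even though the "7 days" lookback does not divide the timeline evenly, training information always ends by the time the test window begins. An illustrative invariant check consistent with those expected results (not part of the original test):

for split in chopper.chop_time():
    train = split["train_matrix"]
    for test_matrix in split["test_matrices"]:
        # a training matrix's information window never overlaps its test window
        assert train["matrix_info_end_time"] <= test_matrix["first_as_of_time"]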
Example #27
0
    def initialize_components(self):
        split_config = self.config["temporal_config"]

        self.chopper = Timechop(**split_config)

        cohort_config = self.config.get("cohort_config", {})
        if "query" in cohort_config:
            self.cohort_table_name = "cohort_{}_{}".format(
                cohort_config.get('name', 'default'), self.cohort_hash)
            self.cohort_table_generator = EntityDateTableGenerator(
                entity_date_table_name=self.cohort_table_name,
                db_engine=self.db_engine,
                query=cohort_config["query"],
                replace=self.replace)
        else:
            logging.warning(
                "cohort_config missing or unrecognized. Without a cohort, "
                "you will not be able to make matrices, perform feature imputation, "
                "or save time by only computing features for that cohort.")
            self.features_ignore_cohort = True
            self.cohort_table_name = "cohort_{}".format(self.experiment_hash)
            self.cohort_table_generator = EntityDateTableGeneratorNoOp()

        self.subsets = [None] + self.config.get("scoring", {}).get(
            "subsets", [])

        if "label_config" in self.config:
            label_config = self.config["label_config"]
            self.labels_table_name = "labels_{}_{}".format(
                label_config.get('name', 'default'),
                filename_friendly_hash(label_config['query']))
            self.label_generator = LabelGenerator(
                label_name=label_config.get("name", None),
                query=label_config["query"],
                replace=self.replace,
                db_engine=self.db_engine,
            )
        else:
            self.labels_table_name = "labels_{}".format(self.experiment_hash)
            self.label_generator = LabelGeneratorNoOp()
            logging.warning(
                "label_config missing or unrecognized. Without labels, "
                "you will not be able to make matrices.")

        if "bias_audit_config" in self.config:
            bias_config = self.config["bias_audit_config"]
            self.bias_hash = filename_friendly_hash(bias_config)
            self.protected_groups_table_name = f"protected_groups_{self.bias_hash}"
            self.protected_groups_generator = ProtectedGroupsGenerator(
                db_engine=self.db_engine,
                from_obj=parse_from_obj(bias_config, 'bias_from_obj'),
                attribute_columns=bias_config.get("attribute_columns", None),
                entity_id_column=bias_config.get("entity_id_column", None),
                knowledge_date_column=bias_config.get("knowledge_date_column",
                                                      None),
                protected_groups_table_name=self.protected_groups_table_name,
                replace=self.replace)
        else:
            self.protected_groups_generator = ProtectedGroupsGeneratorNoOp()
            logging.warning(
                "bias_audit_config missing or unrecognized. Without protected groups, "
                "you will not audit your models for bias and fairness.")

        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name=self.features_schema_name,
            db_engine=self.db_engine)

        self.feature_generator = FeatureGenerator(
            features_schema_name=self.features_schema_name,
            replace=self.replace,
            db_engine=self.db_engine,
            feature_start_time=split_config["feature_start_time"],
            materialize_subquery_fromobjs=self.materialize_subquery_fromobjs,
            features_ignore_cohort=self.features_ignore_cohort)

        self.feature_group_creator = FeatureGroupCreator(
            self.config.get("feature_group_definition", {"all": [True]}))

        self.feature_group_mixer = FeatureGroupMixer(
            self.config.get("feature_group_strategies", ["all"]))

        self.planner = Planner(
            feature_start_time=dt_from_str(split_config["feature_start_time"]),
            label_names=[
                self.config.get("label_config",
                                {}).get("name", DEFAULT_LABEL_NAME)
            ],
            label_types=["binary"],
            cohort_names=[
                self.config.get("cohort_config", {}).get("name", None)
            ],
            user_metadata=self.config.get("user_metadata", {}),
        )

        self.matrix_builder = MatrixBuilder(
            db_config={
                "features_schema_name": self.features_schema_name,
                "labels_schema_name": "public",
                "labels_table_name": self.labels_table_name,
                "cohort_table_name": self.cohort_table_name,
            },
            matrix_storage_engine=self.matrix_storage_engine,
            experiment_hash=self.experiment_hash,
            include_missing_labels_in_train_as=self.config.get(
                "label_config", {}).get("include_missing_labels_in_train_as",
                                        None),
            engine=self.db_engine,
            replace=self.replace,
            run_id=self.run_id,
        )

        self.subsetter = Subsetter(db_engine=self.db_engine,
                                   replace=self.replace,
                                   as_of_times=self.all_as_of_times)

        self.trainer = ModelTrainer(
            experiment_hash=self.experiment_hash,
            model_storage_engine=self.model_storage_engine,
            model_grouper=ModelGrouper(self.config.get("model_group_keys",
                                                       [])),
            db_engine=self.db_engine,
            replace=self.replace,
            run_id=self.run_id,
        )

        self.predictor = Predictor(
            db_engine=self.db_engine,
            model_storage_engine=self.model_storage_engine,
            save_predictions=self.save_predictions,
            replace=self.replace,
            rank_order=self.config.get("prediction",
                                       {}).get("rank_tiebreaker", "worst"),
        )

        self.individual_importance_calculator = IndividualImportanceCalculator(
            db_engine=self.db_engine,
            n_ranks=self.config.get("individual_importance",
                                    {}).get("n_ranks", 5),
            methods=self.config.get("individual_importance",
                                    {}).get("methods", ["uniform"]),
            replace=self.replace,
        )

        self.evaluator = ModelEvaluator(
            db_engine=self.db_engine,
            testing_metric_groups=self.config.get("scoring", {}).get(
                "testing_metric_groups", []),
            training_metric_groups=self.config.get("scoring", {}).get(
                "training_metric_groups", []),
            bias_config=self.config.get("bias_audit_config", {}))

        self.model_train_tester = ModelTrainTester(
            matrix_storage_engine=self.matrix_storage_engine,
            model_evaluator=self.evaluator,
            model_trainer=self.trainer,
            individual_importance_calculator=self.individual_importance_calculator,
            predictor=self.predictor,
            subsets=self.subsets,
            protected_groups_generator=self.protected_groups_generator,
            cohort_hash=self.cohort_hash)
Example #28
0
    def retrain(self, prediction_date):
        """Retrain a model by going back one split from prediction_date, so the as_of_date for training would be (prediction_date - training_label_timespan)
        
        Args:
            prediction_date(str) 
        """
        # Retrain config and hash
        retrain_config = {
            "model_group_id": self.model_group_id,
            "prediction_date": prediction_date,
            "test_label_timespan": self.test_label_timespan,
            "test_duration": self.test_duration,
        }
        self.retrain_hash = save_retrain_and_get_hash(retrain_config,
                                                      self.db_engine)

        with get_for_update(self.db_engine, Retrain,
                            self.retrain_hash) as retrain:
            retrain.prediction_date = prediction_date

        # Timechop
        prediction_date = dt_from_str(prediction_date)
        temporal_config = self.get_temporal_config_for_retrain(prediction_date)
        timechopper = Timechop(**temporal_config)
        chops = timechopper.chop_time()
        assert len(chops) == 1
        chops_train_matrix = chops[0]['train_matrix']
        as_of_date = chops_train_matrix['last_as_of_time'].strftime("%Y-%m-%d")
        retrain_definition = {
            'first_as_of_time': chops_train_matrix['first_as_of_time'],
            'last_as_of_time': chops_train_matrix['last_as_of_time'],
            'matrix_info_end_time': chops_train_matrix['matrix_info_end_time'],
            'as_of_times': [as_of_date],
            'training_label_timespan': chops_train_matrix['training_label_timespan'],
            'max_training_history': chops_train_matrix['max_training_history'],
            'training_as_of_date_frequency': chops_train_matrix['training_as_of_date_frequency'],
        }

        # Set ExperimentRun
        run = TriageRun(
            start_time=datetime.now(),
            git_hash=infer_git_hash(),
            triage_version=infer_triage_version(),
            python_version=infer_python_version(),
            run_type="retrain",
            run_hash=self.retrain_hash,
            last_updated_time=datetime.now(),
            current_status=TriageRunStatus.started,
            installed_libraries=infer_installed_libraries(),
            platform=platform.platform(),
            os_user=getpass.getuser(),
            working_directory=os.getcwd(),
            ec2_instance_type=infer_ec2_instance_type(),
            log_location=infer_log_location(),
            experiment_class_path=classpath(self.__class__),
            random_seed=retrieve_experiment_seed_from_run_id(
                self.db_engine, self.triage_run_id),
        )
        run_id = None
        with scoped_session(self.db_engine) as session:
            session.add(run)
            session.commit()
            run_id = run.run_id
        if not run_id:
            raise ValueError("Failed to retrieve run_id from saved row")

        # set ModelTrainer's run_id and experiment_hash for Retrain run
        self.model_trainer.run_id = run_id
        self.model_trainer.experiment_hash = self.retrain_hash

        # 1. Generate all labels
        self.generate_all_labels(as_of_date)
        record_labels_table_name(run_id, self.db_engine,
                                 self.labels_table_name)

        # 2. Generate cohort
        cohort_table_name = f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_retrain"
        self.generate_entity_date_table(as_of_date, cohort_table_name)
        record_cohort_table_name(run_id, self.db_engine, cohort_table_name)

        # 3. Generate feature aggregations
        collate_aggregations = self.get_collate_aggregations(
            as_of_date, cohort_table_name)
        feature_aggregation_table_tasks = self.feature_generator.generate_all_table_tasks(
            collate_aggregations, task_type='aggregation')
        self.feature_generator.process_table_tasks(
            feature_aggregation_table_tasks)

        # 4. Reconstruct feature dictionary from feature_names and generate imputation tasks
        reconstructed_feature_dict, imputation_table_tasks = self.get_feature_dict_and_imputation_task(
            collate_aggregations,
            self.model_group_info['model_id_last_split'],
        )
        feature_group_creator = FeatureGroupCreator(
            self.experiment_config['feature_group_definition'])
        feature_group_mixer = FeatureGroupMixer(["all"])
        feature_group_dict = feature_group_mixer.generate(
            feature_group_creator.subsets(reconstructed_feature_dict))[0]
        self.feature_generator.process_table_tasks(imputation_table_tasks)
        # 5. Build new matrix
        db_config = {
            "features_schema_name": "triage_production",
            "labels_schema_name": "public",
            "cohort_table_name": cohort_table_name,
            "labels_table_name": self.labels_table_name,
        }

        record_matrix_building_started(run_id, self.db_engine)
        matrix_builder = MatrixBuilder(
            db_config=db_config,
            matrix_storage_engine=self.matrix_storage_engine,
            engine=self.db_engine,
            experiment_hash=None,
            replace=True,
        )
        new_matrix_metadata = Planner.make_metadata(
            matrix_definition=retrain_definition,
            feature_dictionary=feature_group_dict,
            label_name=self.label_name,
            label_type='binary',
            cohort_name=self.cohort_name,
            matrix_type='train',
            feature_start_time=dt_from_str(self.feature_start_time),
            user_metadata=self.user_metadata,
        )

        new_matrix_metadata['matrix_id'] = "_".join([
            self.label_name,
            'binary',
            str(as_of_date),
            'retrain',
        ])

        matrix_uuid = filename_friendly_hash(new_matrix_metadata)
        matrix_builder.build_matrix(
            as_of_times=[as_of_date],
            label_name=self.label_name,
            label_type='binary',
            feature_dictionary=feature_group_dict,
            matrix_metadata=new_matrix_metadata,
            matrix_uuid=matrix_uuid,
            matrix_type="train",
        )
        retrain_model_comment = 'retrain_' + str(datetime.now())

        misc_db_parameters = {
            'train_end_time': dt_from_str(as_of_date),
            'test': False,
            'train_matrix_uuid': matrix_uuid,
            'training_label_timespan': self.training_label_timespan,
            'model_comment': retrain_model_comment,
        }

        # get the random seed from the last split
        last_split_train_matrix_uuid, last_split_matrix_metadata = train_matrix_info_from_model_id(
            self.db_engine,
            model_id=self.model_group_info['model_id_last_split'])

        random_seed = self.model_trainer.get_or_generate_random_seed(
            model_group_id=self.model_group_id,
            matrix_metadata=last_split_matrix_metadata,
            train_matrix_uuid=last_split_train_matrix_uuid)

        # create retrain model hash
        retrain_model_hash = self.model_trainer._model_hash(
            self.matrix_storage_engine.get_store(matrix_uuid).metadata,
            class_path=self.model_group_info['model_type'],
            parameters=self.model_group_info['hyperparameters'],
            random_seed=random_seed,
        )

        associate_models_with_retrain(self.retrain_hash,
                                      (retrain_model_hash, ), self.db_engine)

        record_model_building_started(run_id, self.db_engine)
        retrain_model_id = self.model_trainer.process_train_task(
            matrix_store=self.matrix_storage_engine.get_store(matrix_uuid),
            class_path=self.model_group_info['model_type'],
            parameters=self.model_group_info['hyperparameters'],
            model_hash=retrain_model_hash,
            misc_db_parameters=misc_db_parameters,
            random_seed=random_seed,
            retrain=True,
            model_group_id=self.model_group_id)

        self.retrain_model_hash = retrieve_model_hash_from_id(
            self.db_engine, retrain_model_id)
        self.retrain_matrix_uuid = matrix_uuid
        self.retrain_model_id = retrain_model_id
        return {
            'retrain_model_comment': retrain_model_comment,
            'retrain_model_id': retrain_model_id
        }
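A hypothetical call (the instance name and date are invented for illustration): retrain one split back from mid-2021, then inspect the identifiers the method returns.

result = retrainer.retrain(prediction_date="2021-06-01")
print(result["retrain_model_id"], result["retrain_model_comment"])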