Esempio n. 1
0
def basic_integration_test(state_filters, feature_group_create_rules,
                           feature_group_mix_rules,
                           expected_matrix_multiplier):
    """Build matrices end to end on a temporary database and count the output.

    Spins up a throwaway PostgreSQL instance, populates source data,
    generates the sparse state table, labels and feature tables, plans and
    builds the matrices, then asserts that the expected number of ``.csv``
    and ``.yaml`` files were written to the matrix directory.

    Args:
        state_filters: passed to ``Planner`` as ``states``.
        feature_group_create_rules: passed to ``FeatureGroupCreator``.
        feature_group_mix_rules: passed to ``FeatureGroupMixer``.
        expected_matrix_multiplier: factor applied to the number of
            train/test matrices in the time splits to obtain the expected
            count of matrix files and of metadata files.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        Base.metadata.create_all(db_engine)
        populate_source_data(db_engine)

        with TemporaryDirectory() as temp_dir:
            # one path, shared by the planner and the final assertions
            matrix_directory = os.path.join(temp_dir, 'matrices')

            chopper = Timechop(
                feature_start_time=datetime(2010, 1, 1),
                feature_end_time=datetime(2014, 1, 1),
                label_start_time=datetime(2011, 1, 1),
                label_end_time=datetime(2014, 1, 1),
                model_update_frequency='1year',
                training_label_timespans=['6months'],
                test_label_timespans=['6months'],
                training_as_of_date_frequencies='1day',
                test_as_of_date_frequencies='3months',
                max_training_histories=['1months'],
                test_durations=['1months'],
            )

            state_table_generator = StateTableGenerator(
                db_engine=db_engine,
                experiment_hash='abcd',
                dense_state_table='states',
            )

            label_generator = BinaryLabelGenerator(db_engine=db_engine,
                                                   events_table='events')

            feature_generator = FeatureGenerator(
                db_engine=db_engine,
                features_schema_name='features',
                replace=True,
            )

            feature_dictionary_creator = FeatureDictionaryCreator(
                db_engine=db_engine, features_schema_name='features')

            feature_group_creator = FeatureGroupCreator(
                feature_group_create_rules)

            feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules)

            planner = Planner(engine=db_engine,
                              feature_start_time=datetime(2010, 1, 1),
                              label_names=['outcome'],
                              label_types=['binary'],
                              db_config={
                                  'features_schema_name': 'features',
                                  'labels_schema_name': 'public',
                                  'labels_table_name': 'labels',
                                  'sparse_state_table_name':
                                  'tmp_sparse_states_abcd',
                              },
                              matrix_directory=matrix_directory,
                              states=state_filters,
                              user_metadata={},
                              replace=True)

            # chop time
            split_definitions = chopper.chop_time()
            # each split contributes one train matrix plus its test matrices
            num_split_matrices = sum(1 + len(split['test_matrices'])
                                     for split in split_definitions)

            # generate as_of_times for feature/label/state generation
            all_as_of_times = []
            for split in split_definitions:
                all_as_of_times.extend(split['train_matrix']['as_of_times'])
                for test_matrix in split['test_matrices']:
                    all_as_of_times.extend(test_matrix['as_of_times'])
            all_as_of_times = list(set(all_as_of_times))

            # single source of truth: the config that gets validated below
            # is the very same object used to build the aggregations
            feature_aggregation_config = [{
                'prefix': 'cat',
                'from_obj': 'cat_complaints',
                'knowledge_date_column': 'as_of_date',
                'aggregates': [{
                    'quantity': 'cat_sightings',
                    'metrics': ['count', 'avg'],
                    'imputation': {
                        'all': {
                            'type': 'mean'
                        }
                    }
                }],
                'intervals': ['1y'],
                'groups': ['entity_id']
            }, {
                'prefix': 'dog',
                'from_obj': 'dog_complaints',
                'knowledge_date_column': 'as_of_date',
                'aggregates_imputation': {
                    'count': {
                        'type': 'constant',
                        'value': 7
                    },
                    'sum': {
                        'type': 'mean'
                    },
                    'avg': {
                        'type': 'zero'
                    }
                },
                'aggregates': [{
                    'quantity': 'dog_sightings',
                    'metrics': ['count', 'avg'],
                }],
                'intervals': ['1y'],
                'groups': ['entity_id']
            }]

            state_table_generator.validate()
            label_generator.validate()
            feature_generator.validate(feature_aggregation_config)
            feature_group_creator.validate()
            planner.validate()

            # generate sparse state table
            state_table_generator.generate_sparse_table(
                as_of_dates=all_as_of_times)

            # create labels table
            label_generator.generate_all_labels(labels_table='labels',
                                                as_of_dates=all_as_of_times,
                                                label_timespans=['6months'])

            # create feature table tasks
            # we would use FeatureGenerator#create_all_tables but want to use
            # the tasks dict directly to create a feature dict
            aggregations = feature_generator.aggregations(
                feature_aggregation_config=feature_aggregation_config,
                feature_dates=all_as_of_times,
                state_table=state_table_generator.sparse_table_name)
            feature_table_agg_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type='aggregation')

            # create feature aggregation tables
            feature_generator.process_table_tasks(feature_table_agg_tasks)

            feature_table_imp_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type='imputation')

            # create feature imputation tables
            feature_generator.process_table_tasks(feature_table_imp_tasks)

            # build feature dictionaries from feature tables and
            # subsetting config
            master_feature_dict = feature_dictionary_creator.feature_dictionary(
                feature_table_names=feature_table_imp_tasks.keys(),
                index_column_lookup=feature_generator.index_column_lookup(
                    aggregations))

            feature_dicts = feature_group_mixer.generate(
                feature_group_creator.subsets(master_feature_dict))

            # figure out what matrices need to be built
            _, matrix_build_tasks = planner.generate_plans(
                split_definitions, feature_dicts)

            # go and build the matrices
            planner.build_all_matrices(matrix_build_tasks)

            # super basic assertion: did matrices we expect get created?
            # list the directory once and classify the entries by name
            written_files = os.listdir(matrix_directory)
            matrices = [path for path in written_files if '.csv' in path]
            metadatas = [path for path in written_files if '.yaml' in path]
            expected_count = num_split_matrices * expected_matrix_multiplier
            assert len(matrices) == expected_count
            assert len(metadatas) == expected_count
Esempio n. 2
0
    def initialize_components(self):
        """Build every pipeline component from ``self.config`` and attach it to ``self``.

        The cohort generator falls back to a no-op (with a logged warning)
        when ``cohort_config`` has no ``query`` key; the label and protected
        groups generators fall back to no-ops (with a logged warning) when
        ``label_config`` / ``bias_audit_config`` are absent from the config.
        """
        config = self.config
        split_config = config["temporal_config"]
        self.chopper = Timechop(**split_config)

        # --- cohort ---
        cohort_config = config.get("cohort_config", {})
        if "query" not in cohort_config:
            logging.warning(
                "cohort_config missing or unrecognized. Without a cohort, "
                "you will not be able to make matrices, perform feature imputation, "
                "or save time by only computing features for that cohort.")
            self.features_ignore_cohort = True
            self.cohort_table_name = f"cohort_{self.experiment_hash}"
            self.cohort_table_generator = EntityDateTableGeneratorNoOp()
        else:
            cohort_label = cohort_config.get("name", "default")
            self.cohort_table_name = f"cohort_{cohort_label}_{self.cohort_hash}"
            self.cohort_table_generator = EntityDateTableGenerator(
                entity_date_table_name=self.cohort_table_name,
                db_engine=self.db_engine,
                query=cohort_config["query"],
                replace=self.replace,
            )

        # --- scoring subsets (None is always the first entry) ---
        scoring_config = config.get("scoring", {})
        self.subsets = [None] + scoring_config.get("subsets", [])

        # --- labels ---
        label_config = config.get("label_config", {})
        if "label_config" in config:
            label_table_prefix = label_config.get("name", "default")
            label_query_hash = filename_friendly_hash(label_config["query"])
            self.labels_table_name = (
                f"labels_{label_table_prefix}_{label_query_hash}")
            self.label_generator = LabelGenerator(
                label_name=label_config.get("name"),
                query=label_config["query"],
                replace=self.replace,
                db_engine=self.db_engine,
            )
        else:
            self.labels_table_name = f"labels_{self.experiment_hash}"
            self.label_generator = LabelGeneratorNoOp()
            logging.warning(
                "label_config missing or unrecognized. Without labels, "
                "you will not be able to make matrices.")

        # --- protected groups for bias auditing ---
        bias_config = config.get("bias_audit_config", {})
        if "bias_audit_config" in config:
            self.bias_hash = filename_friendly_hash(bias_config)
            self.protected_groups_table_name = (
                f"protected_groups_{self.bias_hash}")
            self.protected_groups_generator = ProtectedGroupsGenerator(
                db_engine=self.db_engine,
                from_obj=parse_from_obj(bias_config, 'bias_from_obj'),
                attribute_columns=bias_config.get("attribute_columns"),
                entity_id_column=bias_config.get("entity_id_column"),
                knowledge_date_column=bias_config.get("knowledge_date_column"),
                protected_groups_table_name=self.protected_groups_table_name,
                replace=self.replace,
            )
        else:
            self.protected_groups_generator = ProtectedGroupsGeneratorNoOp()
            logging.warning(
                "bias_audit_config missing or unrecognized. Without protected groups, "
                "you will not audit your models for bias and fairness.")

        # --- features ---
        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name=self.features_schema_name,
            db_engine=self.db_engine,
        )

        self.feature_generator = FeatureGenerator(
            features_schema_name=self.features_schema_name,
            replace=self.replace,
            db_engine=self.db_engine,
            feature_start_time=split_config["feature_start_time"],
            materialize_subquery_fromobjs=self.materialize_subquery_fromobjs,
            features_ignore_cohort=self.features_ignore_cohort,
        )

        group_definition = config.get("feature_group_definition",
                                      {"all": [True]})
        self.feature_group_creator = FeatureGroupCreator(group_definition)

        group_strategies = config.get("feature_group_strategies", ["all"])
        self.feature_group_mixer = FeatureGroupMixer(group_strategies)

        # --- matrix planning and building ---
        self.planner = Planner(
            feature_start_time=dt_from_str(split_config["feature_start_time"]),
            label_names=[label_config.get("name", DEFAULT_LABEL_NAME)],
            label_types=["binary"],
            cohort_names=[cohort_config.get("name")],
            user_metadata=config.get("user_metadata", {}),
        )

        matrix_db_config = {
            "features_schema_name": self.features_schema_name,
            "labels_schema_name": "public",
            "labels_table_name": self.labels_table_name,
            "cohort_table_name": self.cohort_table_name,
        }
        self.matrix_builder = MatrixBuilder(
            db_config=matrix_db_config,
            matrix_storage_engine=self.matrix_storage_engine,
            experiment_hash=self.experiment_hash,
            include_missing_labels_in_train_as=label_config.get(
                "include_missing_labels_in_train_as"),
            engine=self.db_engine,
            replace=self.replace,
            run_id=self.run_id,
        )

        self.subsetter = Subsetter(
            db_engine=self.db_engine,
            replace=self.replace,
            as_of_times=self.all_as_of_times,
        )

        # --- training, prediction and evaluation ---
        self.trainer = ModelTrainer(
            experiment_hash=self.experiment_hash,
            model_storage_engine=self.model_storage_engine,
            model_grouper=ModelGrouper(config.get("model_group_keys", [])),
            db_engine=self.db_engine,
            replace=self.replace,
            run_id=self.run_id,
        )

        prediction_config = config.get("prediction", {})
        self.predictor = Predictor(
            db_engine=self.db_engine,
            model_storage_engine=self.model_storage_engine,
            save_predictions=self.save_predictions,
            replace=self.replace,
            rank_order=prediction_config.get("rank_tiebreaker", "worst"),
        )

        importance_config = config.get("individual_importance", {})
        self.individual_importance_calculator = IndividualImportanceCalculator(
            db_engine=self.db_engine,
            n_ranks=importance_config.get("n_ranks", 5),
            methods=importance_config.get("methods", ["uniform"]),
            replace=self.replace,
        )

        self.evaluator = ModelEvaluator(
            db_engine=self.db_engine,
            testing_metric_groups=scoring_config.get(
                "testing_metric_groups", []),
            training_metric_groups=scoring_config.get(
                "training_metric_groups", []),
            bias_config=bias_config,
        )

        importance_calculator = self.individual_importance_calculator
        self.model_train_tester = ModelTrainTester(
            matrix_storage_engine=self.matrix_storage_engine,
            model_evaluator=self.evaluator,
            model_trainer=self.trainer,
            individual_importance_calculator=importance_calculator,
            predictor=self.predictor,
            subsets=self.subsets,
            protected_groups_generator=self.protected_groups_generator,
            cohort_hash=self.cohort_hash,
        )
Esempio n. 3
0
    def initialize_components(self):
        """Build the pipeline components from ``self.config`` and attach them to ``self``.

        The state table generator is chosen from the first recognised key in
        ``cohort_config`` (``query``, ``entities_table``, ``dense_states``);
        with none of them, a no-op generator is used and a warning is logged.
        A missing ``label_config`` likewise yields a no-op label generator
        and a warning.
        """
        config = self.config
        split_config = config['temporal_config']
        cohort_config = config.get('cohort_config', {})
        label_config = config.get('label_config', {})

        self.chopper = Timechop(**split_config)

        # pick the state table generator matching the cohort configuration
        if 'query' in cohort_config:
            chosen_generator = StateTableGeneratorFromQuery(
                experiment_hash=self.experiment_hash,
                db_engine=self.db_engine,
                query=cohort_config['query'],
            )
        elif 'entities_table' in cohort_config:
            chosen_generator = StateTableGeneratorFromEntities(
                experiment_hash=self.experiment_hash,
                db_engine=self.db_engine,
                entities_table=cohort_config['entities_table'],
            )
        elif 'dense_states' in cohort_config:
            dense_table = cohort_config['dense_states']['table_name']
            chosen_generator = StateTableGeneratorFromDense(
                experiment_hash=self.experiment_hash,
                db_engine=self.db_engine,
                dense_state_table=dense_table,
            )
        else:
            logging.warning(
                'cohort_config missing or unrecognized. Without a cohort, '
                'you will not be able to make matrices or perform feature imputation.'
            )
            chosen_generator = StateTableGeneratorNoOp()
        self.state_table_generator = chosen_generator

        # labels
        if 'label_config' in config:
            self.label_generator = LabelGenerator(
                label_name=label_config.get('name'),
                query=label_config['query'],
                db_engine=self.db_engine,
            )
        else:
            self.label_generator = LabelGeneratorNoOp()
            logging.warning(
                'label_config missing or unrecognized. Without labels, '
                'you will not be able to make matrices.'
            )

        # features
        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name=self.features_schema_name,
            db_engine=self.db_engine,
        )

        self.feature_generator = FeatureGenerator(
            features_schema_name=self.features_schema_name,
            replace=self.replace,
            db_engine=self.db_engine,
            feature_start_time=split_config['feature_start_time'],
        )

        group_definition = config.get('feature_group_definition',
                                      {'all': [True]})
        self.feature_group_creator = FeatureGroupCreator(group_definition)

        group_strategies = config.get('feature_group_strategies', ['all'])
        self.feature_group_mixer = FeatureGroupMixer(group_strategies)

        # matrix planning and building
        dense_states_config = cohort_config.get('dense_states', {})
        self.planner = Planner(
            feature_start_time=dt_from_str(split_config['feature_start_time']),
            label_names=[label_config.get('name', DEFAULT_LABEL_NAME)],
            label_types=['binary'],
            cohort_name=cohort_config.get('name'),
            states=dense_states_config.get('state_filters', []),
            user_metadata=config.get('user_metadata', {}),
        )

        matrix_db_config = {
            'features_schema_name': self.features_schema_name,
            'labels_schema_name': 'public',
            'labels_table_name': self.labels_table_name,
            # TODO: have planner/builder take state table later on, so we
            # can grab it from the StateTableGenerator instead of
            # duplicating it here
            'sparse_state_table_name': self.sparse_states_table_name,
        }
        self.matrix_builder = MatrixBuilder(
            db_config=matrix_db_config,
            matrix_storage_engine=self.matrix_storage_engine,
            include_missing_labels_in_train_as=label_config.get(
                'include_missing_labels_in_train_as'),
            engine=self.db_engine,
            replace=self.replace,
        )

        # training and testing
        self.trainer = ModelTrainer(
            experiment_hash=self.experiment_hash,
            model_storage_engine=self.model_storage_engine,
            model_grouper=ModelGrouper(config.get('model_group_keys', [])),
            db_engine=self.db_engine,
            replace=self.replace,
        )

        self.tester = ModelTester(
            model_storage_engine=self.model_storage_engine,
            matrix_storage_engine=self.matrix_storage_engine,
            replace=self.replace,
            db_engine=self.db_engine,
            individual_importance_config=config.get(
                'individual_importance', {}),
            evaluator_config=config.get('scoring', {}),
        )
Esempio n. 4
0
def basic_integration_test(
    cohort_names,
    feature_group_create_rules,
    feature_group_mix_rules,
    expected_matrix_multiplier,
    expected_group_lists,
):
    """Build matrices end to end on a temporary database and verify the output.

    Spins up a throwaway PostgreSQL instance, populates source data,
    generates the cohort (entity-date) table, labels and feature tables,
    plans and builds the matrices, then checks the files in the matrix
    directory, the rows of ``triage_metadata.matrices`` and the feature
    groups recorded in each metadata file.

    Args:
        cohort_names: passed to ``Planner`` as ``cohort_names``.
        feature_group_create_rules: passed to ``FeatureGroupCreator``.
        feature_group_mix_rules: passed to ``FeatureGroupMixer``.
        expected_matrix_multiplier: factor applied to the number of
            train/test matrices in the time splits to obtain the expected
            count of matrix files and of metadata files.
        expected_group_lists: iterable of feature-group name lists; compared
            (as a set of tuples) with the ``feature_groups`` entries read
            from the metadata files.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        Base.metadata.create_all(db_engine)
        populate_source_data(db_engine)

        with TemporaryDirectory() as temp_dir:
            chopper = Timechop(
                feature_start_time=datetime(2010, 1, 1),
                feature_end_time=datetime(2014, 1, 1),
                label_start_time=datetime(2011, 1, 1),
                label_end_time=datetime(2014, 1, 1),
                model_update_frequency="1year",
                training_label_timespans=["6months"],
                test_label_timespans=["6months"],
                training_as_of_date_frequencies="1day",
                test_as_of_date_frequencies="3months",
                max_training_histories=["1months"],
                test_durations=["1months"],
            )

            entity_date_table_generator = EntityDateTableGenerator(
                db_engine=db_engine,
                entity_date_table_name="cohort_abcd",
                query="select distinct(entity_id) from events")

            label_generator = LabelGenerator(
                db_engine=db_engine,
                query=sample_config()["label_config"]["query"])

            feature_generator = FeatureGenerator(
                db_engine=db_engine,
                features_schema_name="features",
                replace=True)

            feature_dictionary_creator = FeatureDictionaryCreator(
                db_engine=db_engine, features_schema_name="features")

            feature_group_creator = FeatureGroupCreator(
                feature_group_create_rules)

            feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules)
            project_storage = ProjectStorage(temp_dir)
            planner = Planner(
                feature_start_time=datetime(2010, 1, 1),
                label_names=["outcome"],
                label_types=["binary"],
                cohort_names=cohort_names,
                user_metadata={},
            )

            builder = MatrixBuilder(
                engine=db_engine,
                db_config={
                    "features_schema_name": "features",
                    "labels_schema_name": "public",
                    "labels_table_name": "labels",
                    "cohort_table_name": "cohort_abcd",
                },
                experiment_hash=None,
                matrix_storage_engine=project_storage.matrix_storage_engine(),
                replace=True,
            )

            # chop time
            split_definitions = chopper.chop_time()
            # each split contributes one train matrix plus its test matrices
            num_split_matrices = sum(1 + len(split["test_matrices"])
                                     for split in split_definitions)

            # generate as_of_times for feature/label/state generation
            all_as_of_times = []
            for split in split_definitions:
                all_as_of_times.extend(split["train_matrix"]["as_of_times"])
                for test_matrix in split["test_matrices"]:
                    all_as_of_times.extend(test_matrix["as_of_times"])
            all_as_of_times = list(set(all_as_of_times))

            # generate entity_date state table
            entity_date_table_generator.generate_entity_date_table(
                as_of_dates=all_as_of_times)

            # create labels table
            label_generator.generate_all_labels(
                labels_table="labels",
                as_of_dates=all_as_of_times,
                label_timespans=["6months"],
            )

            # create feature table tasks
            # we would use FeatureGenerator#create_all_tables but want to use
            # the tasks dict directly to create a feature dict
            feature_aggregation_config = [
                {
                    "prefix": "cat",
                    "from_obj": "cat_complaints",
                    "knowledge_date_column": "as_of_date",
                    "aggregates": [{
                        "quantity": "cat_sightings",
                        "metrics": ["count", "avg"],
                        "imputation": {
                            "all": {
                                "type": "mean"
                            }
                        },
                    }],
                    "intervals": ["1y"],
                    "groups": ["entity_id"],
                },
                {
                    "prefix": "dog",
                    "from_obj": "dog_complaints",
                    "knowledge_date_column": "as_of_date",
                    "aggregates_imputation": {
                        "count": {
                            "type": "constant",
                            "value": 7
                        },
                        "sum": {
                            "type": "mean"
                        },
                        "avg": {
                            "type": "zero"
                        },
                    },
                    "aggregates": [{
                        "quantity": "dog_sightings",
                        "metrics": ["count", "avg"]
                    }],
                    "intervals": ["1y"],
                    "groups": ["entity_id"],
                },
            ]
            aggregations = feature_generator.aggregations(
                feature_aggregation_config=feature_aggregation_config,
                feature_dates=all_as_of_times,
                state_table=entity_date_table_generator.entity_date_table_name,
            )
            feature_table_agg_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type="aggregation")

            # create feature aggregation tables
            feature_generator.process_table_tasks(feature_table_agg_tasks)

            feature_table_imp_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type="imputation")

            # create feature imputation tables
            feature_generator.process_table_tasks(feature_table_imp_tasks)

            # build feature dictionaries from feature tables and
            # subsetting config
            master_feature_dict = feature_dictionary_creator.feature_dictionary(
                feature_table_names=feature_table_imp_tasks.keys(),
                index_column_lookup=feature_generator.index_column_lookup(
                    aggregations),
            )

            feature_dicts = feature_group_mixer.generate(
                feature_group_creator.subsets(master_feature_dict))

            # figure out what matrices need to be built
            _, matrix_build_tasks = planner.generate_plans(
                split_definitions, feature_dicts)

            # go and build the matrices
            builder.build_all_matrices(matrix_build_tasks)

            # super basic assertion: did matrices we expect get created?
            matrices_records = list(
                db_engine.execute(
                    """select matrix_uuid, num_observations, matrix_type
                    from triage_metadata.matrices
                    """))
            matrix_directory = os.path.join(temp_dir, "matrices")
            # list the directory once and classify the entries by name
            written_files = os.listdir(matrix_directory)
            matrices = [path for path in written_files if ".csv" in path]
            metadatas = [path for path in written_files if ".yaml" in path]
            expected_count = num_split_matrices * expected_matrix_multiplier
            assert len(matrices) == expected_count
            assert len(metadatas) == expected_count
            assert len(matrices) == len(matrices_records)

            # collect the feature groups recorded in every metadata file
            feature_group_name_lists = []
            for metadata_path in metadatas:
                with open(os.path.join(matrix_directory, metadata_path)) as f:
                    metadata = yaml.full_load(f)
                    feature_group_name_lists.append(metadata["feature_groups"])

            for matrix_uuid, num_observations, matrix_type in matrices_records:
                assert matrix_uuid in matrix_build_tasks  # the hashes of the matrices
                assert isinstance(num_observations, int)
                assert matrix_type == matrix_build_tasks[matrix_uuid][
                    "matrix_type"]

            def deep_unique_tuple(group_lists):
                """Return the distinct inner lists as a set of tuples."""
                return {tuple(group_list) for group_list in group_lists}

            assert deep_unique_tuple(
                feature_group_name_lists) == deep_unique_tuple(
                    expected_group_lists)
Esempio n. 5
0
    def initialize_components(self):
        """Instantiate the pipeline components described by ``self.config``.

        Each component is stored as an attribute on ``self``: ``chopper``,
        ``cohort_table_generator``, ``label_generator``,
        ``feature_dictionary_creator``, ``feature_generator``,
        ``feature_group_creator``, ``feature_group_mixer``, ``planner``,
        ``matrix_builder``, ``trainer`` and ``tester``.

        ``temporal_config`` must be present in the config. When the cohort
        config has no ``query`` entry, or ``label_config`` is absent, a
        warning is logged and the corresponding no-op generator is used.

        Raises:
            KeyError: if ``temporal_config`` or its ``feature_start_time``
                entry is missing, or if a ``label_config`` is given without
                a ``query``.
        """
        temporal_config = self.config["temporal_config"]

        # The whole temporal config is forwarded as keyword arguments.
        self.chopper = Timechop(**temporal_config)

        cohort_config = self.config.get("cohort_config", {})
        if "query" not in cohort_config:
            logging.warning(
                "cohort_config missing or unrecognized. Without a cohort, "
                "you will not be able to make matrices or perform feature imputation."
            )
            self.cohort_table_generator = CohortTableGeneratorNoOp()
        else:
            self.cohort_table_generator = CohortTableGenerator(
                cohort_table_name=self.cohort_table_name,
                db_engine=self.db_engine,
                query=cohort_config["query"],
                replace=self.replace,
            )

        # Empty dict when no label config was supplied, so the optional
        # lookups further down fall back to their defaults.
        label_config = self.config.get("label_config", {})
        if "label_config" not in self.config:
            self.label_generator = LabelGeneratorNoOp()
            logging.warning(
                "label_config missing or unrecognized. Without labels, "
                "you will not be able to make matrices."
            )
        else:
            self.label_generator = LabelGenerator(
                label_name=label_config.get("name"),
                query=label_config["query"],
                replace=self.replace,
                db_engine=self.db_engine,
            )

        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name=self.features_schema_name,
            db_engine=self.db_engine,
        )

        # The feature generator receives the raw config value, while the
        # planner below receives it converted by dt_from_str.
        raw_feature_start_time = temporal_config["feature_start_time"]
        self.feature_generator = FeatureGenerator(
            features_schema_name=self.features_schema_name,
            replace=self.replace,
            db_engine=self.db_engine,
            feature_start_time=raw_feature_start_time,
            materialize_subquery_fromobjs=self.materialize_subquery_fromobjs,
        )

        group_definition = self.config.get("feature_group_definition", {"all": [True]})
        self.feature_group_creator = FeatureGroupCreator(group_definition)

        group_strategies = self.config.get("feature_group_strategies", ["all"])
        self.feature_group_mixer = FeatureGroupMixer(group_strategies)

        self.planner = Planner(
            feature_start_time=dt_from_str(raw_feature_start_time),
            label_names=[label_config.get("name", DEFAULT_LABEL_NAME)],
            label_types=["binary"],
            cohort_names=[cohort_config.get("name")],
            user_metadata=self.config.get("user_metadata", {}),
        )

        matrix_db_config = {
            "features_schema_name": self.features_schema_name,
            "labels_schema_name": "public",
            "labels_table_name": self.labels_table_name,
            "cohort_table_name": self.cohort_table_name,
        }
        self.matrix_builder = MatrixBuilder(
            db_config=matrix_db_config,
            matrix_storage_engine=self.matrix_storage_engine,
            experiment_hash=self.experiment_hash,
            include_missing_labels_in_train_as=label_config.get(
                "include_missing_labels_in_train_as"
            ),
            engine=self.db_engine,
            replace=self.replace,
        )

        model_group_keys = self.config.get("model_group_keys", [])
        self.trainer = ModelTrainer(
            experiment_hash=self.experiment_hash,
            model_storage_engine=self.model_storage_engine,
            model_grouper=ModelGrouper(model_group_keys),
            db_engine=self.db_engine,
            replace=self.replace,
        )

        self.tester = ModelTester(
            model_storage_engine=self.model_storage_engine,
            matrix_storage_engine=self.matrix_storage_engine,
            replace=self.replace,
            db_engine=self.db_engine,
            individual_importance_config=self.config.get("individual_importance", {}),
            evaluator_config=self.config.get("scoring", {}),
        )
Esempio n. 6
0
    def initialize_components(self):
        """Instantiate the pipeline components described by ``self.config``.

        The components are stored on ``self`` as ``chopper``,
        ``state_table_generator``, ``label_generator``,
        ``feature_dictionary_creator``, ``feature_generator``,
        ``feature_group_creator``, ``feature_group_mixer``, ``planner``,
        ``matrix_builder``, ``trainer`` and ``tester``.

        The cohort config selects the state table generator through the
        first key found among ``query``, ``entities_table`` and
        ``dense_states``.

        Raises:
            ValueError: if the cohort config contains none of the keys above.
            KeyError: if ``temporal_config``, one of its entries read here,
                or ``label_config`` (including its ``query``) is missing.
        """
        split_config = self.config['temporal_config']

        # The four time boundaries are converted with dt_from_str; the other
        # temporal settings are handed to Timechop unchanged.
        converted_keys = (
            'feature_start_time',
            'feature_end_time',
            'label_start_time',
            'label_end_time',
        )
        passthrough_keys = (
            'model_update_frequency',
            'training_label_timespans',
            'test_label_timespans',
            'training_as_of_date_frequencies',
            'test_as_of_date_frequencies',
            'max_training_histories',
            'test_durations',
        )
        chopper_kwargs = {
            key: dt_from_str(split_config[key]) for key in converted_keys
        }
        chopper_kwargs.update(
            (key, split_config[key]) for key in passthrough_keys)
        self.chopper = Timechop(**chopper_kwargs)

        cohort_config = self.config.get('cohort_config', {})
        if 'query' in cohort_config:
            generator_class = StateTableGeneratorFromQuery
            source_kwargs = {'query': cohort_config['query']}
        elif 'entities_table' in cohort_config:
            generator_class = StateTableGeneratorFromEntities
            source_kwargs = {'entities_table': cohort_config['entities_table']}
        elif 'dense_states' in cohort_config:
            generator_class = StateTableGeneratorFromDense
            source_kwargs = {
                'dense_state_table': cohort_config['dense_states']['table_name']
            }
        else:
            raise ValueError('Cohort config missing or unrecognized')
        self.state_table_generator = generator_class(
            experiment_hash=self.experiment_hash,
            db_engine=self.db_engine,
            **source_kwargs)

        # label_config is mandatory here: indexing raises KeyError if absent.
        label_config = self.config['label_config']
        self.label_generator = LabelGenerator(
            label_name=label_config.get('name'),
            query=label_config['query'],
            db_engine=self.db_engine,
        )

        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name=self.features_schema_name,
            db_engine=self.db_engine,
        )

        # The feature generator gets the raw config value; the planner below
        # gets the same value converted by dt_from_str.
        self.feature_generator = FeatureGenerator(
            features_schema_name=self.features_schema_name,
            replace=self.replace,
            db_engine=self.db_engine,
            feature_start_time=split_config['feature_start_time'])

        group_definition = self.config.get('feature_group_definition',
                                           {'all': [True]})
        self.feature_group_creator = FeatureGroupCreator(group_definition)

        group_strategies = self.config.get('feature_group_strategies', ['all'])
        self.feature_group_mixer = FeatureGroupMixer(group_strategies)

        dense_states_config = cohort_config.get('dense_states', {})
        self.planner = Planner(
            feature_start_time=dt_from_str(split_config['feature_start_time']),
            label_names=[label_config.get('name', DEFAULT_LABEL_NAME)],
            label_types=['binary'],
            matrix_directory=self.matrices_directory,
            cohort_name=cohort_config.get('name'),
            states=dense_states_config.get('state_filters', []),
            user_metadata=self.config.get('user_metadata', {}),
        )

        builder_db_config = {
            'features_schema_name': self.features_schema_name,
            'labels_schema_name': 'public',
            'labels_table_name': self.labels_table_name,
            # TODO: have planner/builder take state table later on, so we
            # can grab it from the StateTableGenerator instead of
            # duplicating it here
            'sparse_state_table_name':
            'tmp_sparse_states_{}'.format(self.experiment_hash),
        }
        self.matrix_builder = HighMemoryCSVBuilder(
            db_config=builder_db_config,
            matrix_directory=self.matrices_directory,
            include_missing_labels_in_train_as=label_config.get(
                'include_missing_labels_in_train_as'),
            engine=self.db_engine,
            replace=self.replace)

        model_group_keys = self.config.get('model_group_keys', [])
        self.trainer = ModelTrainer(
            project_path=self.project_path,
            experiment_hash=self.experiment_hash,
            model_storage_engine=self.model_storage_engine,
            model_grouper=ModelGrouper(model_group_keys),
            db_engine=self.db_engine,
            replace=self.replace)

        importance_config = self.config.get('individual_importance', {})
        scoring_config = self.config.get('scoring', {})
        self.tester = ModelTester(
            model_storage_engine=self.model_storage_engine,
            project_path=self.project_path,
            replace=self.replace,
            db_engine=self.db_engine,
            individual_importance_config=importance_config,
            evaluator_config=scoring_config)
def test_feature_dictionary_creator():
    """Check the feature dictionary built from a small ``features`` schema.

    A temporary Postgres instance is populated with several feature tables
    plus one unrelated table. The dictionary returned for the requested
    tables is expected to contain only the imputed aggregation table, mapped
    to its columns other than those listed in ``index_column_lookup``.
    """
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())

        # Schema first, then the tables, executed in this order.
        setup_statements = [
            "create schema features",
            """
            create table features.prefix1_entity_id (
                entity_id int,
                as_of_date date,
                feature_one float,
                feature_two float
            )
        """,
            """
            create table features.prefix1_zipcode (
                zipcode text,
                as_of_date date,
                feature_three float,
                feature_four float
            )
        """,
            """
            create table features.prefix1_aggregation (
                entity_id int,
                as_of_date date,
                zipcode text,
                feature_one float,
                feature_two float,
                feature_three float,
                feature_four float
            )
        """,
            """
            create table features.prefix1_aggregation_imputed (
                entity_id int,
                as_of_date date,
                zipcode text,
                feature_one float,
                feature_two float,
                feature_three float,
                feature_three_imp int,
                feature_four float
            )
        """,
            """
            create table features.random_other_table (
                another_column float
            )
        """,
        ]
        for statement in setup_statements:
            engine.execute(statement)

        dictionary_creator = FeatureDictionaryCreator(
            features_schema_name="features", db_engine=engine
        )

        # NOTE(review): "prefix1_zip_code" does not match the table created
        # above ("prefix1_zipcode"); presumably harmless because only the
        # imputed table is expected in the result -- confirm it is intended.
        requested_tables = [
            "prefix1_entity_id",
            "prefix1_zip_code",
            "prefix1_aggregation",
            "prefix1_aggregation_imputed",
        ]
        index_columns = {
            "prefix1_aggregation_imputed": ["entity_id", "zipcode", "as_of_date"]
        }
        actual = dictionary_creator.feature_dictionary(
            feature_table_names=requested_tables,
            index_column_lookup=index_columns,
        )

        expected = {
            "prefix1_aggregation_imputed": [
                "feature_one",
                "feature_two",
                "feature_three",
                "feature_three_imp",
                "feature_four",
            ]
        }
        assert actual == expected
Esempio n. 8
0
    def __init__(self, db_engine, project_path, model_group_id):
        """Set up the components needed to work with one model group.

        Args:
            db_engine: database engine; passed to ``upgrade_db`` and to every
                database-backed helper and component created here.
            project_path: passed to ``ProjectStorage`` to obtain the matrix
                and model storage engines.
            model_group_id: id of the model group whose info and experiment
                config are loaded.

        Raises:
            KeyError: if the loaded experiment config lacks one of the
                entries indexed directly below (for example
                ``label_config['name']``, ``cohort_config['name']`` or
                ``user_metadata``).
        """
        self.retrain_hash = None
        self.db_engine = db_engine
        upgrade_db(db_engine=self.db_engine)
        self.project_storage = ProjectStorage(project_path)
        self.model_group_id = model_group_id
        self.model_group_info = get_model_group_info(
            self.db_engine, self.model_group_id)
        self.matrix_storage_engine = (
            self.project_storage.matrix_storage_engine())
        run_id_and_config = experiment_config_from_model_group_id(
            self.db_engine, self.model_group_id)
        self.triage_run_id, self.experiment_config = run_id_and_config

        # This feels like it needs some refactoring: in some edge cases the
        # test matrix temporal parameters might differ across models in the
        # model group (the training ones shouldn't), but this should work
        # for the vast majority of use cases.
        temporal_config = self.experiment_config['temporal_config']
        last_split_params = temporal_params_from_matrix_metadata(
            self.db_engine, self.model_group_info['model_id_last_split'])
        temporal_config.update(last_split_params)

        # Since "testing" here is predicting forward to a single new date,
        # test_durations is always ['0day'], whatever it was before.
        temporal_config['test_durations'] = ['0day']

        # These lists should now only contain one item (the value actually
        # used for the last model in this group).
        self.training_label_timespan = (
            temporal_config['training_label_timespans'][0])
        self.test_label_timespan = temporal_config['test_label_timespans'][0]
        self.test_duration = temporal_config['test_durations'][0]
        self.feature_start_time = temporal_config['feature_start_time']

        label_config = self.experiment_config['label_config']
        self.label_name = label_config['name']
        self.cohort_name = self.experiment_config['cohort_config']['name']
        self.user_metadata = self.experiment_config['user_metadata']

        production_schema = 'triage_production'
        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name=production_schema, db_engine=self.db_engine)

        label_query = label_config["query"]
        self.label_generator = LabelGenerator(
            label_name=label_config.get("name"),
            query=label_query,
            replace=True,
            db_engine=self.db_engine,
        )

        # Table name is built from the label name (or 'default') and a hash
        # of the label query.
        self.labels_table_name = "labels_{}_{}_production".format(
            label_config.get('name', 'default'),
            filename_friendly_hash(label_query))

        self.feature_generator = FeatureGenerator(
            db_engine=self.db_engine,
            features_schema_name="triage_production",
            feature_start_time=self.feature_start_time,
        )

        model_storage_engine = ModelStorageEngine(self.project_storage)
        self.model_trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            db_engine=self.db_engine,
            replace=True,
            run_id=self.triage_run_id,
        )
def test_feature_dictionary_creator():
    """Check the feature dictionary built from a small ``features`` schema.

    A temporary Postgres instance is populated with several feature tables
    plus one unrelated table. The dictionary returned for the requested
    tables is expected to contain only the imputed aggregation table, mapped
    to its columns other than those listed in ``index_column_lookup``.
    """
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())

        # Schema first, then the tables, executed in this order.
        setup_statements = (
            'create schema features',
            '''
            create table features.prefix1_entity_id (
                entity_id int,
                as_of_date date,
                feature_one float,
                feature_two float
            )
        ''',
            '''
            create table features.prefix1_zipcode (
                zipcode text,
                as_of_date date,
                feature_three float,
                feature_four float
            )
        ''',
            '''
            create table features.prefix1_aggregation (
                entity_id int,
                as_of_date date,
                zipcode text,
                feature_one float,
                feature_two float,
                feature_three float,
                feature_four float
            )
        ''',
            '''
            create table features.prefix1_aggregation_imputed (
                entity_id int,
                as_of_date date,
                zipcode text,
                feature_one float,
                feature_two float,
                feature_three float,
                feature_three_imp int,
                feature_four float
            )
        ''',
            '''
            create table features.random_other_table (
                another_column float
            )
        ''',
        )
        for statement in setup_statements:
            engine.execute(statement)

        dictionary_creator = FeatureDictionaryCreator(
            features_schema_name='features', db_engine=engine)

        # NOTE(review): 'prefix1_zip_code' does not match the table created
        # above ('prefix1_zipcode'); presumably harmless because only the
        # imputed table is expected in the result -- confirm it is intended.
        requested_tables = [
            'prefix1_entity_id',
            'prefix1_zip_code',
            'prefix1_aggregation',
            'prefix1_aggregation_imputed',
        ]
        index_columns = {
            'prefix1_aggregation_imputed':
            ['entity_id', 'zipcode', 'as_of_date'],
        }
        actual = dictionary_creator.feature_dictionary(
            feature_table_names=requested_tables,
            index_column_lookup=index_columns)

        expected = {
            'prefix1_aggregation_imputed': [
                'feature_one',
                'feature_two',
                'feature_three',
                'feature_three_imp',
                'feature_four',
            ],
        }
        assert actual == expected