def test_empty_output():
    """Generating a cohort that ends up empty raises instead of passing silently.

    The events table is created with no rows, so the cohort query cannot
    select any entity. The generator is expected to raise ``ValueError``
    right away rather than let later steps run against an empty cohort.
    """
    cohort_query = (
        "select entity_id from events where outcome_date < '{as_of_date}'::date"
    )
    requested_dates = [datetime(2015, 12, 31)]

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        # No events at all: nothing can qualify for the cohort.
        utils.create_binary_outcome_events(engine, "events", [])
        generator = EntityDateTableGenerator(
            entity_date_table_name="exp_hash_cohort",
            db_engine=engine,
            query=cohort_query,
        )

        with pytest.raises(ValueError):
            generator.generate_entity_date_table(requested_dates)

        # The count query must still succeed, and report zero rows.
        count_row = engine.execute(f"""\
            select count(*) from {generator.entity_date_table_name}
        """).first()
        (cohort_count, ) = count_row

        assert cohort_count == 0

        engine.dispose()
Example #2
0
 def generate_entity_date_table(self, as_of_date, entity_date_table_name):
     """Populate a cohort table for one as-of-date.

     The cohort query comes from
     ``self.experiment_config['cohort_config']['query']``.

     Args:
         as_of_date: value converted with ``dt_from_str`` before being
             handed to the generator
         entity_date_table_name: name of the table the generator writes to
     """
     generator_options = {
         "db_engine": self.db_engine,
         "query": self.experiment_config['cohort_config']['query'],
         "entity_date_table_name": entity_date_table_name,
     }
     generator = EntityDateTableGenerator(**generator_options)
     # The generator takes a list of dates, so wrap the single date.
     requested_dates = [dt_from_str(as_of_date)]
     generator.generate_entity_date_table(as_of_dates=requested_dates)
def test_entity_date_table_generator_from_labels():
    """Without a query, the entity-date table is derived from the labels table.

    The label rows repeat (entity_id, as_of_date) pairs across label values
    and label timespans; the generated table is expected to list each
    distinct pair exactly once.
    """
    # (entity_id, as_of_date, label) observations per label timespan in days.
    observations_by_timespan = (
        (180, [
            (1, datetime(2016, 1, 1), 0),
            (1, datetime(2016, 4, 1), 1),
            (1, datetime(2016, 3, 1), 0),
            (2, datetime(2016, 1, 1), 0),
            (2, datetime(2016, 1, 1), 1),
            (3, datetime(2016, 1, 1), 0),
            (5, datetime(2016, 3, 1), 0),
            (5, datetime(2016, 4, 1), 1),
        ]),
        (90, [
            (1, datetime(2016, 1, 1), 0),
            (1, datetime(2016, 4, 1), 0),
            (1, datetime(2016, 3, 1), 1),
            (2, datetime(2016, 1, 1), 0),
            (2, datetime(2016, 1, 1), 1),
            (3, datetime(2016, 1, 1), 0),
            (5, datetime(2016, 3, 1), 0),
            (5, datetime(2016, 4, 1), 0),
        ]),
    )
    labels_data = [
        (entity_id, as_of_date, timedelta(days), 'outcome', 'binary', label)
        for days, observations in observations_by_timespan
        for entity_id, as_of_date, label in observations
    ]
    expected_output = [
        (1, datetime(2016, 1, 1)),
        (1, datetime(2016, 3, 1)),
        (1, datetime(2016, 4, 1)),
        (2, datetime(2016, 1, 1)),
        (3, datetime(2016, 1, 1)),
        (5, datetime(2016, 3, 1)),
        (5, datetime(2016, 4, 1)),
    ]

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        source_labels_table = utils.create_labels(engine, labels_data)
        generator = EntityDateTableGenerator(
            query=None,
            labels_table_name=source_labels_table,
            db_engine=engine,
            entity_date_table_name="exp_hash_entity_date",
            replace=False,
        )
        # No as-of-dates are passed: the rows come from the labels table.
        generator.generate_entity_date_table([])

        rows = engine.execute(f"""
                select entity_id, as_of_date from {generator.entity_date_table_name}
                order by entity_id, as_of_date
            """)
        results = list(rows)
        assert results == expected_output
Example #4
0
 def generate_tasks(self, subset_configs):
     """Create one subset-table task per non-empty subset configuration.

     Args:
         subset_configs: iterable of subset configurations; falsy entries
             (for example ``None``) are skipped

     Returns:
         list of dicts with the keys ``subset_config``, ``subset_hash`` and
         ``subset_table_generator``
     """
     logging.info("Generating subset table creation tasks")
     tasks = []
     for config in subset_configs:
         if not config:
             # Nothing to build for an empty/None subset definition.
             continue
         config_hash = filename_friendly_hash(config)
         generator = EntityDateTableGenerator(
             entity_date_table_name=get_subset_table_name(config),
             db_engine=self.db_engine,
             query=config["query"],
             replace=self.replace,
         )
         task = {
             "subset_config": config,
             "subset_hash": config_hash,
             "subset_table_generator": generator,
         }
         tasks.append(task)
     return tasks
Example #5
0
    def __call__(self, args):
        """Create the test cohort (if configured) and pre-imputation features.

        Reads the feature configuration from ``args.feature_config_file`` and
        works on the single date ``args.as_of_date``. The cohort table is
        only (re)built when the config has a truthy ``cohort_config``.
        """
        # Load configuration (if one exists) before opening the database.
        self.root.setup()
        db_engine = create_engine(self.root.db_url)

        full_config = yaml.full_load(args.feature_config_file)
        feature_config = full_config['feature_aggregations']
        cohort_config = full_config.get('cohort_config')

        if cohort_config:
            cohort_generator = EntityDateTableGenerator(
                entity_date_table_name="features_test.test_cohort",
                db_engine=db_engine,
                query=cohort_config["query"],
                replace=True,
            )
            cohort_generator.generate_entity_date_table(
                as_of_dates=[args.as_of_date])

        feature_generator = FeatureGenerator(db_engine, "features_test")
        feature_generator.create_features_before_imputation(
            feature_aggregation_config=feature_config,
            feature_dates=[args.as_of_date],
            state_table="features_test.test_cohort",
        )
        logger.success(
            f"Features created for feature_config {feature_config} and date {args.as_of_date}"
        )
Example #6
0
    def initialize_components(self):
        """Build the pipeline components from ``self.config`` and store them on ``self``.

        Components are created in a fixed order: time chopper, cohort
        generator, label generator, protected-groups generator, feature
        tooling, planner, matrix builder, subsetter, trainer, predictor,
        individual-importance calculator, evaluator, and finally the
        train/test orchestrator that ties several of them together.

        For the cohort, label and bias-audit sections, a missing config
        section results in a ``*NoOp`` generator plus a logged warning
        instead of an error.

        Raises:
            KeyError: if ``self.config`` has no ``"temporal_config"`` entry,
                or that entry has no ``"feature_start_time"``.
        """
        split_config = self.config["temporal_config"]

        self.chopper = Timechop(**split_config)

        # Cohort: a real generator only when a cohort query is configured.
        cohort_config = self.config.get("cohort_config", {})
        if "query" in cohort_config:
            self.cohort_table_name = "cohort_{}_{}".format(
                cohort_config.get('name', 'default'), self.cohort_hash)
            self.cohort_table_generator = EntityDateTableGenerator(
                entity_date_table_name=self.cohort_table_name,
                db_engine=self.db_engine,
                query=cohort_config["query"],
                replace=self.replace)
        else:
            logging.warning(
                "cohort_config missing or unrecognized. Without a cohort, "
                "you will not be able to make matrices, perform feature imputation, "
                "or save time by only computing features for that cohort.")
            # Without a cohort query, features_ignore_cohort is forced on,
            # overriding whatever value was set on the instance before.
            self.features_ignore_cohort = True
            self.cohort_table_name = "cohort_{}".format(self.experiment_hash)
            self.cohort_table_generator = EntityDateTableGeneratorNoOp()

        # The leading None entry stands for "no subset"; configured subsets follow.
        self.subsets = [None] + self.config.get("scoring", {}).get(
            "subsets", [])

        # Labels: the table name embeds a hash of the label query.
        if "label_config" in self.config:
            label_config = self.config["label_config"]
            self.labels_table_name = "labels_{}_{}".format(
                label_config.get('name', 'default'),
                filename_friendly_hash(label_config['query']))
            self.label_generator = LabelGenerator(
                label_name=label_config.get("name", None),
                query=label_config["query"],
                replace=self.replace,
                db_engine=self.db_engine,
            )
        else:
            self.labels_table_name = "labels_{}".format(self.experiment_hash)
            self.label_generator = LabelGeneratorNoOp()
            logging.warning(
                "label_config missing or unrecognized. Without labels, "
                "you will not be able to make matrices.")

        # Bias audit / protected groups.
        # NOTE(review): bias_hash and protected_groups_table_name are only
        # assigned when bias_audit_config is present; confirm nothing reads
        # them in the no-op case.
        if "bias_audit_config" in self.config:
            bias_config = self.config["bias_audit_config"]
            self.bias_hash = filename_friendly_hash(bias_config)
            self.protected_groups_table_name = f"protected_groups_{self.bias_hash}"
            self.protected_groups_generator = ProtectedGroupsGenerator(
                db_engine=self.db_engine,
                from_obj=parse_from_obj(bias_config, 'bias_from_obj'),
                attribute_columns=bias_config.get("attribute_columns", None),
                entity_id_column=bias_config.get("entity_id_column", None),
                knowledge_date_column=bias_config.get("knowledge_date_column",
                                                      None),
                protected_groups_table_name=self.protected_groups_table_name,
                replace=self.replace)
        else:
            self.protected_groups_generator = ProtectedGroupsGeneratorNoOp()
            logging.warning(
                "bias_audit_config missing or unrecognized. Without protected groups, "
                "you will not audit your models for bias and fairness.")

        # Feature tooling.
        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name=self.features_schema_name,
            db_engine=self.db_engine)

        # Must come after the cohort section above, which may have
        # switched features_ignore_cohort on.
        self.feature_generator = FeatureGenerator(
            features_schema_name=self.features_schema_name,
            replace=self.replace,
            db_engine=self.db_engine,
            feature_start_time=split_config["feature_start_time"],
            materialize_subquery_fromobjs=self.materialize_subquery_fromobjs,
            features_ignore_cohort=self.features_ignore_cohort)

        self.feature_group_creator = FeatureGroupCreator(
            self.config.get("feature_group_definition", {"all": [True]}))

        self.feature_group_mixer = FeatureGroupMixer(
            self.config.get("feature_group_strategies", ["all"]))

        # Matrix planning and building.
        self.planner = Planner(
            feature_start_time=dt_from_str(split_config["feature_start_time"]),
            label_names=[
                self.config.get("label_config",
                                {}).get("name", DEFAULT_LABEL_NAME)
            ],
            label_types=["binary"],
            cohort_names=[
                self.config.get("cohort_config", {}).get("name", None)
            ],
            user_metadata=self.config.get("user_metadata", {}),
        )

        self.matrix_builder = MatrixBuilder(
            db_config={
                "features_schema_name": self.features_schema_name,
                "labels_schema_name": "public",
                "labels_table_name": self.labels_table_name,
                "cohort_table_name": self.cohort_table_name,
            },
            matrix_storage_engine=self.matrix_storage_engine,
            experiment_hash=self.experiment_hash,
            include_missing_labels_in_train_as=self.config.get(
                "label_config", {}).get("include_missing_labels_in_train_as",
                                        None),
            engine=self.db_engine,
            replace=self.replace,
            run_id=self.run_id,
        )

        self.subsetter = Subsetter(db_engine=self.db_engine,
                                   replace=self.replace,
                                   as_of_times=self.all_as_of_times)

        # Modeling: training, prediction, importances, evaluation.
        self.trainer = ModelTrainer(
            experiment_hash=self.experiment_hash,
            model_storage_engine=self.model_storage_engine,
            model_grouper=ModelGrouper(self.config.get("model_group_keys",
                                                       [])),
            db_engine=self.db_engine,
            replace=self.replace,
            run_id=self.run_id,
        )

        self.predictor = Predictor(
            db_engine=self.db_engine,
            model_storage_engine=self.model_storage_engine,
            save_predictions=self.save_predictions,
            replace=self.replace,
            rank_order=self.config.get("prediction",
                                       {}).get("rank_tiebreaker", "worst"),
        )

        self.individual_importance_calculator = IndividualImportanceCalculator(
            db_engine=self.db_engine,
            n_ranks=self.config.get("individual_importance",
                                    {}).get("n_ranks", 5),
            methods=self.config.get("individual_importance",
                                    {}).get("methods", ["uniform"]),
            replace=self.replace,
        )

        self.evaluator = ModelEvaluator(
            db_engine=self.db_engine,
            testing_metric_groups=self.config.get("scoring", {}).get(
                "testing_metric_groups", []),
            training_metric_groups=self.config.get("scoring", {}).get(
                "training_metric_groups", []),
            bias_config=self.config.get("bias_audit_config", {}))

        # Orchestrator wiring together the trainer, predictor, evaluator and
        # importance calculator created above.
        self.model_train_tester = ModelTrainTester(
            matrix_storage_engine=self.matrix_storage_engine,
            model_evaluator=self.evaluator,
            model_trainer=self.trainer,
            individual_importance_calculator=self.
            individual_importance_calculator,
            predictor=self.predictor,
            subsets=self.subsets,
            protected_groups_generator=self.protected_groups_generator,
            cohort_hash=self.cohort_hash)
def test_entity_date_table_generator_replace():
    """With ``replace=True`` the table can be regenerated without changing it.

    The cohort is generated for six as-of-dates, checked for contents and
    indexes, then generated again with the same dates. The table is re-read
    after the second run so the final assertion checks the regenerated
    contents rather than the rows fetched before it.
    """
    input_data = [
        (1, datetime(2016, 1, 1), True),
        (1, datetime(2016, 4, 1), False),
        (1, datetime(2016, 3, 1), True),
        (2, datetime(2016, 1, 1), False),
        (2, datetime(2016, 1, 1), True),
        (3, datetime(2016, 1, 1), True),
        (5, datetime(2016, 3, 1), True),
        (5, datetime(2016, 4, 1), True),
    ]
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        utils.create_binary_outcome_events(engine, "events", input_data)
        table_generator = EntityDateTableGenerator(
            query="select entity_id from events where outcome_date < '{as_of_date}'::date",
            db_engine=engine,
            entity_date_table_name="exp_hash_entity_date",
            replace=True)

        def fetch_rows():
            # Always query the database so each assertion sees the table's
            # current contents, not a previously fetched snapshot.
            return list(
                engine.execute(f"""
                    select entity_id, as_of_date, active from {table_generator.entity_date_table_name}
                    order by entity_id, as_of_date
                """))

        as_of_dates = [
            datetime(2016, 1, 1),
            datetime(2016, 2, 1),
            datetime(2016, 3, 1),
            datetime(2016, 4, 1),
            datetime(2016, 5, 1),
            datetime(2016, 6, 1),
        ]
        table_generator.generate_entity_date_table(as_of_dates)
        expected_output = [
            (1, datetime(2016, 2, 1), True),
            (1, datetime(2016, 3, 1), True),
            (1, datetime(2016, 4, 1), True),
            (1, datetime(2016, 5, 1), True),
            (1, datetime(2016, 6, 1), True),
            (2, datetime(2016, 2, 1), True),
            (2, datetime(2016, 3, 1), True),
            (2, datetime(2016, 4, 1), True),
            (2, datetime(2016, 5, 1), True),
            (2, datetime(2016, 6, 1), True),
            (3, datetime(2016, 2, 1), True),
            (3, datetime(2016, 3, 1), True),
            (3, datetime(2016, 4, 1), True),
            (3, datetime(2016, 5, 1), True),
            (3, datetime(2016, 6, 1), True),
            (5, datetime(2016, 4, 1), True),
            (5, datetime(2016, 5, 1), True),
            (5, datetime(2016, 6, 1), True),
        ]
        assert fetch_rows() == expected_output
        utils.assert_index(engine, table_generator.entity_date_table_name,
                           "entity_id")
        utils.assert_index(engine, table_generator.entity_date_table_name,
                           "as_of_date")

        # Generating again must leave exactly the same rows behind.
        table_generator.generate_entity_date_table(as_of_dates)
        assert fetch_rows() == expected_output
def test_entity_date_table_generator_noreplace():
    """With ``replace=False`` the table grows incrementally without duplicates.

    Three stages are checked, re-reading the table each time:

    1. generating a first set of as-of-dates,
    2. generating the same set again (no rows may be added), and
    3. generating an overlapping second set (only rows for the new dates
       may be added).
    """
    input_data = [
        (1, datetime(2016, 1, 1), True),
        (1, datetime(2016, 4, 1), False),
        (1, datetime(2016, 3, 1), True),
        (2, datetime(2016, 1, 1), False),
        (2, datetime(2016, 1, 1), True),
        (3, datetime(2016, 1, 1), True),
        (5, datetime(2016, 3, 1), True),
        (5, datetime(2016, 4, 1), True),
    ]
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        utils.create_binary_outcome_events(engine, "events", input_data)
        table_generator = EntityDateTableGenerator(
            query="select entity_id from events where outcome_date < '{as_of_date}'::date",
            db_engine=engine,
            entity_date_table_name="exp_hash_entity_date",
            replace=False)

        def fetch_rows():
            # Always query the database so each assertion sees the table's
            # current contents, not a previously fetched snapshot.
            return list(
                engine.execute(f"""
                    select entity_id, as_of_date, active from {table_generator.entity_date_table_name}
                    order by entity_id, as_of_date
                """))

        # 1. generate a cohort for a subset of as-of-dates
        as_of_dates = [
            datetime(2016, 1, 1),
            datetime(2016, 2, 1),
            datetime(2016, 3, 1),
        ]
        table_generator.generate_entity_date_table(as_of_dates)
        expected_output = [
            (1, datetime(2016, 2, 1), True),
            (1, datetime(2016, 3, 1), True),
            (2, datetime(2016, 2, 1), True),
            (2, datetime(2016, 3, 1), True),
            (3, datetime(2016, 2, 1), True),
            (3, datetime(2016, 3, 1), True),
        ]
        assert fetch_rows() == expected_output
        utils.assert_index(engine, table_generator.entity_date_table_name,
                           "entity_id")
        utils.assert_index(engine, table_generator.entity_date_table_name,
                           "as_of_date")

        # Running the same dates again must not insert anything new.
        table_generator.generate_entity_date_table(as_of_dates)
        assert fetch_rows() == expected_output

        # 2. generate a cohort for a different subset of as-of-dates,
        # actually including an overlap to make sure that it doesn't double-insert anything
        as_of_dates = [
            datetime(2016, 3, 1),
            datetime(2016, 4, 1),
            datetime(2016, 5, 1),
            datetime(2016, 6, 1),
        ]
        table_generator.generate_entity_date_table(as_of_dates)
        expected_output = [
            (1, datetime(2016, 2, 1), True),
            (1, datetime(2016, 3, 1), True),
            (1, datetime(2016, 4, 1), True),
            (1, datetime(2016, 5, 1), True),
            (1, datetime(2016, 6, 1), True),
            (2, datetime(2016, 2, 1), True),
            (2, datetime(2016, 3, 1), True),
            (2, datetime(2016, 4, 1), True),
            (2, datetime(2016, 5, 1), True),
            (2, datetime(2016, 6, 1), True),
            (3, datetime(2016, 2, 1), True),
            (3, datetime(2016, 3, 1), True),
            (3, datetime(2016, 4, 1), True),
            (3, datetime(2016, 5, 1), True),
            (3, datetime(2016, 6, 1), True),
            (5, datetime(2016, 4, 1), True),
            (5, datetime(2016, 5, 1), True),
            (5, datetime(2016, 6, 1), True),
        ]
        assert fetch_rows() == expected_output
Example #9
0
def basic_integration_test(
    cohort_names,
    feature_group_create_rules,
    feature_group_mix_rules,
    expected_matrix_multiplier,
    expected_group_lists,
):
    """Run cohort, label, feature and matrix generation end to end and check the output.

    Everything runs against a temporary Postgres instance and a temporary
    project directory. After the matrices are built, the test compares the
    files written under ``matrices/`` with the rows recorded in
    ``triage_metadata.matrices`` and with the planned build tasks.

    Args:
        cohort_names: passed to ``Planner`` as ``cohort_names``
        feature_group_create_rules: passed to ``FeatureGroupCreator``
        feature_group_mix_rules: passed to ``FeatureGroupMixer``
        expected_matrix_multiplier: expected ratio between the number of
            matrix (and metadata) files written and the number of train/test
            matrices defined by the time splits
        expected_group_lists: feature-group lists expected across the matrix
            metadata files; compared as a set, so order and repeats are ignored
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        Base.metadata.create_all(db_engine)
        populate_source_data(db_engine)

        with TemporaryDirectory() as temp_dir:
            chopper = Timechop(
                feature_start_time=datetime(2010, 1, 1),
                feature_end_time=datetime(2014, 1, 1),
                label_start_time=datetime(2011, 1, 1),
                label_end_time=datetime(2014, 1, 1),
                model_update_frequency="1year",
                training_label_timespans=["6months"],
                test_label_timespans=["6months"],
                training_as_of_date_frequencies="1day",
                test_as_of_date_frequencies="3months",
                max_training_histories=["1months"],
                test_durations=["1months"],
            )

            entity_date_table_generator = EntityDateTableGenerator(
                db_engine=db_engine,
                entity_date_table_name="cohort_abcd",
                query="select distinct(entity_id) from events")

            label_generator = LabelGenerator(
                db_engine=db_engine,
                query=sample_config()["label_config"]["query"])

            feature_generator = FeatureGenerator(
                db_engine=db_engine,
                features_schema_name="features",
                replace=True)

            feature_dictionary_creator = FeatureDictionaryCreator(
                db_engine=db_engine, features_schema_name="features")

            feature_group_creator = FeatureGroupCreator(
                feature_group_create_rules)

            feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules)
            project_storage = ProjectStorage(temp_dir)
            planner = Planner(
                feature_start_time=datetime(2010, 1, 1),
                label_names=["outcome"],
                label_types=["binary"],
                cohort_names=cohort_names,
                user_metadata={},
            )

            # Table names here must match the cohort table name given to the
            # generator above and the labels table name used below.
            builder = MatrixBuilder(
                engine=db_engine,
                db_config={
                    "features_schema_name": "features",
                    "labels_schema_name": "public",
                    "labels_table_name": "labels",
                    "cohort_table_name": "cohort_abcd",
                },
                experiment_hash=None,
                matrix_storage_engine=project_storage.matrix_storage_engine(),
                replace=True,
            )

            # chop time
            split_definitions = chopper.chop_time()
            # one train matrix plus its test matrices per split
            num_split_matrices = sum(1 + len(split["test_matrices"])
                                     for split in split_definitions)

            # generate as_of_times for feature/label/state generation
            all_as_of_times = []
            for split in split_definitions:
                all_as_of_times.extend(split["train_matrix"]["as_of_times"])
                for test_matrix in split["test_matrices"]:
                    all_as_of_times.extend(test_matrix["as_of_times"])
            # de-duplicate; the resulting order is not significant here
            all_as_of_times = list(set(all_as_of_times))

            # generate entity_date state table
            entity_date_table_generator.generate_entity_date_table(
                as_of_dates=all_as_of_times)

            # create labels table
            label_generator.generate_all_labels(
                labels_table="labels",
                as_of_dates=all_as_of_times,
                label_timespans=["6months"],
            )

            # create feature table tasks
            # we would use FeatureGenerator#create_all_tables but want to use
            # the tasks dict directly to create a feature dict
            aggregations = feature_generator.aggregations(
                feature_aggregation_config=[
                    {
                        "prefix":
                        "cat",
                        "from_obj":
                        "cat_complaints",
                        "knowledge_date_column":
                        "as_of_date",
                        "aggregates": [{
                            "quantity": "cat_sightings",
                            "metrics": ["count", "avg"],
                            "imputation": {
                                "all": {
                                    "type": "mean"
                                }
                            },
                        }],
                        "intervals": ["1y"],
                        "groups": ["entity_id"],
                    },
                    {
                        "prefix":
                        "dog",
                        "from_obj":
                        "dog_complaints",
                        "knowledge_date_column":
                        "as_of_date",
                        "aggregates_imputation": {
                            "count": {
                                "type": "constant",
                                "value": 7
                            },
                            "sum": {
                                "type": "mean"
                            },
                            "avg": {
                                "type": "zero"
                            },
                        },
                        "aggregates": [{
                            "quantity": "dog_sightings",
                            "metrics": ["count", "avg"]
                        }],
                        "intervals": ["1y"],
                        "groups": ["entity_id"],
                    },
                ],
                feature_dates=all_as_of_times,
                state_table=entity_date_table_generator.entity_date_table_name,
            )
            feature_table_agg_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type="aggregation")

            # create feature aggregation tables
            feature_generator.process_table_tasks(feature_table_agg_tasks)

            feature_table_imp_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type="imputation")

            # create feature imputation tables
            feature_generator.process_table_tasks(feature_table_imp_tasks)

            # build feature dictionaries from feature tables and
            # subsetting config
            master_feature_dict = feature_dictionary_creator.feature_dictionary(
                feature_table_names=feature_table_imp_tasks.keys(),
                index_column_lookup=feature_generator.index_column_lookup(
                    aggregations),
            )

            feature_dicts = feature_group_mixer.generate(
                feature_group_creator.subsets(master_feature_dict))

            # figure out what matrices need to be built
            _, matrix_build_tasks = planner.generate_plans(
                split_definitions, feature_dicts)

            # go and build the matrices
            builder.build_all_matrices(matrix_build_tasks)

            # super basic assertion: did matrices we expect get created?
            matrices_records = list(
                db_engine.execute(
                    """select matrix_uuid, num_observations, matrix_type
                    from triage_metadata.matrices
                    """))
            matrix_directory = os.path.join(temp_dir, "matrices")
            # substring match on the file name, not a strict extension check
            matrices = [
                path for path in os.listdir(matrix_directory) if ".csv" in path
            ]
            metadatas = [
                path for path in os.listdir(matrix_directory)
                if ".yaml" in path
            ]
            assert len(matrices) == num_split_matrices * \
                expected_matrix_multiplier
            assert len(metadatas) == num_split_matrices * \
                expected_matrix_multiplier
            assert len(matrices) == len(matrices_records)
            feature_group_name_lists = []
            for metadata_path in metadatas:
                with open(os.path.join(matrix_directory, metadata_path)) as f:
                    metadata = yaml.full_load(f)
                    feature_group_name_lists.append(metadata["feature_groups"])

            for matrix_uuid, num_observations, matrix_type in matrices_records:
                assert matrix_uuid in matrix_build_tasks  # the hashes of the matrices
                assert type(num_observations) is int
                assert matrix_type == matrix_build_tasks[matrix_uuid][
                    "matrix_type"]

            # Lists are unhashable, so convert each inner list to a tuple
            # before building a set for an order-insensitive comparison.
            def deep_unique_tuple(l):
                return set([tuple(i) for i in l])

            assert deep_unique_tuple(
                feature_group_name_lists) == deep_unique_tuple(
                    expected_group_lists)
Example #10
0
class ExperimentBase(ABC):
    """The base class for all Experiments.

    Subclasses must implement the following four methods:
    process_query_tasks
    process_matrix_build_tasks
    process_subset_tasks
    process_train_test_batches

    Look at singlethreaded.py for reference implementation of each.

    Args:
        config (dict)
        db_engine (triage.util.db.SerializableDbEngine or sqlalchemy.engine.Engine)
        project_path (string)
        replace (bool)
        cleanup_timeout (int)
        materialize_subquery_fromobjs (bool, default True) Whether or not to create and index
            tables for feature "from objects" that are subqueries. Can speed up performance
            when building features for many as-of-dates.
        additional_bigtrain_classnames (list) Any additional class names to perform in the second batch
            of training, which focuses on large modeling algorithms that tend to run with less parallelization
            as there is generally parallelization and high memory requirements built into the algorithm.
        profile (bool)
    """

    cleanup_timeout = 60  # seconds

    def __init__(
        self,
        config,
        db_engine,
        project_path=None,
        matrix_storage_class=CSVMatrixStore,
        replace=True,
        cleanup=False,
        cleanup_timeout=None,
        materialize_subquery_fromobjs=True,
        features_ignore_cohort=False,
        additional_bigtrain_classnames=None,
        profile=False,
        save_predictions=True,
        skip_validation=False,
        partial_run=False,
    ):
        """Normalize the config, register the run and build all components.

        ``config`` is stored as ``self.config`` and modified in place:
        cohort/label queries are loaded, defaults are filled in (full runs
        only), and ``model_grid_preset`` and ``random_seed`` are popped.

        Args:
            config (dict): experiment configuration; must carry a
                ``config_version`` equal to ``CONFIG_VERSION``.
            db_engine: database engine, stored as ``self.db_engine``.
            project_path (string): handed to ``ProjectStorage``.
            matrix_storage_class: handed to ``MatrixStorageEngine``.
            replace (bool): forwarded to the components built in
                ``initialize_components``.
            cleanup (bool): whether ``_run`` drops the intermediate tables
                when it finishes. Forced to False on partial runs.
            cleanup_timeout (int): seconds; the class-level default is used
                when None.
            materialize_subquery_fromobjs (bool): forwarded to the feature
                generator.
            features_ignore_cohort (bool): forwarded to the feature generator.
            additional_bigtrain_classnames (list): forwarded to the
                train/tester.
            profile (bool): whether ``run`` executes under cProfile.
            save_predictions (bool): forwarded to the predictor.
            skip_validation (bool): whether ``_run`` skips config validation.
                Forced to True on partial runs.
            partial_run (bool): skip validation, cleanup and the filling of
                default config values.

        Raises:
            ValueError: if the config version is missing or does not match.
        """
        # For a partial run, skip validation and avoid cleaning up
        # we'll also skip filling default config values below
        if partial_run:
            cleanup = False
            skip_validation = True

        # Snapshot of the constructor arguments (after the partial-run
        # overrides) for run tracking. It is built from ``locals()``, so it
        # must stay ahead of any other local variable in this method.
        experiment_kwargs = bind_kwargs(
            self.__class__,
            **{
                key: value
                for (key, value) in locals().items()
                if key not in {"db_engine", "config", "self"}
            },
        )

        self._check_config_version(config)
        self.config = config

        if self.config.get("cohort_config") is not None:
            self.config["cohort_config"] = load_query_if_needed(
                self.config["cohort_config"]
            )
        if self.config.get("label_config") is not None:
            self.config["label_config"] = load_query_if_needed(
                self.config["label_config"]
            )

        self.project_storage = ProjectStorage(project_path)
        self.model_storage_engine = ModelStorageEngine(self.project_storage)
        self.matrix_storage_engine = MatrixStorageEngine(
            self.project_storage, matrix_storage_class
        )
        self.project_path = project_path
        logger.verbose(
            f"Matrices and trained models will be saved in {self.project_path}"
        )
        self.replace = replace
        if self.replace:
            logger.notice(
                "Replace flag is set to true. Matrices, models, "
                "evaluations and predictions (if they exist) will be replaced"
            )

        self.save_predictions = save_predictions
        if not self.save_predictions:
            logger.notice(
                "Save predictions flag is set to false. "
                "Individual predictions won't be stored in the predictions "
                "table. This will decrease both the running time "
                "of an experiment and also decrease the space needed in the db"
            )

        self.skip_validation = skip_validation
        if self.skip_validation:
            logger.notice(
                "Warning: Skip validation flag is set to true. "
                "The experiment config file specified won't be validated. "
                "This will reduce (a little) the running time of the experiment, "
                "but has some potential risks, e.g. the experiment could fail "
                "after some time due to some misconfiguration. Proceed with care."
            )

        self.db_engine = db_engine
        results_schema.upgrade_if_clean(dburl=self.db_engine.url)

        self.features_schema_name = "features"

        self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
        if not self.materialize_subquery_fromobjs:
            logger.notice(
                "Materialize from_objs is set to false. "
                "The from_objs will be calculated on the fly every time."
            )

        self.features_ignore_cohort = features_ignore_cohort
        if self.features_ignore_cohort:
            logger.notice(
                "Features will be calculated for all the entities "
                "(i.e. ignoring cohort) this setting will have the effect "
                "that more db space will be used, but potentially could save "
                "time if you are running several similar experiments with "
                "different cohorts."
            )

        self.additional_bigtrain_classnames = additional_bigtrain_classnames
        # only fill default values for full runs
        if not partial_run:
            ## Defaults to sane values
            self.config["temporal_config"] = fill_timechop_config_missing(
                self.config, self.db_engine
            )
            ## Defaults to all the entities found in the features_aggregation's from_obj
            self.config["cohort_config"] = fill_cohort_config_missing(self.config)
            ## Defaults to all the feature_aggregation's prefixes
            self.config["feature_group_definition"] = fill_feature_group_definition(
                self.config
            )

        grid_config = fill_model_grid_presets(self.config)
        self.config.pop("model_grid_preset", None)
        if grid_config is not None:
            self.config["grid_config"] = grid_config

        # The seed is removed from the config before the experiment hash is
        # computed below. A missing key and an explicit ``None`` both mean
        # "no seed configured", so a usable seed is always generated,
        # recorded and passed to ``random.seed``.
        configured_seed = self.config.pop("random_seed", None)
        if configured_seed is None:
            logger.notice(
                "Random seed not specified. A random seed will be provided. "
                "This could have interesting side effects, "
                "e.g. new models per model group are trained, "
                "tested and evaluated everytime that you run this experiment configuration"
            )
            # randint needs integer bounds; a float bound such as 1e7 raises
            # TypeError on Python 3.12 and later.
            configured_seed = random.randint(1, 10**7)
        self.random_seed = configured_seed

        logger.verbose(
            f"Using random seed [{self.random_seed}] for running the experiment"
        )
        random.seed(self.random_seed)

        ###################### RUBICON ######################

        self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine)
        logger.debug(f"Experiment hash [{self.experiment_hash}] assigned")
        self.run_id = initialize_tracking_and_get_run_id(
            self.experiment_hash,
            experiment_class_path=classpath(self.__class__),
            random_seed=self.random_seed,
            experiment_kwargs=experiment_kwargs,
            db_engine=self.db_engine,
        )
        logger.debug(f"Experiment run id [{self.run_id}] assigned")

        self.initialize_components()

        self.cleanup = cleanup
        if self.cleanup:
            logger.notice(
                "Cleanup is set to true, so intermediate tables (labels and cohort) "
                "will be removed after matrix creation and subset tables will be "
                "removed after model training and testing"
            )

        # Fall back to the class-level default when no timeout was passed.
        self.cleanup_timeout = (
            self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout
        )

        self.profile = profile
        if self.profile:
            logger.spam("Profiling will be stored using cProfile")

    def _check_config_version(self, config):
        """Ensure ``config`` declares the config version this code supports.

        Args:
            config (dict): experiment configuration.

        Raises:
            ValueError: if ``config_version`` is absent or differs from
                ``CONFIG_VERSION``.
        """
        if "config_version" not in config:
            raise ValueError("config_version key not found in experiment config. ")

        config_version = config["config_version"]
        if config_version != CONFIG_VERSION:
            raise ValueError(
                "Experiment config '{}' "
                "does not match current version '{}'. "
                "Will not run experiment.".format(config_version, CONFIG_VERSION)
            )

    def initialize_components(self):
        """Build every pipeline component from ``self.config`` and attach it to ``self``.

        Created in this order: time chopper, label generator, cohort table
        generator, protected groups generator, feature dictionary creator,
        feature generator, feature group creator and mixer, planner, matrix
        builder, subsetter, model trainer, predictor, individual importance
        calculator, evaluator and train/tester. Where a config section is
        absent, the corresponding ``*NoOp`` stand-in is used instead.

        The labels table name, the cohort table name and (when bias auditing
        is configured) the bias hash are recorded against ``self.run_id``.
        """
        split_config = self.config["temporal_config"]

        self.chopper = Timechop(**split_config)

        # Labels: the table name combines the label name with a hash of the
        # label query.
        if "label_config" in self.config:
            label_config = self.config["label_config"]
            self.labels_table_name = "labels_{}_{}".format(
                label_config.get("name", "default"),
                filename_friendly_hash(label_config["query"]),
            )
            self.label_generator = LabelGenerator(
                label_name=label_config.get("name", None),
                query=label_config["query"],
                replace=self.replace,
                db_engine=self.db_engine,
            )
        else:
            # No label config: name the table after the experiment hash and
            # use a no-op generator.
            self.labels_table_name = "labels_{}".format(self.experiment_hash)
            self.label_generator = LabelGeneratorNoOp()
            logger.warning(
                "label_config missing or unrecognized. Without labels, "
                "you will not be able to make matrices."
            )
        record_labels_table_name(self.run_id, self.db_engine, self.labels_table_name)

        # Cohort: hash the explicit cohort query if there is one, otherwise
        # the label query; with neither, use a no-op generator and ignore the
        # cohort when computing features.
        cohort_config = self.config.get("cohort_config", {})
        self.cohort_table_generator = None
        if "query" in cohort_config:
            self.cohort_hash = filename_friendly_hash(
                self.config["cohort_config"]["query"]
            )
        elif "query" in self.config.get("label_config", {}):
            logger.info(
                "cohort_config missing or unrecognized, but labels are configured. Labels will be used as the cohort."
            )
            self.cohort_hash = filename_friendly_hash(
                self.config["label_config"]["query"]
            )
        else:
            self.features_ignore_cohort = True
            self.cohort_hash = None
            self.cohort_table_name = "cohort_{}".format(self.experiment_hash)
            self.cohort_table_generator = CohortTableGeneratorNoOp()

        # Only the first two branches above leave the generator unset.
        if not self.cohort_table_generator:
            self.cohort_table_name = "cohort_{}_{}".format(
                cohort_config.get("name", "default"), self.cohort_hash
            )
            # ``query`` is None when the cohort config has no query (the
            # labels-as-cohort branch above); the labels table name is
            # passed in as well.
            self.cohort_table_generator = EntityDateTableGenerator(
                entity_date_table_name=self.cohort_table_name,
                db_engine=self.db_engine,
                query=cohort_config.get("query", None),
                labels_table_name=self.labels_table_name,
                replace=self.replace,
            )

        record_cohort_table_name(self.run_id, self.db_engine, self.cohort_table_name)

        # Protected groups (bias audit): the table name is derived from a
        # hash of the whole bias_audit_config section.
        if "bias_audit_config" in self.config:
            bias_config = self.config["bias_audit_config"]
            self.bias_hash = filename_friendly_hash(bias_config)
            self.protected_groups_table_name = f"protected_groups_{self.bias_hash}"
            self.protected_groups_generator = ProtectedGroupsGenerator(
                db_engine=self.db_engine,
                from_obj=parse_from_obj(bias_config, "bias_from_obj"),
                attribute_columns=bias_config.get("attribute_columns", None),
                entity_id_column=bias_config.get("entity_id_column", None),
                knowledge_date_column=bias_config.get("knowledge_date_column", None),
                protected_groups_table_name=self.protected_groups_table_name,
                replace=self.replace,
            )
            record_bias_hash(self.run_id, self.db_engine, self.bias_hash)
        else:
            self.protected_groups_generator = ProtectedGroupsGeneratorNoOp()
            logger.notice(
                "bias_audit_config missing in the configuration file or unrecognized. "
                "Without protected groups, you will not be able to audit your models for bias and fairness."
            )

        # Feature pipeline components.
        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name=self.features_schema_name, db_engine=self.db_engine
        )

        self.feature_generator = FeatureGenerator(
            features_schema_name=self.features_schema_name,
            replace=self.replace,
            db_engine=self.db_engine,
            feature_start_time=split_config["feature_start_time"],
            materialize_subquery_fromobjs=self.materialize_subquery_fromobjs,
            features_ignore_cohort=self.features_ignore_cohort,
        )

        self.feature_group_creator = FeatureGroupCreator(
            self.config.get("feature_group_definition", {"all": [True]})
        )

        self.feature_group_mixer = FeatureGroupMixer(
            self.config.get("feature_group_strategies", ["all"])
        )

        # Matrix planning and building.
        self.planner = Planner(
            feature_start_time=dt_from_str(split_config["feature_start_time"]),
            label_names=[
                self.config.get("label_config", {}).get("name", DEFAULT_LABEL_NAME)
            ],
            label_types=["binary"],
            cohort_names=[self.config.get("cohort_config", {}).get("name", None)],
            user_metadata=self.config.get("user_metadata", {}),
        )

        self.matrix_builder = MatrixBuilder(
            db_config={
                "features_schema_name": self.features_schema_name,
                "labels_schema_name": "public",
                "labels_table_name": self.labels_table_name,
                "cohort_table_name": self.cohort_table_name,
            },
            matrix_storage_engine=self.matrix_storage_engine,
            experiment_hash=self.experiment_hash,
            include_missing_labels_in_train_as=self.config.get("label_config", {}).get(
                "include_missing_labels_in_train_as", None
            ),
            engine=self.db_engine,
            replace=self.replace,
            run_id=self.run_id,
        )

        # Subsets: reading ``self.all_as_of_times`` here computes the
        # temporal splits through ``self.chopper`` created above.
        self.subsets = self.config.get("scoring", {}).get("subsets", [])
        if self.subsets:
            self.subsetter = Subsetter(
                db_engine=self.db_engine,
                replace=self.replace,
                as_of_times=self.all_as_of_times,
            )
        else:
            self.subsetter = SubsetterNoOp()
            logger.notice(
                "scoring.subsets missing in the configuration file or unrecognized. No subsets will be generated"
            )

        # Modeling components.
        self.trainer = ModelTrainer(
            experiment_hash=self.experiment_hash,
            model_storage_engine=self.model_storage_engine,
            model_grouper=ModelGrouper(self.config.get("model_group_keys", [])),
            db_engine=self.db_engine,
            replace=self.replace,
            run_id=self.run_id,
        )

        self.predictor = Predictor(
            db_engine=self.db_engine,
            model_storage_engine=self.model_storage_engine,
            save_predictions=self.save_predictions,
            replace=self.replace,
            rank_order=self.config.get("prediction", {}).get(
                "rank_tiebreaker", "worst"
            ),
        )

        if "individual_importance" in self.config:
            self.individual_importance_calculator = IndividualImportanceCalculator(
                db_engine=self.db_engine,
                n_ranks=self.config.get("individual_importance", {}).get("n_ranks", 5),
                methods=self.config.get("individual_importance", {}).get(
                    "methods", ["uniform"]
                ),
                replace=self.replace,
            )
        else:
            self.individual_importance_calculator = IndividualImportanceCalculatorNoOp()
            logger.notice(
                "individual_importance missing in the configuration file or unrecognized, "
                "you will not be able to do analysis on individual feature importances."
            )

        self.evaluator = ModelEvaluator(
            db_engine=self.db_engine,
            testing_metric_groups=self.config.get("scoring", {}).get(
                "testing_metric_groups", []
            ),
            training_metric_groups=self.config.get("scoring", {}).get(
                "training_metric_groups", []
            ),
            bias_config=self.config.get("bias_audit_config", {}),
        )

        # Ties the trainer, predictor, evaluator and the other modeling
        # components together.
        self.model_train_tester = ModelTrainTester(
            matrix_storage_engine=self.matrix_storage_engine,
            model_evaluator=self.evaluator,
            model_trainer=self.trainer,
            individual_importance_calculator=self.individual_importance_calculator,
            predictor=self.predictor,
            subsets=self.subsets,
            protected_groups_generator=self.protected_groups_generator,
            cohort_hash=self.cohort_hash,
            replace=self.replace,
            additional_bigtrain_classnames=self.additional_bigtrain_classnames,
        )

    def get_for_update(self):
        """Return the module-level ``get_for_update`` handle for this experiment.

        The rest of the class uses the result as a context manager to set
        attributes on the experiment record.
        """
        experiment_model = results_schema.Experiment
        return get_for_update(self.db_engine, experiment_model, self.experiment_hash)

    @cachedproperty
    def split_definitions(self):
        """Temporal splits based on the experiment's configuration

        The splits are computed by ``self.chopper``, summarized in the log,
        and their count is stored on the experiment record.

        Returns: (dict) temporal splits

        Example:
        ```
        {
            'feature_start_time': {datetime},
            'feature_end_time': {datetime},
            'label_start_time': {datetime},
            'label_end_time': {datetime},
            'train_matrix': {
                'first_as_of_time': {datetime},
                'last_as_of_time': {datetime},
                'matrix_info_end_time': {datetime},
                'training_label_timespan': {str},
                'training_as_of_date_frequency': {str},
                'max_training_history': {str},
                'as_of_times': [list of {datetime}s]
            },
            'test_matrices': [list of matrix defs similar to train_matrix]
        }
        ```

        (When updating/setting split definitions, matrices should have
        UUIDs.)

        """
        split_definitions = self.chopper.chop_time()
        logger.verbose("Computed and stored temporal split definitions")
        logger.debug(f"Temporal split definitions: {split_definitions}")
        logger.spam("\n----TIME SPLIT SUMMARY----\n")
        logger.spam(f"Number of time splits: {len(split_definitions)}")
        for split_index, split in enumerate(split_definitions):
            train_times = split["train_matrix"]["as_of_times"]
            test_times = [
                as_of_time
                for test_matrix in split["test_matrices"]
                for as_of_time in test_matrix["as_of_times"]
            ]
            # ``default=None`` keeps this purely informational summary from
            # raising when a split has no as_of_times.
            logger.spam(
                f"Split index {split_index}:\n"
                f"Training as_of_time range: {min(train_times, default=None)} "
                f"to {max(train_times, default=None)} ({len(train_times)} total)\n"
                f"Testing as_of_time range: {min(test_times, default=None)} "
                f"to {max(test_times, default=None)} ({len(test_times)} total)\n\n"
            )

        with self.get_for_update() as experiment:
            experiment.time_splits = len(split_definitions)
        return split_definitions

    @cachedproperty
    def all_as_of_times(self):
        """Distinct 'as of times' across all train and test matrices.

        Used for label and feature generation. The number of distinct times
        is also stored on the experiment record.

        Returns: (list) of datetimes, de-duplicated, in no particular order

        """
        logger.spam("Calculating all the as_of_times")
        collected = []
        for split in self.split_definitions:
            train_times = split["train_matrix"]["as_of_times"]
            collected.extend(train_times)
            logger.spam(f"Adding as_of_times from train matrix: {train_times}")
            for test_matrix in split["test_matrices"]:
                test_times = test_matrix["as_of_times"]
                logger.spam(f"Adding as_of_times from test matrix: {test_times}")
                collected.extend(test_times)

        total_count = len(collected)
        logger.spam(
            f"Computed {total_count} total as_of_times for label and feature generation"
        )
        unique_times = list(set(collected))
        unique_count = len(unique_times)
        logger.debug(
            f"Computed {unique_count} distinct as_of_times for label and feature generation"
        )
        logger.spam(
            "You can view all as_of_times by inspecting `.all_as_of_times` on this Experiment"
        )
        with self.get_for_update() as experiment:
            experiment.as_of_times = len(unique_times)
        return unique_times

    @cachedproperty
    def collate_aggregations(self):
        """``Aggregation`` objects built from the ``feature_aggregations`` config.

        The number of aggregations is stored on the experiment record as
        ``feature_blocks``.

        Returns: (list) of ``collate.Aggregation`` objects; empty when the
            config has no ``feature_aggregations`` section

        """
        logger.info("Creating collate aggregations")
        try:
            aggregation_config = self.config["feature_aggregations"]
        except KeyError:
            logger.warning("No feature_aggregation config is available")
            return []

        built = self.feature_generator.aggregations(
            feature_aggregation_config=aggregation_config,
            feature_dates=self.all_as_of_times,
            state_table=self.cohort_table_name,
        )
        with self.get_for_update() as experiment:
            experiment.feature_blocks = len(built)
        return built

    @cachedproperty
    def feature_aggregation_table_tasks(self):
        """Query tasks that create the (pre-imputation) feature aggregation tables.

        Returns: (dict) keys are group table names, values are
            themselves dicts, each with keys for different stages of
            table creation (prepare, inserts, finalize) and with values
            being lists of SQL commands

        """
        as_of_time_count = len(self.all_as_of_times)
        logger.spam(
            f"Calculating feature aggregation tasks for {as_of_time_count} as_of_times"
        )
        generator = self.feature_generator
        return generator.generate_all_table_tasks(
            self.collate_aggregations, task_type="aggregation"
        )

    @cachedproperty
    def feature_imputation_table_tasks(self):
        """Query tasks that create the imputed feature tables.

        Returns: (dict) keys are group table names, values are
            themselves dicts, each with keys for different stages of
            table creation (prepare, inserts, finalize) and with values
            being lists of SQL commands

        """
        as_of_time_count = len(self.all_as_of_times)
        logger.spam(
            f"Calculating feature imputation tasks for {as_of_time_count} as_of_times"
        )
        generator = self.feature_generator
        return generator.generate_all_table_tasks(
            self.collate_aggregations, task_type="imputation"
        )

    @cachedproperty
    def master_feature_dictionary(self):
        """Every feature available in the imputed feature tables.

        Not all of these features necessarily end up in matrices. The total
        feature count is stored on the experiment record.

        Returns: (list) of dicts, keys being feature table names and
        values being lists of feature names

        """
        imputed_table_names = self.feature_imputation_table_tasks.keys()
        index_columns = self.feature_generator.index_column_lookup(
            self.collate_aggregations
        )
        feature_dictionary = self.feature_dictionary_creator.feature_dictionary(
            feature_table_names=imputed_table_names,
            index_column_lookup=index_columns,
        )
        logger.debug(f"Computed master feature dictionary: {feature_dictionary}")
        with self.get_for_update() as experiment:
            # Count the features across all tables.
            every_feature = itertools.chain.from_iterable(feature_dictionary.values())
            experiment.total_features = len(list(every_feature))
        return feature_dictionary

    @cachedproperty
    def feature_dicts(self):
        """Feature dictionaries, representing the feature tables and
        columns configured in this experiment after computing feature
        groups.

        The number of feature group combinations is stored on the
        experiment record.

        Returns: (list) of dicts, keys being feature table names and
        values being lists of feature names; empty when no features exist

        """
        if not self.master_feature_dictionary:
            logger.warning(
                "No features have been created. Either there is no feature configuration "
                "or there was some problem processing them."
            )
            return []
        combinations = self.feature_group_mixer.generate(
            self.feature_group_creator.subsets(self.master_feature_dictionary)
        )
        with self.get_for_update() as experiment:
            experiment.feature_group_combinations = len(combinations)
        return combinations

    @cachedproperty
    def matrix_build_tasks(self):
        """Build tasks for every matrix this Experiment needs.

        Each task contains arguments understood by
        ``Architect.build_matrix``. Planning also yields the updated split
        definitions, which are stored as ``self.full_matrix_definitions``.

        Returns: (dict) of build tasks as produced by the planner; empty
            when the cohort or labels table has no data

        """
        required_tables = (
            (
                self.cohort_table_name,
                "cohort table is not populated, cannot build any matrices",
            ),
            (
                self.labels_table_name,
                "labels table is not populated, cannot build any matrices",
            ),
        )
        for table_name, complaint in required_tables:
            if not table_has_data(table_name, self.db_engine):
                logger.warning(complaint)
                return {}

        plans = self.planner.generate_plans(self.split_definitions, self.feature_dicts)
        definitions, build_tasks = plans
        self.full_matrix_definitions = definitions
        return build_tasks

    @cachedproperty
    def full_matrix_definitions(self):
        """Split definitions as updated by the planner.

        Planning also yields the matrix build tasks, which are stored as
        ``self.matrix_build_tasks``.

        Returns: (list) temporal and feature information for each matrix

        """
        plans = self.planner.generate_plans(self.split_definitions, self.feature_dicts)
        definitions, build_tasks = plans
        self.matrix_build_tasks = build_tasks
        return definitions

    @property
    def all_label_timespans(self):
        """Distinct label timespans used for training or testing.

        Returns: (list) label timespans, in string form as they appeared in the experiment config

        """
        temporal_config = self.config["temporal_config"]
        combined = (
            temporal_config["training_label_timespans"]
            + temporal_config["test_label_timespans"]
        )
        return list(set(combined))

    @cachedproperty
    def subset_tasks(self):
        """Tasks generated by the subsetter for the configured subsets."""
        configured_subsets = self.subsets
        return self.subsetter.generate_tasks(configured_subsets)

    @experiment_entrypoint
    def generate_labels(self):
        """Populate the labels table for every as-of-time and label timespan.

        Results are stored in the database, not returned
        """
        logger.info("Setting up labels")
        as_of_times = self.all_as_of_times
        label_timespans = self.all_label_timespans
        self.label_generator.generate_all_labels(
            self.labels_table_name, as_of_times, label_timespans
        )
        logger.success(
            f"Labels set up in the table {self.labels_table_name} successfully "
        )

    @experiment_entrypoint
    def generate_cohort(self):
        """Populate the cohort table for every as-of-time.

        Results are stored in the database, not returned
        """
        logger.info("Setting up cohort")
        as_of_dates = self.all_as_of_times
        self.cohort_table_generator.generate_entity_date_table(as_of_dates=as_of_dates)
        logger.success(
            f"Cohort set up in the table {self.cohort_table_name} successfully"
        )

    @experiment_entrypoint
    def generate_protected_groups(self):
        """Populate the protected groups table for every as-of-time.

        Results are stored in the database, not returned
        """
        generator = self.protected_groups_generator
        generator.generate_all_dates(
            self.all_as_of_times, self.cohort_table_name, self.cohort_hash
        )

    def log_split(self, split_num, split):
        """Log the start of train/test for one split.

        Args:
            split_num (int): zero-based index of the split; logged one-based.
            split (dict): split definition containing a ``train_matrix`` entry.
        """
        train_matrix = split["train_matrix"]
        total_splits = len(self.full_matrix_definitions)
        logger.info(
            "Starting train/test for %s out of %s: train range: %s to %s",
            split_num + 1,
            total_splits,
            train_matrix["first_as_of_time"],
            train_matrix["matrix_info_end_time"],
        )

    @abstractmethod
    def process_subset_tasks(self, subset_tasks):
        """Run the given subset tasks; subclasses must implement this.

        ``generate_subsets`` calls it with ``self.subset_tasks``.
        """

    @abstractmethod
    def process_train_test_batches(self, train_test_batches):
        """Run the given train/test batches; subclasses must implement this.

        ``train_and_test_models`` calls it with the batches returned by
        ``_all_train_test_batches``.
        """

    @abstractmethod
    def process_query_tasks(self, query_tasks):
        """Run the given feature table query tasks; subclasses must implement this.

        Called with ``self.feature_aggregation_table_tasks`` and
        ``self.feature_imputation_table_tasks``.
        """

    @abstractmethod
    def process_matrix_build_tasks(self, matrix_build_tasks):
        """Run the given matrix build tasks; subclasses must implement this.

        ``build_matrices`` calls it with ``self.matrix_build_tasks``.
        """

    @experiment_entrypoint
    def generate_preimputation_features(self):
        """Run the feature aggregation query tasks and log the resulting tables."""
        logger.info("Creating features tables (before imputation) ")
        self.process_query_tasks(self.feature_aggregation_table_tasks)
        table_names = ",".join(
            aggregation.get_table_name() for aggregation in self.collate_aggregations
        )
        logger.success(
            f"Features (before imputation) were stored in the tables {table_names} successfully"
        )

    @experiment_entrypoint
    def impute_missing_features(self):
        """Run the feature imputation query tasks and log the resulting tables."""
        logger.info("Imputing missing values in features")
        self.process_query_tasks(self.feature_imputation_table_tasks)
        table_names = ",".join(
            aggregation.get_table_name(imputed=True)
            for aggregation in self.collate_aggregations
        )
        logger.success(
            f"Imputed features were stored in the tables {table_names} successfully"
        )

    def build_matrices(self):
        """Associate the planned matrices with this experiment and build them.

        The number of matrices needed is stored on the experiment record and
        the start of matrix building is recorded for this run before the
        build tasks are processed.
        """
        build_tasks = self.matrix_build_tasks
        matrix_uuids = build_tasks.keys()
        associate_matrices_with_experiment(
            self.experiment_hash, matrix_uuids, self.db_engine
        )
        logger.info("Building matrices")
        logger.verbose(f"It is necessary to build {len(matrix_uuids)} matrices")
        with self.get_for_update() as experiment:
            experiment.matrices_needed = len(matrix_uuids)
        record_matrix_building_started(self.run_id, self.db_engine)
        self.process_matrix_build_tasks(build_tasks)
        logger.success(
            f"Matrices were stored in {self.project_path}/matrices successfully"
        )

    @experiment_entrypoint
    def generate_matrices(self):
        """Run every stage up to and including matrix building, in order."""
        # Forcing the calculation of all the as of times, so the logging makes more sense
        _ = self.all_as_of_times
        self.generate_labels()
        self.generate_cohort()
        self.generate_preimputation_features()
        self.impute_missing_features()
        self.build_matrices()

    @experiment_entrypoint
    def generate_subsets(self):
        """Hand the experiment's subset tasks to ``process_subset_tasks``."""
        tasks = self.subset_tasks
        self.process_subset_tasks(tasks)

    def _all_train_test_batches(self):
        """Generate the train/test task batches for the configured model grid.

        A batch is a model_group to be train, test and evaluated.

        Returns: the batches from the train/tester, or None when the config
            has no ``grid_config``
        """
        if "grid_config" in self.config:
            return self.model_train_tester.generate_task_batches(
                splits=self.full_matrix_definitions,
                grid_config=self.config.get("grid_config"),
                model_comment=self.config.get("model_comment", None),
            )

        logger.warning(
            "No grid_config was passed in the experiment config. No models will be trained"
        )
        return None

    @experiment_entrypoint
    def train_and_test_models(self):
        """Train, test and evaluate every model in the configured grid.

        Stores the grid size and the number of models needed on the
        experiment record, associates the model hashes with this experiment,
        records the start of model building for this run, and then hands the
        batches to ``process_train_test_batches``. Returns early when there
        are no batches.
        """
        batches = self._all_train_test_batches()
        if not batches:
            logger.notice("No train/test tasks found, so no training to do")
            return

        with self.get_for_update() as experiment:
            flattened_grid = self.trainer.flattened_grid_config(
                self.config.get("grid_config")
            )
            experiment.grid_size = len(list(flattened_grid))
            logger.info(
                f"{experiment.grid_size} models groups will be trained, tested and evaluated"
            )

        logger.info("Training, testing and evaluating models")
        logger.verbose(f"{len(batches)} train/test tasks found.")
        model_hashes = {
            task["train_kwargs"]["model_hash"]
            for batch in batches
            for task in batch.tasks
        }
        associate_models_with_experiment(
            self.experiment_hash, model_hashes, self.db_engine
        )
        with self.get_for_update() as experiment:
            experiment.models_needed = len(model_hashes)
        record_model_building_started(self.run_id, self.db_engine)
        self.process_train_test_batches(batches)
        logger.success("Training, testing and evaluating models completed")

    def validate(self, strict=True):
        """Run ``ExperimentValidator`` over this experiment's config.

        Args:
            strict (bool): passed through to the validator.
        """
        validator = ExperimentValidator(self.db_engine, strict=strict)
        validator.run(self.config)

    def _run(self):
        """Validate (unless skipped) and run all experiment stages in order.

        Any exception from a stage is logged and re-raised. When
        ``self.cleanup`` is set, the intermediate tables are dropped whether
        or not the stages succeeded.
        """
        if not self.skip_validation:
            self.validate()

        stages = (
            self.generate_matrices,
            self.generate_subsets,
            self.generate_protected_groups,
            self.train_and_test_models,
            self._log_end_of_run_report,
        )
        try:
            for stage in stages:
                stage()
        except Exception:
            logger.error("Uh oh... Houston we have a problem")
            raise
        finally:
            if self.cleanup:
                self.clean_up_matrix_building_tables()
                self.clean_up_subset_tables()
                logger.notice(
                    "Cleanup flag was set to True, so label, cohort and subset tables were deleted"
                )

    def _log_end_of_run_report(self):
        """Log whether every expected matrix and model exists after the run.

        Matrices or models associated with this experiment but reported as
        missing are listed in a notice; otherwise a success message is
        logged for each kind.
        """
        missing_matrices = missing_matrix_uuids(self.experiment_hash, self.db_engine)
        if len(missing_matrices) > 0:
            logger.notice(
                f"Found {len(missing_matrices)} missing matrix uuids. "
                f"This means that they were supposed to either be built or reused "
                f"by this experiment but are not present in the matrices table. "
                f"Inspect the logs for any matrix building errors. Full list: {missing_matrices}"
            )
        else:
            logger.success(
                "All matrices that were supposed to be built were built. Awesome!"
            )

        missing_models = missing_model_hashes(self.experiment_hash, self.db_engine)
        if len(missing_models) > 0:
            logger.notice(
                f"Found {len(missing_models)} missing model hashes. "
                f"This means that they were supposed to either be trained or reused "
                f"by this experiment but are not present in the models table. "
                f"Inspect the logs for any training errors. Full list: {missing_models}"
            )
        else:
            logger.success(
                "All models that were supposed to be trained were trained. Awesome!"
            )

    def clean_up_matrix_building_tables(self):
        """Clean up the cohort and labels tables within ``self.cleanup_timeout``."""
        logger.debug("Cleaning up cohort and labels tables")
        time_limit = self.cleanup_timeout
        with timeout(time_limit):
            self.cohort_table_generator.clean_up()
            self.label_generator.clean_up(self.labels_table_name)
        logger.debug("Cleaning up cohort and labels tables: completed")

    def clean_up_subset_tables(self):
        """Clean up every subset table within ``self.cleanup_timeout``."""
        logger.debug("Cleaning up subset tables")
        with timeout(self.cleanup_timeout):
            for subset_task in self.subset_tasks:
                subset_task["subset_table_generator"].clean_up()
        logger.debug("Cleaning up subset tables: completed")

    def _run_profile(self):
        """Run the experiment under cProfile and save the stats in project storage.

        The stats are marshalled into a ``profiling_stats`` store whose file
        name is the current Unix timestamp with a ``.profile`` suffix.
        """
        cp = cProfile.Profile()
        cp.runcall(self._run)
        store = self.project_storage.get_store(
            ["profiling_stats"], f"{int(time.time())}.profile"
        )
        with store.open("wb") as fd:
            # create_stats() populates ``cp.stats``, which is what gets dumped.
            cp.create_stats()
            marshal.dump(cp.stats, fd)
            logger.spam(
                f"Profiling stats of this Triage run calculated and written to {store} "
                f"in cProfile format."
            )

    @experiment_entrypoint
    def run(self):
        """Execute the experiment, under the profiler when ``self.profile`` is set.

        Any exception is logged with its traceback and re-raised.
        """
        try:
            runner = self._run_profile if self.profile else self._run
            runner()
        except Exception:
            logger.exception("Run interrupted by uncaught exception")
            raise

    __call__ = run
Example #11
0
def predict_forward_with_existed_model(db_engine, project_path, model_id,
                                       as_of_date):
    """Predict forward given model_id and as_of_date and store the prediction in database

    For a single as-of date this rebuilds the cohort table, the feature
    aggregation and imputation tables, and a "production" matrix from the
    configuration stored with the model, then asks a Predictor to score
    that matrix.

    Args:
            db_engine (sqlalchemy.db.engine)
            project_path (string) Path handed to ProjectStorage, from which the
                matrix and model storage engines are obtained
            model_id (int) The id of a given model in the database
            as_of_date (string) a date string like "YYYY-MM-DD"
    """
    logger.spam("In PREDICT LIST................")
    # NOTE(review): presumably brings the database schema up to date before
    # anything is read from it -- confirm against upgrade_db.
    upgrade_db(db_engine=db_engine)
    project_storage = ProjectStorage(project_path)
    matrix_storage_engine = project_storage.matrix_storage_engine()
    # 1. Get feature and cohort config from database
    # matrix_metadata here belongs to the model's TRAIN matrix; the name is
    # rebound to the production matrix's metadata in step 5.
    (train_matrix_uuid,
     matrix_metadata) = train_matrix_info_from_model_id(db_engine, model_id)
    experiment_config = experiment_config_from_model_id(db_engine, model_id)

    # 2. Generate cohort
    # The cohort table is named after the cohort config and placed in the
    # triage_production schema.
    cohort_table_name = f"triage_production.cohort_{experiment_config['cohort_config']['name']}"
    cohort_table_generator = EntityDateTableGenerator(
        db_engine=db_engine,
        query=experiment_config['cohort_config']['query'],
        entity_date_table_name=cohort_table_name)
    # Only the requested as-of date is generated, parsed into a datetime.
    cohort_table_generator.generate_entity_date_table(
        as_of_dates=[dt_from_str(as_of_date)])

    # 3. Generate feature aggregations
    feature_generator = FeatureGenerator(
        db_engine=db_engine,
        features_schema_name="triage_production",
        feature_start_time=experiment_config['temporal_config']
        ['feature_start_time'],
    )
    # Note that as_of_date is passed here as the original string, not as the
    # parsed datetime used for the cohort above.
    collate_aggregations = feature_generator.aggregations(
        feature_aggregation_config=experiment_config['feature_aggregations'],
        feature_dates=[as_of_date],
        state_table=cohort_table_name)
    feature_generator.process_table_tasks(
        feature_generator.generate_all_table_tasks(collate_aggregations,
                                                   task_type='aggregation'))

    # 4. Reconstruct feature dictionary from feature_names and generate imputation

    reconstructed_feature_dict = FeatureGroup()
    imputation_table_tasks = OrderedDict()

    for aggregation in collate_aggregations:
        # Feature names for this aggregation are derived from the train
        # matrix metadata fetched in step 1.
        feature_group, feature_names = get_feature_names(
            aggregation, matrix_metadata)
        reconstructed_feature_dict[feature_group] = feature_names

        # Make sure that the features imputed in training should also be imputed in production

        features_imputed_in_train = get_feature_needs_imputation_in_train(
            aggregation, feature_names)

        features_imputed_in_production = get_feature_needs_imputation_in_production(
            aggregation, db_engine)

        # Impute the union of what training and production need imputed.
        total_impute_cols = set(features_imputed_in_production) | set(
            features_imputed_in_train)
        # Everything else: feature names that do not contain "_imp" and are
        # not already in the impute set.
        total_nonimpute_cols = set(f for f in set(feature_names)
                                   if '_imp' not in f) - total_impute_cols

        # Relies on a private FeatureGenerator method.
        task_generator = feature_generator._generate_imp_table_tasks_for

        imputation_table_tasks.update(
            task_generator(aggregation,
                           impute_cols=list(total_impute_cols),
                           nonimpute_cols=list(total_nonimpute_cols)))
    # All imputation tasks are processed together, after the loop.
    feature_generator.process_table_tasks(imputation_table_tasks)

    # 5. Build matrix
    db_config = {
        "features_schema_name": "triage_production",
        "labels_schema_name": "public",
        "cohort_table_name": cohort_table_name,
    }

    # No experiment hash is associated with this builder, and replace=True is
    # requested.
    matrix_builder = MatrixBuilder(
        db_config=db_config,
        matrix_storage_engine=matrix_storage_engine,
        engine=db_engine,
        experiment_hash=None,
        replace=True,
    )

    # feature_start_time is read before temporal_config is updated below.
    feature_start_time = experiment_config['temporal_config'][
        'feature_start_time']
    label_name = experiment_config['label_config']['name']
    # The label type is hard-coded rather than read from the config.
    label_type = 'binary'
    cohort_name = experiment_config['cohort_config']['name']
    user_metadata = experiment_config['user_metadata']

    # Use timechop to get the time definition for production
    # temporal_config is the same object as experiment_config["temporal_config"],
    # so update() also changes experiment_config in place.
    temporal_config = experiment_config["temporal_config"]
    temporal_config.update(
        temporal_params_from_matrix_metadata(db_engine, model_id))
    timechopper = Timechop(**temporal_config)
    # Only the first configured test duration and test label timespan are used.
    prod_definitions = timechopper.define_test_matrices(
        train_test_split_time=dt_from_str(as_of_date),
        test_duration=temporal_config['test_durations'][0],
        test_label_timespan=temporal_config['test_label_timespans'][0])

    # Metadata is built from the last of the returned definitions.
    matrix_metadata = Planner.make_metadata(
        prod_definitions[-1],
        reconstructed_feature_dict,
        label_name,
        label_type,
        cohort_name,
        'production',
        feature_start_time,
        user_metadata,
    )

    # matrix_id reads "<as_of_date>_model_id_<model_id>_risklist".
    matrix_metadata['matrix_id'] = str(
        as_of_date) + f'_model_id_{model_id}' + '_risklist'

    # The uuid is a hash of the metadata, including the matrix_id set above.
    matrix_uuid = filename_friendly_hash(matrix_metadata)

    matrix_builder.build_matrix(
        as_of_times=[as_of_date],
        label_name=label_name,
        label_type=label_type,
        feature_dictionary=reconstructed_feature_dict,
        matrix_metadata=matrix_metadata,
        matrix_uuid=matrix_uuid,
        matrix_type="production",
    )

    # 6. Predict the risk score for production
    predictor = Predictor(
        model_storage_engine=project_storage.model_storage_engine(),
        db_engine=db_engine,
        rank_order='best')

    # The train matrix's columns are passed alongside the production matrix.
    # NOTE(review): presumably so the predictor can align column order with
    # training -- confirm against Predictor.predict.
    predictor.predict(
        model_id=model_id,
        matrix_store=matrix_storage_engine.get_store(matrix_uuid),
        misc_db_parameters={},
        train_matrix_columns=matrix_storage_engine.get_store(
            train_matrix_uuid).columns())