Example #1
0
    def initialize_components(self):
        split_config = self.config["temporal_config"]

        self.chopper = Timechop(**split_config)

        cohort_config = self.config.get("cohort_config", {})
        if "query" in cohort_config:
            self.cohort_table_generator = CohortTableGenerator(
                cohort_table_name=self.cohort_table_name,
                db_engine=self.db_engine,
                query=cohort_config["query"],
                replace=self.replace
            )
        else:
            logging.warning(
                "cohort_config missing or unrecognized. Without a cohort, "
                "you will not be able to make matrices or perform feature imputation."
            )
            self.cohort_table_generator = CohortTableGeneratorNoOp()

        if "label_config" in self.config:
            self.label_generator = LabelGenerator(
                label_name=self.config["label_config"].get("name", None),
                query=self.config["label_config"]["query"],
                replace=self.replace,
                db_engine=self.db_engine,
            )
        else:
            self.label_generator = LabelGeneratorNoOp()
            logging.warning(
                "label_config missing or unrecognized. Without labels, "
                "you will not be able to make matrices."
            )

        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name=self.features_schema_name, db_engine=self.db_engine
        )

        self.feature_generator = FeatureGenerator(
            features_schema_name=self.features_schema_name,
            replace=self.replace,
            db_engine=self.db_engine,
            feature_start_time=split_config["feature_start_time"],
            materialize_subquery_fromobjs=self.materialize_subquery_fromobjs
        )

        self.feature_group_creator = FeatureGroupCreator(
            self.config.get("feature_group_definition", {"all": [True]})
        )

        self.feature_group_mixer = FeatureGroupMixer(
            self.config.get("feature_group_strategies", ["all"])
        )

        self.planner = Planner(
            feature_start_time=dt_from_str(split_config["feature_start_time"]),
            label_names=[
                self.config.get("label_config", {}).get("name", DEFAULT_LABEL_NAME)
            ],
            label_types=["binary"],
            cohort_names=[self.config.get("cohort_config", {}).get("name", None)],
            user_metadata=self.config.get("user_metadata", {}),
        )

        self.matrix_builder = MatrixBuilder(
            db_config={
                "features_schema_name": self.features_schema_name,
                "labels_schema_name": "public",
                "labels_table_name": self.labels_table_name,
                "cohort_table_name": self.cohort_table_name,
            },
            matrix_storage_engine=self.matrix_storage_engine,
            experiment_hash=self.experiment_hash,
            include_missing_labels_in_train_as=self.config.get("label_config", {}).get(
                "include_missing_labels_in_train_as", None
            ),
            engine=self.db_engine,
            replace=self.replace,
        )

        self.trainer = ModelTrainer(
            experiment_hash=self.experiment_hash,
            model_storage_engine=self.model_storage_engine,
            model_grouper=ModelGrouper(self.config.get("model_group_keys", [])),
            db_engine=self.db_engine,
            replace=self.replace,
        )

        self.tester = ModelTester(
            model_storage_engine=self.model_storage_engine,
            matrix_storage_engine=self.matrix_storage_engine,
            replace=self.replace,
            db_engine=self.db_engine,
            individual_importance_config=self.config.get("individual_importance", {}),
            evaluator_config=self.config.get("scoring", {}),
        )
Example #2
0
    def initialize_components(self):
        split_config = self.config["temporal_config"]

        self.chopper = Timechop(**split_config)

        cohort_config = self.config.get("cohort_config", {})
        if "query" in cohort_config:
            self.cohort_table_name = "cohort_{}_{}".format(
                cohort_config.get('name', 'default'),
                filename_friendly_hash(cohort_config['query']))
            self.cohort_table_generator = EntityDateTableGenerator(
                entity_date_table_name=self.cohort_table_name,
                db_engine=self.db_engine,
                query=cohort_config["query"],
                replace=self.replace)
        else:
            logging.warning(
                "cohort_config missing or unrecognized. Without a cohort, "
                "you will not be able to make matrices, perform feature imputation, "
                "or save time by only computing features for that cohort.")
            self.features_ignore_cohort = True
            self.cohort_table_name = "cohort_{}".format(self.experiment_hash)
            self.cohort_table_generator = EntityDateTableGeneratorNoOp()

        self.subsets = [None] + self.config.get("scoring", {}).get(
            "subsets", [])

        if "label_config" in self.config:
            label_config = self.config["label_config"]
            self.labels_table_name = "labels_{}_{}".format(
                label_config.get('name', 'default'),
                filename_friendly_hash(label_config['query']))
            self.label_generator = LabelGenerator(
                label_name=label_config.get("name", None),
                query=label_config["query"],
                replace=self.replace,
                db_engine=self.db_engine,
            )
        else:
            self.labels_table_name = "labels_{}".format(self.experiment_hash)
            self.label_generator = LabelGeneratorNoOp()
            logging.warning(
                "label_config missing or unrecognized. Without labels, "
                "you will not be able to make matrices.")

        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name=self.features_schema_name,
            db_engine=self.db_engine)

        self.feature_generator = FeatureGenerator(
            features_schema_name=self.features_schema_name,
            replace=self.replace,
            db_engine=self.db_engine,
            feature_start_time=split_config["feature_start_time"],
            materialize_subquery_fromobjs=self.materialize_subquery_fromobjs,
            features_ignore_cohort=self.features_ignore_cohort)

        self.feature_group_creator = FeatureGroupCreator(
            self.config.get("feature_group_definition", {"all": [True]}))

        self.feature_group_mixer = FeatureGroupMixer(
            self.config.get("feature_group_strategies", ["all"]))

        self.planner = Planner(
            feature_start_time=dt_from_str(split_config["feature_start_time"]),
            label_names=[
                self.config.get("label_config",
                                {}).get("name", DEFAULT_LABEL_NAME)
            ],
            label_types=["binary"],
            cohort_names=[
                self.config.get("cohort_config", {}).get("name", None)
            ],
            user_metadata=self.config.get("user_metadata", {}),
        )

        self.matrix_builder = MatrixBuilder(
            db_config={
                "features_schema_name": self.features_schema_name,
                "labels_schema_name": "public",
                "labels_table_name": self.labels_table_name,
                "cohort_table_name": self.cohort_table_name,
            },
            matrix_storage_engine=self.matrix_storage_engine,
            experiment_hash=self.experiment_hash,
            include_missing_labels_in_train_as=self.config.get(
                "label_config", {}).get("include_missing_labels_in_train_as",
                                        None),
            engine=self.db_engine,
            replace=self.replace,
        )

        self.subsetter = Subsetter(db_engine=self.db_engine,
                                   replace=self.replace,
                                   as_of_times=self.all_as_of_times)

        self.trainer = ModelTrainer(
            experiment_hash=self.experiment_hash,
            model_storage_engine=self.model_storage_engine,
            model_grouper=ModelGrouper(self.config.get("model_group_keys",
                                                       [])),
            db_engine=self.db_engine,
            replace=self.replace,
        )

        self.predictor = Predictor(
            db_engine=self.db_engine,
            model_storage_engine=self.model_storage_engine,
            save_predictions=self.save_predictions,
            replace=self.replace,
        )

        self.individual_importance_calculator = IndividualImportanceCalculator(
            db_engine=self.db_engine,
            n_ranks=self.config.get("individual_importance",
                                    {}).get("n_ranks", 5),
            methods=self.config.get("individual_importance",
                                    {}).get("methods", ["uniform"]),
            replace=self.replace,
        )

        self.evaluator = ModelEvaluator(
            db_engine=self.db_engine,
            sort_seed=self.config.get("scoring", {}).get("sort_seed", None),
            testing_metric_groups=self.config.get("scoring", {}).get(
                "testing_metric_groups", []),
            training_metric_groups=self.config.get("scoring", {}).get(
                "training_metric_groups", []),
        )

        self.model_train_tester = ModelTrainTester(
            matrix_storage_engine=self.matrix_storage_engine,
            model_evaluator=self.evaluator,
            model_trainer=self.trainer,
            individual_importance_calculator=self.
            individual_importance_calculator,
            predictor=self.predictor,
            subsets=self.subsets,
        )
Example #3
0
    def initialize_components(self):
        split_config = self.config["temporal_config"]

        self.chopper = Timechop(**split_config)

        if "label_config" in self.config:
            label_config = self.config["label_config"]
            self.labels_table_name = "labels_{}_{}".format(
                label_config.get("name", "default"),
                filename_friendly_hash(label_config["query"]),
            )
            self.label_generator = LabelGenerator(
                label_name=label_config.get("name", None),
                query=label_config["query"],
                replace=self.replace,
                db_engine=self.db_engine,
            )
        else:
            self.labels_table_name = "labels_{}".format(self.experiment_hash)
            self.label_generator = LabelGeneratorNoOp()
            logger.warning(
                "label_config missing or unrecognized. Without labels, "
                "you will not be able to make matrices."
            )
        record_labels_table_name(self.run_id, self.db_engine, self.labels_table_name)

        cohort_config = self.config.get("cohort_config", {})
        self.cohort_table_generator = None
        if "query" in cohort_config:
            self.cohort_hash = filename_friendly_hash(
                self.config["cohort_config"]["query"]
            )
        elif "query" in self.config.get("label_config", {}):
            logger.info(
                "cohort_config missing or unrecognized, but labels are configured. Labels will be used as the cohort."
            )
            self.cohort_hash = filename_friendly_hash(
                self.config["label_config"]["query"]
            )
        else:
            self.features_ignore_cohort = True
            self.cohort_hash = None
            self.cohort_table_name = "cohort_{}".format(self.experiment_hash)
            self.cohort_table_generator = CohortTableGeneratorNoOp()

        if not self.cohort_table_generator:
            self.cohort_table_name = "cohort_{}_{}".format(
                cohort_config.get("name", "default"), self.cohort_hash
            )
            self.cohort_table_generator = EntityDateTableGenerator(
                entity_date_table_name=self.cohort_table_name,
                db_engine=self.db_engine,
                query=cohort_config.get("query", None),
                labels_table_name=self.labels_table_name,
                replace=self.replace,
            )

        record_cohort_table_name(self.run_id, self.db_engine, self.cohort_table_name)

        if "bias_audit_config" in self.config:
            bias_config = self.config["bias_audit_config"]
            self.bias_hash = filename_friendly_hash(bias_config)
            self.protected_groups_table_name = f"protected_groups_{self.bias_hash}"
            self.protected_groups_generator = ProtectedGroupsGenerator(
                db_engine=self.db_engine,
                from_obj=parse_from_obj(bias_config, "bias_from_obj"),
                attribute_columns=bias_config.get("attribute_columns", None),
                entity_id_column=bias_config.get("entity_id_column", None),
                knowledge_date_column=bias_config.get("knowledge_date_column", None),
                protected_groups_table_name=self.protected_groups_table_name,
                replace=self.replace,
            )
            record_bias_hash(self.run_id, self.db_engine, self.bias_hash)
        else:
            self.protected_groups_generator = ProtectedGroupsGeneratorNoOp()
            logger.notice(
                "bias_audit_config missing in the configuration file or unrecognized. "
                "Without protected groups, you will not be able to audit your models for bias and fairness."
            )

        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name=self.features_schema_name, db_engine=self.db_engine
        )

        self.feature_generator = FeatureGenerator(
            features_schema_name=self.features_schema_name,
            replace=self.replace,
            db_engine=self.db_engine,
            feature_start_time=split_config["feature_start_time"],
            materialize_subquery_fromobjs=self.materialize_subquery_fromobjs,
            features_ignore_cohort=self.features_ignore_cohort,
        )

        self.feature_group_creator = FeatureGroupCreator(
            self.config.get("feature_group_definition", {"all": [True]})
        )

        self.feature_group_mixer = FeatureGroupMixer(
            self.config.get("feature_group_strategies", ["all"])
        )

        self.planner = Planner(
            feature_start_time=dt_from_str(split_config["feature_start_time"]),
            label_names=[
                self.config.get("label_config", {}).get("name", DEFAULT_LABEL_NAME)
            ],
            label_types=["binary"],
            cohort_names=[self.config.get("cohort_config", {}).get("name", None)],
            user_metadata=self.config.get("user_metadata", {}),
        )

        self.matrix_builder = MatrixBuilder(
            db_config={
                "features_schema_name": self.features_schema_name,
                "labels_schema_name": "public",
                "labels_table_name": self.labels_table_name,
                "cohort_table_name": self.cohort_table_name,
            },
            matrix_storage_engine=self.matrix_storage_engine,
            experiment_hash=self.experiment_hash,
            include_missing_labels_in_train_as=self.config.get("label_config", {}).get(
                "include_missing_labels_in_train_as", None
            ),
            engine=self.db_engine,
            replace=self.replace,
            run_id=self.run_id,
        )

        self.subsets = self.config.get("scoring", {}).get("subsets", [])
        if self.subsets:
            self.subsetter = Subsetter(
                db_engine=self.db_engine,
                replace=self.replace,
                as_of_times=self.all_as_of_times,
            )
        else:
            self.subsetter = SubsetterNoOp()
            logger.notice(
                "scoring.subsets missing in the configuration file or unrecognized. No subsets will be generated"
            )

        self.trainer = ModelTrainer(
            experiment_hash=self.experiment_hash,
            model_storage_engine=self.model_storage_engine,
            model_grouper=ModelGrouper(self.config.get("model_group_keys", [])),
            db_engine=self.db_engine,
            replace=self.replace,
            run_id=self.run_id,
        )

        self.predictor = Predictor(
            db_engine=self.db_engine,
            model_storage_engine=self.model_storage_engine,
            save_predictions=self.save_predictions,
            replace=self.replace,
            rank_order=self.config.get("prediction", {}).get(
                "rank_tiebreaker", "worst"
            ),
        )

        if "individual_importance" in self.config:
            self.individual_importance_calculator = IndividualImportanceCalculator(
                db_engine=self.db_engine,
                n_ranks=self.config.get("individual_importance", {}).get("n_ranks", 5),
                methods=self.config.get("individual_importance", {}).get(
                    "methods", ["uniform"]
                ),
                replace=self.replace,
            )
        else:
            self.individual_importance_calculator = IndividualImportanceCalculatorNoOp()
            logger.notice(
                "individual_importance missing in the configuration file or unrecognized, "
                "you will not be able to do analysis on individual feature importances."
            )

        self.evaluator = ModelEvaluator(
            db_engine=self.db_engine,
            testing_metric_groups=self.config.get("scoring", {}).get(
                "testing_metric_groups", []
            ),
            training_metric_groups=self.config.get("scoring", {}).get(
                "training_metric_groups", []
            ),
            bias_config=self.config.get("bias_audit_config", {}),
        )

        self.model_train_tester = ModelTrainTester(
            matrix_storage_engine=self.matrix_storage_engine,
            model_evaluator=self.evaluator,
            model_trainer=self.trainer,
            individual_importance_calculator=self.individual_importance_calculator,
            predictor=self.predictor,
            subsets=self.subsets,
            protected_groups_generator=self.protected_groups_generator,
            cohort_hash=self.cohort_hash,
            replace=self.replace,
            additional_bigtrain_classnames=self.additional_bigtrain_classnames,
        )
Example #4
0
def test_load_labels_data_include_missing_labels_as_false():
    """ Test the load_labels_data function by checking whether the query
    produces the correct labels
    """
    # set up labeling config variables
    dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0),
        datetime.datetime(2016, 6, 1, 0, 0),
    ]

    # same as the other load_labels_data test, except we include an extra date, 2016-06-01
    # this date does have entity 0 included via the states table, but no labels

    # make a dataframe of labels to test against
    labels_df = pd.DataFrame(
        labels,
        columns=[
            "entity_id",
            "as_of_date",
            "label_timespan",
            "label_name",
            "label_type",
            "label",
        ],
    )

    labels_df["as_of_date"] = convert_string_column_to_date(labels_df["as_of_date"])
    labels_df.set_index(["entity_id", "as_of_date"])

    # create an engine and generate a table with fake feature data
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_schemas(engine, features_tables, labels, states)
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
                include_missing_labels_in_train_as=False,
            )

            # make the entity-date table
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=dates,
                label_type="binary",
                label_name="booking",
                state="active",
                matrix_type="train",
                matrix_uuid="my_uuid",
                label_timespan="1 month",
            )

            result = builder.load_labels_data(
                label_name=label_name,
                label_type=label_type,
                label_timespan="1 month",
                matrix_uuid="my_uuid",
                entity_date_table_name=entity_date_table_name,
            )
            df = pd.DataFrame.from_dict(
                {
                    "entity_id": [0, 2, 3, 4, 4],
                    "as_of_date": [dates[2], dates[1], dates[1], dates[0], dates[1]],
                    "booking": [0, 0, 0, 1, 0],
                }
            ).set_index(["entity_id", "as_of_date"])
            # the first row would not be here if we had not configured the Builder
            # to include missing labels as false

            test = result == df
            assert test.all().all()
Example #5
0
    def test_replace_false_rerun(self):
        with testing.postgresql.Postgresql() as postgresql:
            # create an engine and generate a table with fake feature data
            engine = create_engine(postgresql.url())
            ensure_db(engine)
            create_schemas(
                engine=engine,
                features_tables=features_tables,
                labels=labels,
                states=states,
            )

            dates = [
                datetime.datetime(2016, 1, 1, 0, 0),
                datetime.datetime(2016, 2, 1, 0, 0),
                datetime.datetime(2016, 3, 1, 0, 0),
            ]

            with get_matrix_storage_engine() as matrix_storage_engine:
                builder = MatrixBuilder(
                    db_config=db_config,
                    matrix_storage_engine=matrix_storage_engine,
                    experiment_hash=experiment_hash,
                    engine=engine,
                    replace=False,
                )

                feature_dictionary = {
                    "features0": ["f1", "f2"],
                    "features1": ["f3", "f4"],
                }
                matrix_metadata = {
                    "matrix_id": "hi",
                    "state": "active",
                    "label_name": "booking",
                    "end_time": datetime.datetime(2016, 3, 1, 0, 0),
                    "feature_start_time": datetime.datetime(2016, 1, 1, 0, 0),
                    "label_timespan": "1 month",
                    "test_duration": "1 month",
                    "indices": ["entity_id", "as_of_date"],
                }
                uuid = filename_friendly_hash(matrix_metadata)
                builder.build_matrix(
                    as_of_times=dates,
                    label_name="booking",
                    label_type="binary",
                    feature_dictionary=feature_dictionary,
                    matrix_metadata=matrix_metadata,
                    matrix_uuid=uuid,
                    matrix_type="test",
                )

                assert len(matrix_storage_engine.get_store(uuid).design_matrix) == 5
                # rerun
                builder.make_entity_date_table = Mock()
                builder.build_matrix(
                    as_of_times=dates,
                    label_name="booking",
                    label_type="binary",
                    feature_dictionary=feature_dictionary,
                    matrix_metadata=matrix_metadata,
                    matrix_uuid=uuid,
                    matrix_type="test",
                )
                assert not builder.make_entity_date_table.called
Example #6
0
def test_load_features_data():
    dates = [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0)]

    # make dataframe for entity ids and dates
    ids_dates = create_entity_date_df(
        labels=labels,
        states=states,
        as_of_dates=dates,
        label_name="booking",
        label_type="binary",
        label_timespan="1 month",
    )

    features = [["f1", "f2"], ["f3", "f4"]]
    # make dataframes of features to test against
    features_dfs = []
    for i, table in enumerate(features_tables):
        cols = ["entity_id", "as_of_date"] + features[i]
        temp_df = pd.DataFrame(table, columns=cols)
        temp_df["as_of_date"] = convert_string_column_to_date(temp_df["as_of_date"])
        features_dfs.append(
            ids_dates.merge(
                right=temp_df, how="left", on=["entity_id", "as_of_date"]
            ).set_index(["entity_id", "as_of_date"])
        )

    # create an engine and generate a table with fake feature data
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine, features_tables=features_tables, labels=labels, states=states
        )

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
            )

            # make the entity-date table
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=dates,
                label_type="binary",
                label_name="booking",
                state="active",
                matrix_type="train",
                matrix_uuid="my_uuid",
                label_timespan="1 month",
            )

            feature_dictionary = dict(
                ("features{}".format(i), feature_list)
                for i, feature_list in enumerate(features)
            )

            returned_features_dfs = builder.load_features_data(
                as_of_times=dates,
                feature_dictionary=feature_dictionary,
                entity_date_table_name=entity_date_table_name,
                matrix_uuid="my_uuid",
            )

            # get the queries and test them
            for result, df in zip(returned_features_dfs, features_dfs):
                test = result == df
                assert test.all().all()
Example #7
0
def test_load_labels_data():
    """ Test the load_labels_data function by checking whether the query
    produces the correct labels
    """
    # set up labeling config variables
    dates = [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0)]

    # make a dataframe of labels to test against
    labels_df = pd.DataFrame(
        labels,
        columns=[
            "entity_id",
            "as_of_date",
            "label_timespan",
            "label_name",
            "label_type",
            "label",
        ],
    )

    labels_df["as_of_date"] = convert_string_column_to_date(labels_df["as_of_date"])
    labels_df.set_index(["entity_id", "as_of_date"])

    # create an engine and generate a table with fake feature data
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_schemas(engine, features_tables, labels, states)
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
            )

            # make the entity-date table
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=dates,
                label_type="binary",
                label_name="booking",
                state="active",
                matrix_type="train",
                matrix_uuid="my_uuid",
                label_timespan="1 month",
            )

            result = builder.load_labels_data(
                label_name=label_name,
                label_type=label_type,
                label_timespan="1 month",
                matrix_uuid="my_uuid",
                entity_date_table_name=entity_date_table_name,
            )
            df = pd.DataFrame.from_dict(
                {
                    "entity_id": [2, 3, 4, 4],
                    "as_of_date": [dates[1], dates[1], dates[0], dates[1]],
                    "booking": [0, 0, 1, 0],
                }
            ).set_index(["entity_id", "as_of_date"])

            test = result == df
            assert test.all().all()
Example #8
0
def basic_integration_test(state_filters, feature_group_create_rules,
                           feature_group_mix_rules, expected_matrix_multiplier,
                           expected_group_lists):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        Base.metadata.create_all(db_engine)
        populate_source_data(db_engine)

        with TemporaryDirectory() as temp_dir:
            chopper = Timechop(
                feature_start_time=datetime(2010, 1, 1),
                feature_end_time=datetime(2014, 1, 1),
                label_start_time=datetime(2011, 1, 1),
                label_end_time=datetime(2014, 1, 1),
                model_update_frequency='1year',
                training_label_timespans=['6months'],
                test_label_timespans=['6months'],
                training_as_of_date_frequencies='1day',
                test_as_of_date_frequencies='3months',
                max_training_histories=['1months'],
                test_durations=['1months'],
            )

            state_table_generator = StateTableGeneratorFromDense(
                db_engine=db_engine,
                experiment_hash='abcd',
                dense_state_table='states',
            )

            label_generator = LabelGenerator(
                db_engine=db_engine,
                query=sample_config()['label_config']['query'])

            feature_generator = FeatureGenerator(
                db_engine=db_engine,
                features_schema_name='features',
                replace=True,
            )

            feature_dictionary_creator = FeatureDictionaryCreator(
                db_engine=db_engine, features_schema_name='features')

            feature_group_creator = FeatureGroupCreator(
                feature_group_create_rules)

            feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules)
            project_storage = ProjectStorage(temp_dir)
            planner = Planner(
                feature_start_time=datetime(2010, 1, 1),
                label_names=['outcome'],
                label_types=['binary'],
                states=state_filters,
                user_metadata={},
            )

            builder = MatrixBuilder(
                engine=db_engine,
                db_config={
                    'features_schema_name': 'features',
                    'labels_schema_name': 'public',
                    'labels_table_name': 'labels',
                    'sparse_state_table_name': 'tmp_sparse_states_abcd',
                },
                matrix_storage_engine=project_storage.matrix_storage_engine(),
                replace=True)

            # chop time
            split_definitions = chopper.chop_time()
            num_split_matrices = sum(1 + len(split['test_matrices'])
                                     for split in split_definitions)

            # generate as_of_times for feature/label/state generation
            all_as_of_times = []
            for split in split_definitions:
                all_as_of_times.extend(split['train_matrix']['as_of_times'])
                for test_matrix in split['test_matrices']:
                    all_as_of_times.extend(test_matrix['as_of_times'])
            all_as_of_times = list(set(all_as_of_times))

            # generate sparse state table
            state_table_generator.generate_sparse_table(
                as_of_dates=all_as_of_times)

            # create labels table
            label_generator.generate_all_labels(labels_table='labels',
                                                as_of_dates=all_as_of_times,
                                                label_timespans=['6months'])

            # create feature table tasks
            # we would use FeatureGenerator#create_all_tables but want to use
            # the tasks dict directly to create a feature dict
            aggregations = feature_generator.aggregations(
                feature_aggregation_config=[{
                    'prefix':
                    'cat',
                    'from_obj':
                    'cat_complaints',
                    'knowledge_date_column':
                    'as_of_date',
                    'aggregates': [{
                        'quantity': 'cat_sightings',
                        'metrics': ['count', 'avg'],
                        'imputation': {
                            'all': {
                                'type': 'mean'
                            }
                        }
                    }],
                    'intervals': ['1y'],
                    'groups': ['entity_id']
                }, {
                    'prefix':
                    'dog',
                    'from_obj':
                    'dog_complaints',
                    'knowledge_date_column':
                    'as_of_date',
                    'aggregates_imputation': {
                        'count': {
                            'type': 'constant',
                            'value': 7
                        },
                        'sum': {
                            'type': 'mean'
                        },
                        'avg': {
                            'type': 'zero'
                        }
                    },
                    'aggregates': [{
                        'quantity': 'dog_sightings',
                        'metrics': ['count', 'avg'],
                    }],
                    'intervals': ['1y'],
                    'groups': ['entity_id']
                }],
                feature_dates=all_as_of_times,
                state_table=state_table_generator.sparse_table_name)
            feature_table_agg_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type='aggregation')

            # create feature aggregation tables
            feature_generator.process_table_tasks(feature_table_agg_tasks)

            feature_table_imp_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type='imputation')

            # create feature imputation tables
            feature_generator.process_table_tasks(feature_table_imp_tasks)

            # build feature dictionaries from feature tables and
            # subsetting config
            master_feature_dict = feature_dictionary_creator.feature_dictionary(
                feature_table_names=feature_table_imp_tasks.keys(),
                index_column_lookup=feature_generator.index_column_lookup(
                    aggregations))

            feature_dicts = feature_group_mixer.generate(
                feature_group_creator.subsets(master_feature_dict))

            # figure out what matrices need to be built
            _, matrix_build_tasks =\
                planner.generate_plans(
                    split_definitions,
                    feature_dicts
                )

            # go and build the matrices
            builder.build_all_matrices(matrix_build_tasks)

            # super basic assertion: did matrices we expect get created?
            matrices_records = list(
                db_engine.execute(
                    '''select matrix_uuid, num_observations, matrix_type
                    from model_metadata.matrices
                    '''))
            matrix_directory = os.path.join(temp_dir, 'matrices')
            matrices = [
                path for path in os.listdir(matrix_directory) if '.csv' in path
            ]
            metadatas = [
                path for path in os.listdir(matrix_directory)
                if '.yaml' in path
            ]
            assert len(matrices) == num_split_matrices * \
                expected_matrix_multiplier
            assert len(metadatas) == num_split_matrices * \
                expected_matrix_multiplier
            assert len(matrices) == len(matrices_records)
            feature_group_name_lists = []
            for metadata_path in metadatas:
                with open(os.path.join(matrix_directory, metadata_path)) as f:
                    metadata = yaml.load(f)
                    feature_group_name_lists.append(metadata['feature_groups'])

            for matrix_uuid, num_observations, matrix_type in matrices_records:
                assert matrix_uuid in matrix_build_tasks  #the hashes of the matrices
                assert type(num_observations) is int
                assert matrix_type == matrix_build_tasks[matrix_uuid][
                    'matrix_type']

            def deep_unique_tuple(l):
                return set([tuple(i) for i in l])

            assert deep_unique_tuple(
                feature_group_name_lists) == deep_unique_tuple(
                    expected_group_lists)
Example #9
0
def test_make_entity_date_table_include_missing_labels():
    """ Test that the make_entity_date_table function contains the correct
    values.
    """
    dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0),
        datetime.datetime(2016, 3, 1, 0, 0),
        datetime.datetime(2016, 6, 1, 0, 0),
    ]

    # same as the other make_entity_date_label test except there is an extra date, 2016-06-01
    # entity 0 is included in this date via the states table, but has no label

    # make a dataframe of entity ids and dates to test against
    ids_dates = create_entity_date_df(
        labels=labels,
        states=states,
        as_of_dates=dates,
        label_name="booking",
        label_type="binary",
        label_timespan="1 month",
    )
    # this line adds the new entity-date combo as an expected one
    ids_dates = ids_dates.append(
        {"entity_id": 0, "as_of_date": datetime.date(2016, 6, 1)}, ignore_index=True
    )

    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine, features_tables=features_tables, labels=labels, states=states
        )

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                include_missing_labels_in_train_as=False,
                engine=engine,
            )
            engine.execute("CREATE TABLE features.tmp_entity_date (a int, b date);")
            # call the function to test the creation of the table
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=dates,
                label_type="binary",
                label_name="booking",
                state="active",
                matrix_uuid="my_uuid",
                matrix_type="train",
                label_timespan="1 month",
            )

            # read in the table
            result = pd.read_sql(
                "select * from features.{} order by entity_id, as_of_date".format(
                    entity_date_table_name
                ),
                engine,
            )

            # compare the table to the test dataframe
            assert sorted(result.values.tolist()) == sorted(ids_dates.values.tolist())
Example #10
0
def predict_forward_with_existed_model(db_engine, project_path, model_id,
                                       as_of_date):
    """Predict forward given model_id and as_of_date and store the prediction in database

    Args:
            db_engine (sqlalchemy.db.engine)
            project_storage (catwalk.storage.ProjectStorage)
            model_id (int) The id of a given model in the database
            as_of_date (string) a date string like "YYYY-MM-DD"
    """
    logger.spam("In PREDICT LIST................")
    upgrade_db(db_engine=db_engine)
    project_storage = ProjectStorage(project_path)
    matrix_storage_engine = project_storage.matrix_storage_engine()
    # 1. Get feature and cohort config from database
    (train_matrix_uuid,
     matrix_metadata) = train_matrix_info_from_model_id(db_engine, model_id)
    experiment_config = experiment_config_from_model_id(db_engine, model_id)

    # 2. Generate cohort
    cohort_table_name = f"triage_production.cohort_{experiment_config['cohort_config']['name']}"
    cohort_table_generator = EntityDateTableGenerator(
        db_engine=db_engine,
        query=experiment_config['cohort_config']['query'],
        entity_date_table_name=cohort_table_name)
    cohort_table_generator.generate_entity_date_table(
        as_of_dates=[dt_from_str(as_of_date)])

    # 3. Generate feature aggregations
    feature_generator = FeatureGenerator(
        db_engine=db_engine,
        features_schema_name="triage_production",
        feature_start_time=experiment_config['temporal_config']
        ['feature_start_time'],
    )
    collate_aggregations = feature_generator.aggregations(
        feature_aggregation_config=experiment_config['feature_aggregations'],
        feature_dates=[as_of_date],
        state_table=cohort_table_name)
    feature_generator.process_table_tasks(
        feature_generator.generate_all_table_tasks(collate_aggregations,
                                                   task_type='aggregation'))

    # 4. Reconstruct feature disctionary from feature_names and generate imputation

    reconstructed_feature_dict = FeatureGroup()
    imputation_table_tasks = OrderedDict()

    for aggregation in collate_aggregations:
        feature_group, feature_names = get_feature_names(
            aggregation, matrix_metadata)
        reconstructed_feature_dict[feature_group] = feature_names

        # Make sure that the features imputed in training should also be imputed in production

        features_imputed_in_train = get_feature_needs_imputation_in_train(
            aggregation, feature_names)

        features_imputed_in_production = get_feature_needs_imputation_in_production(
            aggregation, db_engine)

        total_impute_cols = set(features_imputed_in_production) | set(
            features_imputed_in_train)
        total_nonimpute_cols = set(f for f in set(feature_names)
                                   if '_imp' not in f) - total_impute_cols

        task_generator = feature_generator._generate_imp_table_tasks_for

        imputation_table_tasks.update(
            task_generator(aggregation,
                           impute_cols=list(total_impute_cols),
                           nonimpute_cols=list(total_nonimpute_cols)))
    feature_generator.process_table_tasks(imputation_table_tasks)

    # 5. Build matrix
    db_config = {
        "features_schema_name": "triage_production",
        "labels_schema_name": "public",
        "cohort_table_name": cohort_table_name,
    }

    matrix_builder = MatrixBuilder(
        db_config=db_config,
        matrix_storage_engine=matrix_storage_engine,
        engine=db_engine,
        experiment_hash=None,
        replace=True,
    )

    feature_start_time = experiment_config['temporal_config'][
        'feature_start_time']
    label_name = experiment_config['label_config']['name']
    label_type = 'binary'
    cohort_name = experiment_config['cohort_config']['name']
    user_metadata = experiment_config['user_metadata']

    # Use timechop to get the time definition for production
    temporal_config = experiment_config["temporal_config"]
    temporal_config.update(
        temporal_params_from_matrix_metadata(db_engine, model_id))
    timechopper = Timechop(**temporal_config)
    prod_definitions = timechopper.define_test_matrices(
        train_test_split_time=dt_from_str(as_of_date),
        test_duration=temporal_config['test_durations'][0],
        test_label_timespan=temporal_config['test_label_timespans'][0])

    matrix_metadata = Planner.make_metadata(
        prod_definitions[-1],
        reconstructed_feature_dict,
        label_name,
        label_type,
        cohort_name,
        'production',
        feature_start_time,
        user_metadata,
    )

    matrix_metadata['matrix_id'] = str(
        as_of_date) + f'_model_id_{model_id}' + '_risklist'

    matrix_uuid = filename_friendly_hash(matrix_metadata)

    matrix_builder.build_matrix(
        as_of_times=[as_of_date],
        label_name=label_name,
        label_type=label_type,
        feature_dictionary=reconstructed_feature_dict,
        matrix_metadata=matrix_metadata,
        matrix_uuid=matrix_uuid,
        matrix_type="production",
    )

    # 6. Predict the risk score for production
    predictor = Predictor(
        model_storage_engine=project_storage.model_storage_engine(),
        db_engine=db_engine,
        rank_order='best')

    predictor.predict(
        model_id=model_id,
        matrix_store=matrix_storage_engine.get_store(matrix_uuid),
        misc_db_parameters={},
        train_matrix_columns=matrix_storage_engine.get_store(
            train_matrix_uuid).columns())
Example #11
0
    def predict(self, prediction_date):
        """Predict forward by creating a matrix using as_of_date = prediction_date and applying the retrain model on it

        Args:
            prediction_date(str)
        """
        cohort_table_name = f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_predict"

        # 1. Generate cohort
        self.generate_entity_date_table(prediction_date, cohort_table_name)

        # 2. Generate feature aggregations
        collate_aggregations = self.get_collate_aggregations(
            prediction_date, cohort_table_name)
        self.feature_generator.process_table_tasks(
            self.feature_generator.generate_all_table_tasks(
                collate_aggregations, task_type='aggregation'))
        # 3. Reconstruct feature disctionary from feature_names and generate imputation
        reconstructed_feature_dict, imputation_table_tasks = self.get_feature_dict_and_imputation_task(
            collate_aggregations, self.retrain_model_id)
        self.feature_generator.process_table_tasks(imputation_table_tasks)

        # 4. Build matrix
        db_config = {
            "features_schema_name": "triage_production",
            "labels_schema_name": "public",
            "cohort_table_name": cohort_table_name,
        }

        matrix_builder = MatrixBuilder(
            db_config=db_config,
            matrix_storage_engine=self.matrix_storage_engine,
            engine=self.db_engine,
            experiment_hash=None,
            replace=True,
        )
        # Use timechop to get the time definition for production
        temporal_config = self.get_temporal_config_for_retrain(
            dt_from_str(prediction_date))
        timechopper = Timechop(**temporal_config)

        retrain_config = get_retrain_config_from_model_id(
            self.db_engine, self.retrain_model_id)

        prod_definitions = timechopper.define_test_matrices(
            train_test_split_time=dt_from_str(prediction_date),
            test_duration=retrain_config['test_duration'],
            test_label_timespan=retrain_config['test_label_timespan'])
        last_split_definition = prod_definitions[-1]
        matrix_metadata = Planner.make_metadata(
            matrix_definition=last_split_definition,
            feature_dictionary=reconstructed_feature_dict,
            label_name=self.label_name,
            label_type='binary',
            cohort_name=self.cohort_name,
            matrix_type='production',
            feature_start_time=self.feature_start_time,
            user_metadata=self.user_metadata,
        )

        matrix_metadata['matrix_id'] = str(
            prediction_date
        ) + f'_model_id_{self.retrain_model_id}' + '_risklist'

        matrix_uuid = filename_friendly_hash(matrix_metadata)

        matrix_builder.build_matrix(
            as_of_times=[prediction_date],
            label_name=self.label_name,
            label_type='binary',
            feature_dictionary=reconstructed_feature_dict,
            matrix_metadata=matrix_metadata,
            matrix_uuid=matrix_uuid,
            matrix_type="production",
        )

        # 5. Predict the risk score for production
        predictor = Predictor(
            model_storage_engine=self.project_storage.model_storage_engine(),
            db_engine=self.db_engine,
            rank_order='best')

        predictor.predict(
            model_id=self.retrain_model_id,
            matrix_store=self.matrix_storage_engine.get_store(matrix_uuid),
            misc_db_parameters={},
            train_matrix_columns=self.matrix_storage_engine.get_store(
                self.retrain_matrix_uuid).columns(),
        )
        self.predict_matrix_uuid = matrix_uuid
Example #12
0
    def retrain(self, prediction_date):
        """Retrain a model by going back one split from prediction_date, so the as_of_date for training would be (prediction_date - training_label_timespan)
        
        Args:
            prediction_date(str) 
        """
        # Retrain config and hash
        retrain_config = {
            "model_group_id": self.model_group_id,
            "prediction_date": prediction_date,
            "test_label_timespan": self.test_label_timespan,
            "test_duration": self.test_duration,
        }
        self.retrain_hash = save_retrain_and_get_hash(retrain_config,
                                                      self.db_engine)

        with get_for_update(self.db_engine, Retrain,
                            self.retrain_hash) as retrain:
            retrain.prediction_date = prediction_date

        # Timechop
        prediction_date = dt_from_str(prediction_date)
        temporal_config = self.get_temporal_config_for_retrain(prediction_date)
        timechopper = Timechop(**temporal_config)
        chops = timechopper.chop_time()
        assert len(chops) == 1
        chops_train_matrix = chops[0]['train_matrix']
        as_of_date = datetime.strftime(chops_train_matrix['last_as_of_time'],
                                       "%Y-%m-%d")
        retrain_definition = {
            'first_as_of_time':
            chops_train_matrix['first_as_of_time'],
            'last_as_of_time':
            chops_train_matrix['last_as_of_time'],
            'matrix_info_end_time':
            chops_train_matrix['matrix_info_end_time'],
            'as_of_times': [as_of_date],
            'training_label_timespan':
            chops_train_matrix['training_label_timespan'],
            'max_training_history':
            chops_train_matrix['max_training_history'],
            'training_as_of_date_frequency':
            chops_train_matrix['training_as_of_date_frequency'],
        }

        # Set ExperimentRun
        run = TriageRun(
            start_time=datetime.now(),
            git_hash=infer_git_hash(),
            triage_version=infer_triage_version(),
            python_version=infer_python_version(),
            run_type="retrain",
            run_hash=self.retrain_hash,
            last_updated_time=datetime.now(),
            current_status=TriageRunStatus.started,
            installed_libraries=infer_installed_libraries(),
            platform=platform.platform(),
            os_user=getpass.getuser(),
            working_directory=os.getcwd(),
            ec2_instance_type=infer_ec2_instance_type(),
            log_location=infer_log_location(),
            experiment_class_path=classpath(self.__class__),
            random_seed=retrieve_experiment_seed_from_run_id(
                self.db_engine, self.triage_run_id),
        )
        run_id = None
        with scoped_session(self.db_engine) as session:
            session.add(run)
            session.commit()
            run_id = run.run_id
        if not run_id:
            raise ValueError("Failed to retrieve run_id from saved row")

        # set ModelTrainer's run_id and experiment_hash for Retrain run
        self.model_trainer.run_id = run_id
        self.model_trainer.experiment_hash = self.retrain_hash

        # 1. Generate all labels
        self.generate_all_labels(as_of_date)
        record_labels_table_name(run_id, self.db_engine,
                                 self.labels_table_name)

        # 2. Generate cohort
        cohort_table_name = f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_retrain"
        self.generate_entity_date_table(as_of_date, cohort_table_name)
        record_cohort_table_name(run_id, self.db_engine, cohort_table_name)

        # 3. Generate feature aggregations
        collate_aggregations = self.get_collate_aggregations(
            as_of_date, cohort_table_name)
        feature_aggregation_table_tasks = self.feature_generator.generate_all_table_tasks(
            collate_aggregations, task_type='aggregation')
        self.feature_generator.process_table_tasks(
            feature_aggregation_table_tasks)

        # 4. Reconstruct feature disctionary from feature_names and generate imputation
        reconstructed_feature_dict, imputation_table_tasks = self.get_feature_dict_and_imputation_task(
            collate_aggregations,
            self.model_group_info['model_id_last_split'],
        )
        feature_group_creator = FeatureGroupCreator(
            self.experiment_config['feature_group_definition'])
        feature_group_mixer = FeatureGroupMixer(["all"])
        feature_group_dict = feature_group_mixer.generate(
            feature_group_creator.subsets(reconstructed_feature_dict))[0]
        self.feature_generator.process_table_tasks(imputation_table_tasks)
        # 5. Build new matrix
        db_config = {
            "features_schema_name": "triage_production",
            "labels_schema_name": "public",
            "cohort_table_name": cohort_table_name,
            "labels_table_name": self.labels_table_name,
        }

        record_matrix_building_started(run_id, self.db_engine)
        matrix_builder = MatrixBuilder(
            db_config=db_config,
            matrix_storage_engine=self.matrix_storage_engine,
            engine=self.db_engine,
            experiment_hash=None,
            replace=True,
        )
        new_matrix_metadata = Planner.make_metadata(
            matrix_definition=retrain_definition,
            feature_dictionary=feature_group_dict,
            label_name=self.label_name,
            label_type='binary',
            cohort_name=self.cohort_name,
            matrix_type='train',
            feature_start_time=dt_from_str(self.feature_start_time),
            user_metadata=self.user_metadata,
        )

        new_matrix_metadata['matrix_id'] = "_".join([
            self.label_name,
            'binary',
            str(as_of_date),
            'retrain',
        ])

        matrix_uuid = filename_friendly_hash(new_matrix_metadata)
        matrix_builder.build_matrix(
            as_of_times=[as_of_date],
            label_name=self.label_name,
            label_type='binary',
            feature_dictionary=feature_group_dict,
            matrix_metadata=new_matrix_metadata,
            matrix_uuid=matrix_uuid,
            matrix_type="train",
        )
        retrain_model_comment = 'retrain_' + str(datetime.now())

        misc_db_parameters = {
            'train_end_time': dt_from_str(as_of_date),
            'test': False,
            'train_matrix_uuid': matrix_uuid,
            'training_label_timespan': self.training_label_timespan,
            'model_comment': retrain_model_comment,
        }

        # get the random seed from the last split
        last_split_train_matrix_uuid, last_split_matrix_metadata = train_matrix_info_from_model_id(
            self.db_engine,
            model_id=self.model_group_info['model_id_last_split'])

        random_seed = self.model_trainer.get_or_generate_random_seed(
            model_group_id=self.model_group_id,
            matrix_metadata=last_split_matrix_metadata,
            train_matrix_uuid=last_split_train_matrix_uuid)

        # create retrain model hash
        retrain_model_hash = self.model_trainer._model_hash(
            self.matrix_storage_engine.get_store(matrix_uuid).metadata,
            class_path=self.model_group_info['model_type'],
            parameters=self.model_group_info['hyperparameters'],
            random_seed=random_seed,
        )

        associate_models_with_retrain(self.retrain_hash,
                                      (retrain_model_hash, ), self.db_engine)

        record_model_building_started(run_id, self.db_engine)
        retrain_model_id = self.model_trainer.process_train_task(
            matrix_store=self.matrix_storage_engine.get_store(matrix_uuid),
            class_path=self.model_group_info['model_type'],
            parameters=self.model_group_info['hyperparameters'],
            model_hash=retrain_model_hash,
            misc_db_parameters=misc_db_parameters,
            random_seed=random_seed,
            retrain=True,
            model_group_id=self.model_group_id)

        self.retrain_model_hash = retrieve_model_hash_from_id(
            self.db_engine, retrain_model_id)
        self.retrain_matrix_uuid = matrix_uuid
        self.retrain_model_id = retrain_model_id
        return {
            'retrain_model_comment': retrain_model_comment,
            'retrain_model_id': retrain_model_id
        }
Example #13
0
def basic_integration_test(
    cohort_names,
    feature_group_create_rules,
    feature_group_mix_rules,
    expected_matrix_multiplier,
    expected_group_lists,
):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        Base.metadata.create_all(db_engine)
        populate_source_data(db_engine)

        with TemporaryDirectory() as temp_dir:
            chopper = Timechop(
                feature_start_time=datetime(2010, 1, 1),
                feature_end_time=datetime(2014, 1, 1),
                label_start_time=datetime(2011, 1, 1),
                label_end_time=datetime(2014, 1, 1),
                model_update_frequency="1year",
                training_label_timespans=["6months"],
                test_label_timespans=["6months"],
                training_as_of_date_frequencies="1day",
                test_as_of_date_frequencies="3months",
                max_training_histories=["1months"],
                test_durations=["1months"],
            )

            cohort_table_generator = CohortTableGenerator(
                db_engine=db_engine,
                cohort_table_name="cohort_abcd",
                query="select distinct(entity_id) from events"
            )

            label_generator = LabelGenerator(
                db_engine=db_engine, query=sample_config()["label_config"]["query"]
            )

            feature_generator = FeatureGenerator(
                db_engine=db_engine, features_schema_name="features", replace=True
            )

            feature_dictionary_creator = FeatureDictionaryCreator(
                db_engine=db_engine, features_schema_name="features"
            )

            feature_group_creator = FeatureGroupCreator(feature_group_create_rules)

            feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules)
            project_storage = ProjectStorage(temp_dir)
            planner = Planner(
                feature_start_time=datetime(2010, 1, 1),
                label_names=["outcome"],
                label_types=["binary"],
                cohort_names=cohort_names,
                user_metadata={},
            )

            builder = MatrixBuilder(
                engine=db_engine,
                db_config={
                    "features_schema_name": "features",
                    "labels_schema_name": "public",
                    "labels_table_name": "labels",
                    "cohort_table_name": "cohort_abcd",
                },
                experiment_hash=None,
                matrix_storage_engine=project_storage.matrix_storage_engine(),
                replace=True,
            )

            # chop time
            split_definitions = chopper.chop_time()
            num_split_matrices = sum(
                1 + len(split["test_matrices"]) for split in split_definitions
            )

            # generate as_of_times for feature/label/state generation
            all_as_of_times = []
            for split in split_definitions:
                all_as_of_times.extend(split["train_matrix"]["as_of_times"])
                for test_matrix in split["test_matrices"]:
                    all_as_of_times.extend(test_matrix["as_of_times"])
            all_as_of_times = list(set(all_as_of_times))

            # generate cohort state table
            cohort_table_generator.generate_cohort_table(as_of_dates=all_as_of_times)

            # create labels table
            label_generator.generate_all_labels(
                labels_table="labels",
                as_of_dates=all_as_of_times,
                label_timespans=["6months"],
            )

            # create feature table tasks
            # we would use FeatureGenerator#create_all_tables but want to use
            # the tasks dict directly to create a feature dict
            aggregations = feature_generator.aggregations(
                feature_aggregation_config=[
                    {
                        "prefix": "cat",
                        "from_obj": "cat_complaints",
                        "knowledge_date_column": "as_of_date",
                        "aggregates": [
                            {
                                "quantity": "cat_sightings",
                                "metrics": ["count", "avg"],
                                "imputation": {"all": {"type": "mean"}},
                            }
                        ],
                        "intervals": ["1y"],
                        "groups": ["entity_id"],
                    },
                    {
                        "prefix": "dog",
                        "from_obj": "dog_complaints",
                        "knowledge_date_column": "as_of_date",
                        "aggregates_imputation": {
                            "count": {"type": "constant", "value": 7},
                            "sum": {"type": "mean"},
                            "avg": {"type": "zero"},
                        },
                        "aggregates": [
                            {"quantity": "dog_sightings", "metrics": ["count", "avg"]}
                        ],
                        "intervals": ["1y"],
                        "groups": ["entity_id"],
                    },
                ],
                feature_dates=all_as_of_times,
                state_table=cohort_table_generator.cohort_table_name,
            )
            feature_table_agg_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type="aggregation"
            )

            # create feature aggregation tables
            feature_generator.process_table_tasks(feature_table_agg_tasks)

            feature_table_imp_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type="imputation"
            )

            # create feature imputation tables
            feature_generator.process_table_tasks(feature_table_imp_tasks)

            # build feature dictionaries from feature tables and
            # subsetting config
            master_feature_dict = feature_dictionary_creator.feature_dictionary(
                feature_table_names=feature_table_imp_tasks.keys(),
                index_column_lookup=feature_generator.index_column_lookup(aggregations),
            )

            feature_dicts = feature_group_mixer.generate(
                feature_group_creator.subsets(master_feature_dict)
            )

            # figure out what matrices need to be built
            _, matrix_build_tasks = planner.generate_plans(
                split_definitions, feature_dicts
            )

            # go and build the matrices
            builder.build_all_matrices(matrix_build_tasks)

            # super basic assertion: did matrices we expect get created?
            matrices_records = list(
                db_engine.execute(
                    """select matrix_uuid, num_observations, matrix_type
                    from model_metadata.matrices
                    """
                )
            )
            matrix_directory = os.path.join(temp_dir, "matrices")
            matrices = [path for path in os.listdir(matrix_directory) if ".csv" in path]
            metadatas = [
                path for path in os.listdir(matrix_directory) if ".yaml" in path
            ]
            assert len(matrices) == num_split_matrices * expected_matrix_multiplier
            assert len(metadatas) == num_split_matrices * expected_matrix_multiplier
            assert len(matrices) == len(matrices_records)
            feature_group_name_lists = []
            for metadata_path in metadatas:
                with open(os.path.join(matrix_directory, metadata_path)) as f:
                    metadata = yaml.load(f)
                    feature_group_name_lists.append(metadata["feature_groups"])

            for matrix_uuid, num_observations, matrix_type in matrices_records:
                assert matrix_uuid in matrix_build_tasks  # the hashes of the matrices
                assert type(num_observations) is int
                assert matrix_type == matrix_build_tasks[matrix_uuid]["matrix_type"]

            def deep_unique_tuple(l):
                return set([tuple(i) for i in l])

            assert deep_unique_tuple(feature_group_name_lists) == deep_unique_tuple(
                expected_group_lists
            )
Example #14
0
def test_load_features_data():
    dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0)
    ]

    # make dataframe for entity ids and dates
    ids_dates = create_entity_date_df(labels=labels,
                                      states=states,
                                      as_of_dates=dates,
                                      state_one=True,
                                      state_two=True,
                                      label_name='booking',
                                      label_type='binary',
                                      label_timespan='1 month')

    features = [['f1', 'f2'], ['f3', 'f4']]
    # make dataframes of features to test against
    features_dfs = []
    for i, table in enumerate(features_tables):
        cols = ['entity_id', 'as_of_date'] + features[i]
        temp_df = pd.DataFrame(table, columns=cols)
        temp_df['as_of_date'] = convert_string_column_to_date(
            temp_df['as_of_date'])
        features_dfs.append(
            ids_dates.merge(right=temp_df,
                            how='left',
                            on=['entity_id', 'as_of_date'
                                ]).set_index(['entity_id', 'as_of_date']))

    # create an engine and generate a table with fake feature data
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_schemas(engine=engine,
                       features_tables=features_tables,
                       labels=labels,
                       states=states)

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                engine=engine,
            )

            # make the entity-date table
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=dates,
                label_type='binary',
                label_name='booking',
                state='state_one AND state_two',
                matrix_type='train',
                matrix_uuid='my_uuid',
                label_timespan='1 month')

            feature_dictionary = dict(
                ('features{}'.format(i), feature_list)
                for i, feature_list in enumerate(features))

            returned_features_dfs = builder.load_features_data(
                as_of_times=dates,
                feature_dictionary=feature_dictionary,
                entity_date_table_name=entity_date_table_name,
                matrix_uuid='my_uuid')

            # get the queries and test them
            for result, df in zip(returned_features_dfs, features_dfs):
                test = (result == df)
                assert (test.all().all())