Example #1
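Both examples assume imports roughly like the following sketch. The Triage module paths are taken from the project's test suite and may differ between Triage versions; populate_source_data and sample_config are helpers from that same test suite, not library APIs.

import os
from datetime import datetime
from tempfile import TemporaryDirectory

import testing.postgresql
import yaml
from sqlalchemy import create_engine

# Triage components; exact module paths may vary by version.
from triage.component.timechop import Timechop
from triage.component.architect.builders import MatrixBuilder
from triage.component.architect.entity_date_table_generators import (
    EntityDateTableGenerator,
)
from triage.component.architect.feature_group_creator import FeatureGroupCreator
from triage.component.architect.feature_group_mixer import FeatureGroupMixer
from triage.component.architect.features import (
    FeatureDictionaryCreator,
    FeatureGenerator,
)
from triage.component.architect.label_generators import LabelGenerator
from triage.component.architect.planner import Planner
from triage.component.catwalk.storage import ProjectStorage
from triage.component.results_schema import Base

from tests.utils import populate_source_data, sample_config  # test helpers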
def basic_integration_test(
    cohort_names,
    feature_group_create_rules,
    feature_group_mix_rules,
    expected_matrix_multiplier,
    expected_group_lists,
):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        Base.metadata.create_all(db_engine)
        populate_source_data(db_engine)

        with TemporaryDirectory() as temp_dir:
            chopper = Timechop(
                feature_start_time=datetime(2010, 1, 1),
                feature_end_time=datetime(2014, 1, 1),
                label_start_time=datetime(2011, 1, 1),
                label_end_time=datetime(2014, 1, 1),
                model_update_frequency="1year",
                training_label_timespans=["6months"],
                test_label_timespans=["6months"],
                training_as_of_date_frequencies="1day",
                test_as_of_date_frequencies="3months",
                max_training_histories=["1months"],
                test_durations=["1months"],
            )

            entity_date_table_generator = EntityDateTableGenerator(
                db_engine=db_engine,
                entity_date_table_name="cohort_abcd",
                query="select distinct(entity_id) from events")

            label_generator = LabelGenerator(
                db_engine=db_engine,
                query=sample_config()["label_config"]["query"])

            feature_generator = FeatureGenerator(
                db_engine=db_engine,
                features_schema_name="features",
                replace=True)

            feature_dictionary_creator = FeatureDictionaryCreator(
                db_engine=db_engine, features_schema_name="features")

            feature_group_creator = FeatureGroupCreator(
                feature_group_create_rules)

            feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules)
            project_storage = ProjectStorage(temp_dir)
            planner = Planner(
                feature_start_time=datetime(2010, 1, 1),
                label_names=["outcome"],
                label_types=["binary"],
                cohort_names=cohort_names,
                user_metadata={},
            )

            builder = MatrixBuilder(
                engine=db_engine,
                db_config={
                    "features_schema_name": "features",
                    "labels_schema_name": "public",
                    "labels_table_name": "labels",
                    "cohort_table_name": "cohort_abcd",
                },
                experiment_hash=None,
                matrix_storage_engine=project_storage.matrix_storage_engine(),
                replace=True,
            )

            # chop time
            split_definitions = chopper.chop_time()
            num_split_matrices = sum(1 + len(split["test_matrices"])
                                     for split in split_definitions)

            # generate as_of_times for feature/label/state generation
            all_as_of_times = []
            for split in split_definitions:
                all_as_of_times.extend(split["train_matrix"]["as_of_times"])
                for test_matrix in split["test_matrices"]:
                    all_as_of_times.extend(test_matrix["as_of_times"])
            all_as_of_times = list(set(all_as_of_times))

            # generate entity_date state table
            entity_date_table_generator.generate_entity_date_table(
                as_of_dates=all_as_of_times)

            # create labels table
            label_generator.generate_all_labels(
                labels_table="labels",
                as_of_dates=all_as_of_times,
                label_timespans=["6months"],
            )

            # create feature table tasks
            # we would use FeatureGenerator#create_all_tables but want to use
            # the tasks dict directly to create a feature dict
            aggregations = feature_generator.aggregations(
                feature_aggregation_config=[
                    {
                        "prefix":
                        "cat",
                        "from_obj":
                        "cat_complaints",
                        "knowledge_date_column":
                        "as_of_date",
                        "aggregates": [{
                            "quantity": "cat_sightings",
                            "metrics": ["count", "avg"],
                            "imputation": {
                                "all": {
                                    "type": "mean"
                                }
                            },
                        }],
                        "intervals": ["1y"],
                        "groups": ["entity_id"],
                    },
                    {
                        "prefix":
                        "dog",
                        "from_obj":
                        "dog_complaints",
                        "knowledge_date_column":
                        "as_of_date",
                        "aggregates_imputation": {
                            "count": {
                                "type": "constant",
                                "value": 7
                            },
                            "sum": {
                                "type": "mean"
                            },
                            "avg": {
                                "type": "zero"
                            },
                        },
                        "aggregates": [{
                            "quantity": "dog_sightings",
                            "metrics": ["count", "avg"]
                        }],
                        "intervals": ["1y"],
                        "groups": ["entity_id"],
                    },
                ],
                feature_dates=all_as_of_times,
                state_table=entity_date_table_generator.entity_date_table_name,
            )
            feature_table_agg_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type="aggregation")

            # create feature aggregation tables
            feature_generator.process_table_tasks(feature_table_agg_tasks)

            feature_table_imp_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type="imputation")

            # create feature imputation tables
            feature_generator.process_table_tasks(feature_table_imp_tasks)

            # build feature dictionaries from feature tables and
            # subsetting config
            master_feature_dict = feature_dictionary_creator.feature_dictionary(
                feature_table_names=feature_table_imp_tasks.keys(),
                index_column_lookup=feature_generator.index_column_lookup(
                    aggregations),
            )

            feature_dicts = feature_group_mixer.generate(
                feature_group_creator.subsets(master_feature_dict))

            # figure out what matrices need to be built
            _, matrix_build_tasks = planner.generate_plans(
                split_definitions, feature_dicts)

            # go and build the matrices
            builder.build_all_matrices(matrix_build_tasks)

            # super basic assertion: did matrices we expect get created?
            matrices_records = list(
                db_engine.execute(
                    """select matrix_uuid, num_observations, matrix_type
                    from triage_metadata.matrices
                    """))
            matrix_directory = os.path.join(temp_dir, "matrices")
            matrices = [
                path for path in os.listdir(matrix_directory) if ".csv" in path
            ]
            metadatas = [
                path for path in os.listdir(matrix_directory)
                if ".yaml" in path
            ]
            assert len(matrices) == num_split_matrices * \
                expected_matrix_multiplier
            assert len(metadatas) == num_split_matrices * \
                expected_matrix_multiplier
            assert len(matrices) == len(matrices_records)
            feature_group_name_lists = []
            for metadata_path in metadatas:
                with open(os.path.join(matrix_directory, metadata_path)) as f:
                    metadata = yaml.full_load(f)
                    feature_group_name_lists.append(metadata["feature_groups"])

            for matrix_uuid, num_observations, matrix_type in matrices_records:
                assert matrix_uuid in matrix_build_tasks  # the hashes of the matrices
                assert type(num_observations) is int
                assert matrix_type == matrix_build_tasks[matrix_uuid][
                    "matrix_type"]

            def deep_unique_tuple(list_of_lists):
                return {tuple(i) for i in list_of_lists}

            assert deep_unique_tuple(
                feature_group_name_lists) == deep_unique_tuple(
                    expected_group_lists)
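A hypothetical invocation of this test, for illustration only: the rule values, the multiplier, and the group-name strings below are made up and depend on the feature config inside the test and on how FeatureGroupCreator and FeatureGroupMixer name their groups.

def test_feature_groups_example():
    basic_integration_test(
        cohort_names=["mycohort"],  # hypothetical cohort name
        feature_group_create_rules={"prefix": ["cat", "dog"]},
        feature_group_mix_rules=["all"],
        # the "all" mix rule yields a single feature dict, so one matrix
        # per train/test matrix definition
        expected_matrix_multiplier=1,
        expected_group_lists=[["prefix: cat", "prefix: dog"]],  # assumed naming
    )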
Example #2
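This variant targets an older Triage API: cohorts are expressed as state filters over a dense states table, a temporary sparse state table stands in for the entity-date cohort table, and matrix records live in the model_metadata schema rather than triage_metadata. On top of the import sketch above it would need one more import (path assumed from older Triage releases):

from triage.component.architect.state_table_generators import (
    StateTableGeneratorFromDense,
)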
def basic_integration_test(state_filters, feature_group_create_rules,
                           feature_group_mix_rules, expected_matrix_multiplier,
                           expected_group_lists):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        Base.metadata.create_all(db_engine)
        populate_source_data(db_engine)

        with TemporaryDirectory() as temp_dir:
            chopper = Timechop(
                feature_start_time=datetime(2010, 1, 1),
                feature_end_time=datetime(2014, 1, 1),
                label_start_time=datetime(2011, 1, 1),
                label_end_time=datetime(2014, 1, 1),
                model_update_frequency='1year',
                training_label_timespans=['6months'],
                test_label_timespans=['6months'],
                training_as_of_date_frequencies='1day',
                test_as_of_date_frequencies='3months',
                max_training_histories=['1months'],
                test_durations=['1months'],
            )

            state_table_generator = StateTableGeneratorFromDense(
                db_engine=db_engine,
                experiment_hash='abcd',
                dense_state_table='states',
            )

            label_generator = LabelGenerator(
                db_engine=db_engine,
                query=sample_config()['label_config']['query'])

            feature_generator = FeatureGenerator(
                db_engine=db_engine,
                features_schema_name='features',
                replace=True,
            )

            feature_dictionary_creator = FeatureDictionaryCreator(
                db_engine=db_engine, features_schema_name='features')

            feature_group_creator = FeatureGroupCreator(
                feature_group_create_rules)

            feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules)
            project_storage = ProjectStorage(temp_dir)
            planner = Planner(
                feature_start_time=datetime(2010, 1, 1),
                label_names=['outcome'],
                label_types=['binary'],
                states=state_filters,
                user_metadata={},
            )

            builder = MatrixBuilder(
                engine=db_engine,
                db_config={
                    'features_schema_name': 'features',
                    'labels_schema_name': 'public',
                    'labels_table_name': 'labels',
                    'sparse_state_table_name': 'tmp_sparse_states_abcd',
                },
                matrix_storage_engine=project_storage.matrix_storage_engine(),
                replace=True)

            # chop time
            split_definitions = chopper.chop_time()
            num_split_matrices = sum(1 + len(split['test_matrices'])
                                     for split in split_definitions)

            # generate as_of_times for feature/label/state generation
            all_as_of_times = []
            for split in split_definitions:
                all_as_of_times.extend(split['train_matrix']['as_of_times'])
                for test_matrix in split['test_matrices']:
                    all_as_of_times.extend(test_matrix['as_of_times'])
            all_as_of_times = list(set(all_as_of_times))

            # generate sparse state table
            state_table_generator.generate_sparse_table(
                as_of_dates=all_as_of_times)

            # create labels table
            label_generator.generate_all_labels(labels_table='labels',
                                                as_of_dates=all_as_of_times,
                                                label_timespans=['6months'])

            # create feature table tasks
            # we would use FeatureGenerator#create_all_tables but want to use
            # the tasks dict directly to create a feature dict
            aggregations = feature_generator.aggregations(
                feature_aggregation_config=[{
                    'prefix': 'cat',
                    'from_obj': 'cat_complaints',
                    'knowledge_date_column': 'as_of_date',
                    'aggregates': [{
                        'quantity': 'cat_sightings',
                        'metrics': ['count', 'avg'],
                        'imputation': {'all': {'type': 'mean'}},
                    }],
                    'intervals': ['1y'],
                    'groups': ['entity_id']
                }, {
                    'prefix': 'dog',
                    'from_obj': 'dog_complaints',
                    'knowledge_date_column': 'as_of_date',
                    'aggregates_imputation': {
                        'count': {'type': 'constant', 'value': 7},
                        'sum': {'type': 'mean'},
                        'avg': {'type': 'zero'},
                    },
                    'aggregates': [{
                        'quantity': 'dog_sightings',
                        'metrics': ['count', 'avg'],
                    }],
                    'intervals': ['1y'],
                    'groups': ['entity_id']
                }],
                feature_dates=all_as_of_times,
                state_table=state_table_generator.sparse_table_name)
            feature_table_agg_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type='aggregation')

            # create feature aggregation tables
            feature_generator.process_table_tasks(feature_table_agg_tasks)

            feature_table_imp_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type='imputation')

            # create feature imputation tables
            feature_generator.process_table_tasks(feature_table_imp_tasks)

            # build feature dictionaries from feature tables and
            # subsetting config
            master_feature_dict = feature_dictionary_creator.feature_dictionary(
                feature_table_names=feature_table_imp_tasks.keys(),
                index_column_lookup=feature_generator.index_column_lookup(
                    aggregations))

            feature_dicts = feature_group_mixer.generate(
                feature_group_creator.subsets(master_feature_dict))

            # figure out what matrices need to be built
            _, matrix_build_tasks = planner.generate_plans(
                split_definitions, feature_dicts)

            # go and build the matrices
            builder.build_all_matrices(matrix_build_tasks)

            # super basic assertion: did matrices we expect get created?
            matrices_records = list(
                db_engine.execute(
                    '''select matrix_uuid, num_observations, matrix_type
                    from model_metadata.matrices
                    '''))
            matrix_directory = os.path.join(temp_dir, 'matrices')
            matrices = [
                path for path in os.listdir(matrix_directory) if '.csv' in path
            ]
            metadatas = [
                path for path in os.listdir(matrix_directory)
                if '.yaml' in path
            ]
            assert len(matrices) == num_split_matrices * \
                expected_matrix_multiplier
            assert len(metadatas) == num_split_matrices * \
                expected_matrix_multiplier
            assert len(matrices) == len(matrices_records)
            feature_group_name_lists = []
            for metadata_path in metadatas:
                with open(os.path.join(matrix_directory, metadata_path)) as f:
                    metadata = yaml.full_load(f)
                    feature_group_name_lists.append(metadata['feature_groups'])

            for matrix_uuid, num_observations, matrix_type in matrices_records:
                assert matrix_uuid in matrix_build_tasks  # the hashes of the matrices
                assert type(num_observations) is int
                assert matrix_type == matrix_build_tasks[matrix_uuid][
                    'matrix_type']

            def deep_unique_tuple(list_of_lists):
                return {tuple(i) for i in list_of_lists}

            assert deep_unique_tuple(
                feature_group_name_lists) == deep_unique_tuple(
                    expected_group_lists)