コード例 #1
0
def test_baseline_exception_handling(sample_matrix_store):
    """Baseline ranker tasks referencing a missing feature should not crash:
    the first task trains (model id 1), the second returns None."""
    baseline_grid = {
        'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature':
        {
            'feature': ['feature_one', 'feature_three']
        }
    }
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        path = 'econ-dev/inspections'
        storage = S3ModelStorageEngine(path)
        ensure_db(engine)
        init_engine(engine)
        with mock_s3():
            boto3.resource('s3').create_bucket(Bucket='econ-dev')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=storage,
                db_engine=engine,
                model_grouper=ModelGrouper(),
            )

            tasks = trainer.generate_train_tasks(
                baseline_grid, dict(), sample_matrix_store)
            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            results = [trainer.process_train_task(**task) for task in tasks]
            assert results == [1, None]
コード例 #2
0
def test_custom_groups(sample_matrix_store, grid_config):
    """Grouping on class_path alone should collapse all models into one group."""
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        with mock_s3():
            boto3.resource('s3').create_bucket(Bucket='econ-dev')

            MatrixFactory(matrix_uuid="1234")
            session.commit()
            # create training set
            path = 'econ-dev/inspections'
            storage = S3ModelStorageEngine(path)
            trainer = ModelTrainer(
                project_path=path,
                experiment_hash=None,
                model_storage_engine=storage,
                model_grouper=ModelGrouper(['class_path']),
                db_engine=db_engine,
            )
            trained_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store,
            )
            # expect only one model group now
            group_ids = [
                row[0] for row in db_engine.execute(
                    'select distinct model_group_id from model_metadata.models'
                )
            ]
            assert len(group_ids) == 1
            assert group_ids[0] == trained_ids[0]
コード例 #3
0
    def test_retry_max(self):
        """When the database vanishes, training retries then raises OperationalError."""
        engine = None
        trainer = None
        # set up a basic model training run
        # TODO abstract the setup of a basic model training run where
        # we don't worry about the specific values used? it would make
        # tests like this require a bit less noise to read past
        with testing.postgresql.Postgresql() as postgresql:
            engine = create_engine(postgresql.url())
            ensure_db(engine)
            init_engine(engine)
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=InMemoryModelStorageEngine(project_path=''),
                db_engine=engine,
                model_grouper=ModelGrouper(),
            )

        # the postgres server goes out of scope here and thus no longer exists
        with patch('time.sleep') as sleep_mock:
            with self.assertRaises(sqlalchemy.exc.OperationalError):
                trainer.train_models(grid_config(), dict(),
                                     sample_matrix_store())
            # we want to make sure that we are using the retrying module sanely
            # as opposed to matching the exact # of calls specified by the code
            assert len(sleep_mock.mock_calls) > 5
コード例 #4
0
    def test_retry_recovery(self):
        """A transient database outage should be survived after one retry sleep."""
        engine = None
        trainer = None
        saved_port = None
        with rig_engines() as (engine, project_storage):
            saved_port = engine.url.port
            trainer = ModelTrainer(
                experiment_hash=None,
                model_storage_engine=project_storage.model_storage_engine(),
                db_engine=engine,
                model_grouper=ModelGrouper(),
            )
            matrix_store = get_matrix_store(project_storage)

        # start without a database server
        # then bring it back up after the first sleep
        # use self so it doesn't go out of scope too early and shut down
        self.new_server = None

        def bring_db_back(_):
            self.new_server = testing.postgresql.Postgresql(port=saved_port)
            fresh_engine = create_engine(self.new_server.url())
            ensure_db(fresh_engine)
            init_engine(fresh_engine)
            get_matrix_store(project_storage)

        with patch("time.sleep") as sleep_mock:
            sleep_mock.side_effect = bring_db_back
            try:
                trainer.train_models(grid_config(), dict(), matrix_store)
            finally:
                if self.new_server is not None:
                    self.new_server.stop()
            assert len(sleep_mock.mock_calls) == 1
コード例 #5
0
def test_custom_groups(grid_config, db_engine_with_results_schema, project_storage):
    """With class_path as the only group key, all trained models share one group."""
    storage = project_storage.model_storage_engine()
    experiment_hash = save_experiment_and_get_hash(
        config={'foo': 'bar'},
        db_engine=db_engine_with_results_schema,
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine_with_results_schema,
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=storage,
        model_grouper=ModelGrouper(["class_path"]),
        db_engine=db_engine_with_results_schema,
        run_id=run_id,
    )
    # create training set
    model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    # expect only one model group now
    distinct_groups = [
        row[0]
        for row in db_engine_with_results_schema.execute(
            "select distinct model_group_id from triage_metadata.models"
        )
    ]
    assert len(distinct_groups) == 1
    assert distinct_groups[0] == model_ids[0]
コード例 #6
0
def default_model_trainer(db_engine_with_results_schema, project_storage):
    """Yield a ModelTrainer wired to the test database and project storage."""
    yield ModelTrainer(
        experiment_hash=None,
        model_storage_engine=project_storage.model_storage_engine(),
        db_engine=db_engine_with_results_schema,
        model_grouper=ModelGrouper(),
    )
コード例 #7
0
ファイル: test_model_grouping.py プロジェクト: zshi1/triage
def test_model_grouping_default_config(sample_metadata):
    """Default group keys: classifier and max_training_history split groups;
    end_time does not."""
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        grouper = ModelGrouper()
        # get the basic first model group with our default matrix
        assert grouper.get_model_group_id(
            "module.Classifier", {"param1": "val1"}, sample_metadata,
            db_engine) == 1

        # the end time is not by default a model group key so changing it
        # should still get us the same group
        shifted_end = copy(sample_metadata)
        shifted_end["end_time"] = datetime.date(2017, 3, 20)
        assert grouper.get_model_group_id(
            "module.Classifier", {"param1": "val1"}, shifted_end,
            db_engine) == 1

        # max_training_history is a default key,
        # so it should trigger a new group
        longer_history = copy(sample_metadata)
        longer_history["max_training_history"] = "3y"
        assert grouper.get_model_group_id(
            "module.Classifier", {"param1": "val1"}, longer_history,
            db_engine) == 2

        # classifier is of course a default key as well
        assert grouper.get_model_group_id(
            "module.OtherClassifier", {"param1": "val1"}, sample_metadata,
            db_engine) == 3
コード例 #8
0
def test_model_grouping_custom_config(sample_metadata):
    """Custom group keys: only feature_names and as_of_date_frequency
    produce new groups; the classifier path does not."""
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        grouper = ModelGrouper(
            model_group_keys=['feature_names', 'as_of_date_frequency'])
        # get the basic first model group with our default matrix
        assert grouper.get_model_group_id(
            'module.Classifier', {'param1': 'val1'}, sample_metadata,
            db_engine) == 1

        # classifier is now not a key, so changing it should not get a new id
        assert grouper.get_model_group_id(
            'module.OtherClassifier', {'param1': 'val1'}, sample_metadata,
            db_engine) == 1

        # as_of_date_frequency is a key,
        # so it should trigger a new group
        new_frequency = copy(sample_metadata)
        new_frequency['as_of_date_frequency'] = '2w'
        assert grouper.get_model_group_id(
            'module.Classifier', {'param1': 'val1'}, new_frequency,
            db_engine) == 2

        # feature names go down a separate code path, so exercise that too
        new_features = copy(sample_metadata)
        new_features['feature_names'] = ['ft1', 'ft3']
        assert grouper.get_model_group_id(
            'module.Classifier', {'param1': 'val1'}, new_features,
            db_engine) == 3
コード例 #9
0
ファイル: test_model_grouping.py プロジェクト: zshi1/triage
def test_model_grouping_custom_config(sample_metadata):
    """Custom group keys limited to feature_names and as_of_date_frequency:
    changing the classifier keeps the group; changing either key splits it."""
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        grouper = ModelGrouper(
            model_group_keys=["feature_names", "as_of_date_frequency"])
        # get the basic first model group with our default matrix
        assert grouper.get_model_group_id(
            "module.Classifier", {"param1": "val1"}, sample_metadata,
            db_engine) == 1

        # classifier is now not a key, so changing it should not get a new id
        assert grouper.get_model_group_id(
            "module.OtherClassifier", {"param1": "val1"}, sample_metadata,
            db_engine) == 1

        # as_of_date_frequency is a key,
        # so it should trigger a new group
        new_frequency = copy(sample_metadata)
        new_frequency["as_of_date_frequency"] = "2w"
        assert grouper.get_model_group_id(
            "module.Classifier", {"param1": "val1"}, new_frequency,
            db_engine) == 2

        # testing feature names may seem redundant but it is on a separate
        # code path so make sure its logic works
        new_features = copy(sample_metadata)
        new_features["feature_names"] = ["ft1", "ft3"]
        assert grouper.get_model_group_id(
            "module.Classifier", {"param1": "val1"}, new_features,
            db_engine) == 3
コード例 #10
0
ファイル: test_model_trainers.py プロジェクト: afcarl/triage
def test_n_jobs_not_new_model(sample_matrix_store):
    """Varying n_jobs spawns extra train tasks but must not create extra
    model groups, since n_jobs is stripped from grouping parameters."""
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {
            'n_estimators': [10, 100, 1000]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'n_jobs': [12, 24],
        }
    }

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        init_engine(engine)
        with mock_s3():
            boto3.resource('s3').create_bucket(Bucket='econ-dev')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=S3ModelStorageEngine(
                    'econ-dev/inspections'),
                db_engine=engine,
                model_grouper=ModelGrouper(),
            )

            tasks = trainer.generate_train_tasks(
                grid_config,
                dict(),
                sample_matrix_store,
            )
            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            # 32+3, would be (32*2)+3 if we didn't remove n_jobs duplicates
            assert len(tasks) == 35
            assert sum(
                1 for task in tasks if 'n_jobs' in task['parameters']) == 32

            for task in tasks:
                trainer.process_train_task(**task)

            # stored group parameters must not include n_jobs
            for row in engine.execute(
                    'select model_parameters from model_metadata.model_groups'
            ):
                assert 'n_jobs' not in row[0]
コード例 #11
0
ファイル: test_model_trainers.py プロジェクト: snowdj/triage
    def test_retry_max(self):
        """Training against a dead database retries, then raises OperationalError."""
        engine = None
        trainer = None
        # set up a basic model training run
        with rig_engines() as (engine, project_storage):
            trainer = ModelTrainer(
                experiment_hash=None,
                model_storage_engine=project_storage.model_storage_engine(),
                db_engine=engine,
                model_grouper=ModelGrouper(),
            )
            matrix_store = get_matrix_store(project_storage)

        # the postgres server goes out of scope here and thus no longer exists
        with patch('time.sleep') as sleep_mock:
            with self.assertRaises(sqlalchemy.exc.OperationalError):
                trainer.train_models(grid_config(), dict(), matrix_store)
            # we want to make sure that we are using the retrying module sanely
            # as opposed to matching the exact # of calls specified by the code
            assert len(sleep_mock.mock_calls) > 5
コード例 #12
0
    def test_retry_recovery(self):
        """Training should recover after exactly one retry sleep.

        The postgres server is shut down (its ``with`` block ends) before
        training starts; the patched ``time.sleep`` side effect restarts a
        server on the same port, so the retry logic should succeed after a
        single sleep call.
        """
        db_engine = None
        trainer = None
        port = None
        with testing.postgresql.Postgresql() as postgresql:
            # remember the port so the replacement server can reuse it
            port = postgresql.settings['port']
            db_engine = create_engine(postgresql.url())
            ensure_db(db_engine)
            init_engine(db_engine)
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=InMemoryModelStorageEngine(
                    project_path=''),
                db_engine=db_engine,
                model_grouper=ModelGrouper())

        # start without a database server
        # then bring it back up after the first sleep
        # use self so it doesn't go out of scope too early and shut down
        self.new_server = None

        def replace_db(arg):
            # arg is the sleep duration passed by the retry loop; unused here
            self.new_server = testing.postgresql.Postgresql(port=port)
            db_engine = create_engine(self.new_server.url())
            ensure_db(db_engine)
            init_engine(db_engine)

            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()

        with patch('time.sleep') as time_mock:
            time_mock.side_effect = replace_db
            try:
                trainer.train_models(grid_config(), dict(),
                                     sample_matrix_store())
            finally:
                # always stop the replacement server, even if training raised
                if self.new_server is not None:
                    self.new_server.stop()
            assert len(time_mock.mock_calls) == 1
コード例 #13
0
ファイル: test_model_trainers.py プロジェクト: snowdj/triage
def test_baseline_exception_handling():
    """A baseline referencing a nonexistent feature yields one trained model
    (id 1) and one None result."""
    baseline_grid = {
        'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature':
        {
            'feature': ['feature_one', 'feature_three']
        }
    }
    with rig_engines() as (db_engine, project_storage):
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=project_storage.model_storage_engine(),
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )

        tasks = trainer.generate_train_tasks(
            baseline_grid, dict(), get_matrix_store(project_storage))

        results = [trainer.process_train_task(**task) for task in tasks]
        assert results == [1, None]
コード例 #14
0
ファイル: test_model_trainers.py プロジェクト: snowdj/triage
def test_custom_groups(grid_config):
    """Grouping only on class_path should yield a single model group."""
    with rig_engines() as (db_engine, project_storage):
        # create training set
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=project_storage.model_storage_engine(),
            model_grouper=ModelGrouper(['class_path']),
            db_engine=db_engine,
        )
        trained_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage))
        # expect only one model group now
        group_ids = [
            row[0] for row in db_engine.execute(
                'select distinct model_group_id from model_metadata.models')
        ]
        assert len(group_ids) == 1
        assert group_ids[0] == trained_ids[0]
コード例 #15
0
def test_custom_groups(grid_config, db_engine_with_results_schema,
                       project_storage):
    """With class_path as the only group key, every model lands in one group."""
    trainer = ModelTrainer(
        experiment_hash=None,
        model_storage_engine=project_storage.model_storage_engine(),
        model_grouper=ModelGrouper(["class_path"]),
        db_engine=db_engine_with_results_schema,
    )
    # create training set
    trained_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    # expect only one model group now
    group_ids = [
        row[0] for row in db_engine_with_results_schema.execute(
            "select distinct model_group_id from triage_metadata.models")
    ]
    assert len(group_ids) == 1
    assert group_ids[0] == trained_ids[0]
コード例 #16
0
ファイル: test_model_trainers.py プロジェクト: snowdj/triage
def test_n_jobs_not_new_model():
    """Varying n_jobs creates extra train tasks but no extra model groups."""
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {
            'n_estimators': [10, 100, 1000]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'n_jobs': [12, 24],
        }
    }

    with rig_engines() as (db_engine, project_storage):
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=project_storage.model_storage_engine(),
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )

        tasks = trainer.generate_train_tasks(
            grid_config,
            dict(),
            get_matrix_store(project_storage),
        )

        # 32+3, would be (32*2)+3 if we didn't remove n_jobs duplicates
        assert len(tasks) == 35
        assert sum(1 for task in tasks if 'n_jobs' in task['parameters']) == 32

        for task in tasks:
            trainer.process_train_task(**task)

        # stored hyperparameters must not include n_jobs
        for row in db_engine.execute(
                'select hyperparameters from model_metadata.model_groups'):
            assert 'n_jobs' not in row[0]
コード例 #17
0
def default_model_trainer(db_engine_with_results_schema, project_storage):
    """Yield a ModelTrainer tied to a tracked experiment run.

    Saves a placeholder experiment config, starts a tracking run with a
    fixed random seed, and wires the trainer to the test database and
    project storage. Defined as a generator so it can be used as a
    pytest yield-fixture.
    """
    model_storage_engine = project_storage.model_storage_engine()
    experiment_hash = save_experiment_and_get_hash(
        config={'foo': 'bar'},
        db_engine=db_engine_with_results_schema,
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine_with_results_schema,
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine_with_results_schema,
        model_grouper=ModelGrouper(),
        run_id=run_id,
    )
    yield trainer
コード例 #18
0
def test_n_jobs_not_new_model():
    """n_jobs variations add train tasks but never new model groups."""
    grid_config = {
        "sklearn.ensemble.AdaBoostClassifier": {
            "n_estimators": [10, 100, 1000]
        },
        "sklearn.ensemble.RandomForestClassifier": {
            "n_estimators": [10, 100],
            "max_features": ["sqrt", "log2"],
            "max_depth": [5, 10, 15, 20],
            "criterion": ["gini", "entropy"],
            "n_jobs": [12, 24],
        },
    }

    with rig_engines() as (db_engine, project_storage):
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=project_storage.model_storage_engine(),
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )

        tasks = trainer.generate_train_tasks(
            grid_config, dict(), get_matrix_store(project_storage))

        # 32+3, would be (32*2)+3 if we didn't remove n_jobs duplicates
        assert len(tasks) == 35
        assert sum(1 for task in tasks if "n_jobs" in task["parameters"]) == 32

        for task in tasks:
            trainer.process_train_task(**task)

        # stored hyperparameters must not include n_jobs
        for row in db_engine.execute(
                "select hyperparameters from model_metadata.model_groups"):
            assert "n_jobs" not in row[0]
コード例 #19
0
ファイル: base.py プロジェクト: afcarl/triage
    def initialize_components(self):
        """Construct all experiment components from ``self.config``.

        Wires up temporal splitting, cohort/state table generation,
        labeling, feature generation/grouping, matrix planning and
        building, model training, and model testing, storing each as an
        attribute on the experiment.

        Raises:
            ValueError: if ``cohort_config`` is missing or of an
                unrecognized form.
        """
        split_config = self.config['temporal_config']

        # Temporal splitter: turns temporal_config into train/test splits.
        self.chopper = Timechop(
            feature_start_time=dt_from_str(split_config['feature_start_time']),
            feature_end_time=dt_from_str(split_config['feature_end_time']),
            label_start_time=dt_from_str(split_config['label_start_time']),
            label_end_time=dt_from_str(split_config['label_end_time']),
            model_update_frequency=split_config['model_update_frequency'],
            training_label_timespans=split_config['training_label_timespans'],
            test_label_timespans=split_config['test_label_timespans'],
            training_as_of_date_frequencies=split_config[
                'training_as_of_date_frequencies'],
            test_as_of_date_frequencies=split_config[
                'test_as_of_date_frequencies'],
            max_training_histories=split_config['max_training_histories'],
            test_durations=split_config['test_durations'],
        )

        # Cohort/state table: pick the generator matching the config style
        # (raw query, entities table, or dense-states table).
        cohort_config = self.config.get('cohort_config', {})
        if 'query' in cohort_config:
            self.state_table_generator = StateTableGeneratorFromQuery(
                experiment_hash=self.experiment_hash,
                db_engine=self.db_engine,
                query=cohort_config['query'])
        elif 'entities_table' in cohort_config:
            self.state_table_generator = StateTableGeneratorFromEntities(
                experiment_hash=self.experiment_hash,
                db_engine=self.db_engine,
                entities_table=cohort_config['entities_table'])
        elif 'dense_states' in cohort_config:
            self.state_table_generator = StateTableGeneratorFromDense(
                experiment_hash=self.experiment_hash,
                db_engine=self.db_engine,
                dense_state_table=cohort_config['dense_states']['table_name'])
        else:
            raise ValueError('Cohort config missing or unrecognized')

        # Labels come from a user-supplied query; the name is optional.
        self.label_generator = LabelGenerator(
            label_name=self.config['label_config'].get('name', None),
            query=self.config['label_config']['query'],
            db_engine=self.db_engine,
        )

        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name=self.features_schema_name,
            db_engine=self.db_engine,
        )

        self.feature_generator = FeatureGenerator(
            features_schema_name=self.features_schema_name,
            replace=self.replace,
            db_engine=self.db_engine,
            feature_start_time=split_config['feature_start_time'])

        # Feature grouping defaults to a single "all features" group.
        self.feature_group_creator = FeatureGroupCreator(
            self.config.get('feature_group_definition', {'all': [True]}))

        self.feature_group_mixer = FeatureGroupMixer(
            self.config.get('feature_group_strategies', ['all']))

        # Matrix planner; label/cohort names fall back to defaults when absent.
        self.planner = Planner(
            feature_start_time=dt_from_str(split_config['feature_start_time']),
            label_names=[
                self.config.get('label_config',
                                {}).get('name', DEFAULT_LABEL_NAME)
            ],
            label_types=['binary'],
            matrix_directory=self.matrices_directory,
            cohort_name=self.config.get('cohort_config', {}).get('name', None),
            states=self.config.get('cohort_config',
                                   {}).get('dense_states',
                                           {}).get('state_filters', []),
            user_metadata=self.config.get('user_metadata', {}),
        )

        self.matrix_builder = HighMemoryCSVBuilder(
            db_config={
                'features_schema_name':
                self.features_schema_name,
                'labels_schema_name':
                'public',
                'labels_table_name':
                self.labels_table_name,
                # TODO: have planner/builder take state table later on, so we
                # can grab it from the StateTableGenerator instead of
                # duplicating it here
                'sparse_state_table_name':
                'tmp_sparse_states_{}'.format(self.experiment_hash),
            },
            matrix_directory=self.matrices_directory,
            include_missing_labels_in_train_as=self.config['label_config'].get(
                'include_missing_labels_in_train_as', None),
            engine=self.db_engine,
            replace=self.replace)

        # Model grouping keys are user-configurable; empty list means defaults.
        self.trainer = ModelTrainer(
            project_path=self.project_path,
            experiment_hash=self.experiment_hash,
            model_storage_engine=self.model_storage_engine,
            model_grouper=ModelGrouper(self.config.get('model_group_keys',
                                                       [])),
            db_engine=self.db_engine,
            replace=self.replace)

        self.tester = ModelTester(
            model_storage_engine=self.model_storage_engine,
            project_path=self.project_path,
            replace=self.replace,
            db_engine=self.db_engine,
            individual_importance_config=self.config.get(
                'individual_importance', {}),
            evaluator_config=self.config.get('scoring', {}))
コード例 #20
0
def test_model_trainer(sample_matrix_store, grid_config):
    """End-to-end exercise of ModelTrainer.train_models.

    Trains the grid against a throwaway postgres server and a mocked S3
    bucket, then checks (in order): persisted feature importances and
    model metadata, distinct model groups, stored model sizes, cached
    pickles, prediction capability, idempotent re-runs, replace=True
    behavior, recovery from a missing cache, and the generator interface.
    The numbered comments below mark each assertion stage.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')

            # Creates a matrix entry in the matrices table with uuid from metadata above
            MatrixFactory(matrix_uuid="1234")
            session.commit()
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                model_grouper=ModelGrouper(),
                db_engine=db_engine,
            )
            model_ids = trainer.train_models(grid_config=grid_config,
                                             misc_db_parameters=dict(),
                                             matrix_store=sample_matrix_store)

            # assert
            # 1. that the models and feature importances table entries are present
            records = [
                row for row in db_engine.execute(
                    'select * from train_results.feature_importances')
            ]
            assert len(records) == 4 * 2  # maybe exclude entity_id? yes

            records = [
                row for row in db_engine.execute(
                    'select model_hash from model_metadata.models')
            ]
            assert len(records) == 4
            hashes = [row[0] for row in records]

            # 2. that the model groups are distinct
            records = [
                row for row in db_engine.execute(
                    'select distinct model_group_id from model_metadata.models'
                )
            ]
            assert len(records) == 4

            # 3. that the model sizes are saved in the table and all are < 1 kB
            records = [
                row for row in db_engine.execute(
                    'select model_size from model_metadata.models')
            ]
            assert len(records) == 4
            for i in records:
                size = i[0]
                assert size < 1

            # 4. that all four models are cached
            model_pickles = [
                model_storage_engine.get_store(model_hash).load()
                for model_hash in hashes
            ]
            assert len(model_pickles) == 4
            assert len([x for x in model_pickles if x is not None]) == 4

            # 5. that their results can have predictions made on it
            test_matrix = pandas.DataFrame.from_dict({
                'entity_id': [3, 4],
                'feature_one': [4, 4],
                'feature_two': [6, 5],
            })

            test_matrix = InMemoryMatrixStore(matrix=test_matrix, metadata=sample_metadata())\
                .matrix

            for model_pickle in model_pickles:
                predictions = model_pickle.predict(test_matrix)
                assert len(predictions) == 2

            # 6. when run again, same models are returned
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store)
            assert len([
                row for row in db_engine.execute(
                    'select model_hash from model_metadata.models')
            ]) == 4
            assert model_ids == new_model_ids

            # 7. if replace is set, update non-unique attributes and feature importances
            max_batch_run_time = [
                row[0] for row in db_engine.execute(
                    'select max(batch_run_time) from model_metadata.models')
            ][0]
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                model_grouper=ModelGrouper(
                    model_group_keys=['label_name', 'label_timespan']),
                db_engine=db_engine,
                replace=True)
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store,
            )
            assert model_ids == new_model_ids
            assert [
                row['model_id'] for row in db_engine.execute(
                    'select model_id from model_metadata.models order by 1 asc'
                )
            ] == model_ids
            new_max_batch_run_time = [
                row[0] for row in db_engine.execute(
                    'select max(batch_run_time) from model_metadata.models')
            ][0]
            assert new_max_batch_run_time > max_batch_run_time

            records = [
                row for row in db_engine.execute(
                    'select * from train_results.feature_importances')
            ]
            assert len(records) == 4 * 2  # maybe exclude entity_id? yes

            # 8. if the cache is missing but the metadata is still there, reuse the metadata
            for row in db_engine.execute(
                    'select model_hash from model_metadata.models'):
                model_storage_engine.get_store(row[0]).delete()
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store)
            assert model_ids == sorted(new_model_ids)

            # 9. that the generator interface works the same way
            new_model_ids = trainer.generate_trained_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store)
            assert model_ids == \
                sorted([model_id for model_id in new_model_ids])
コード例 #21
0
def test_reuse_model_random_seeds(grid_config, default_model_trainer):
    """Check that per-model random seeds are re-used across experiments.

    When a new experiment shares the previous experiment-level random seed,
    previously-trained models keep their seeds (and therefore their model
    ids); when the experiment-level seed differs, entirely new models are
    produced.
    """
    trainer = default_model_trainer
    db_engine = trainer.db_engine
    project_storage = trainer.model_storage_engine.project_storage
    model_storage_engine = trainer.model_storage_engine

    # re-using the random seeds requires the association between experiments and models
    # to exist, which we're not getting in these tests since we aren't using the experiment
    # architecture, so back-fill these associations after each train_models() run
    def update_experiment_models(db_engine):
        # Insert any (run_hash, model_hash) pair not already recorded in
        # triage_metadata.experiment_models.
        sql = """
            INSERT INTO triage_metadata.experiment_models(experiment_hash,model_hash) 
            SELECT er.run_hash, m.model_hash
            FROM triage_metadata.models m
            LEFT JOIN triage_metadata.triage_runs er
                ON m.built_in_triage_run = er.id
            LEFT JOIN triage_metadata.experiment_models em 
                ON m.model_hash = em.model_hash
                AND er.run_hash = em.experiment_hash
            WHERE em.experiment_hash IS NULL
            """
        db_engine.execute(sql)
        db_engine.execute('COMMIT;')

    random.seed(5)
    model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    update_experiment_models(db_engine)

    # simulate running a new experiment where the experiment hash has changed
    # (e.g. because the model grid is different), but experiment seed is the
    # same, so previously-trained models should not get new seeds
    experiment_hash = save_experiment_and_get_hash(
        config={'baz': 'qux'}, 
        db_engine=db_engine
        )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine,
        model_grouper=ModelGrouper(),
        run_id=run_id,
    )
    # expand the grid so the new experiment trains additional configurations
    new_grid = grid_config.copy()
    new_grid['sklearn.tree.DecisionTreeClassifier']['min_samples_split'] = [3,10,100]
    random.seed(5)
    new_model_ids = trainer.train_models(
        grid_config=new_grid,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    update_experiment_models(db_engine)

    # should have received 6 models (the expanded grid trains 6 configurations)
    assert len(new_model_ids) == 6

    # all the original model ids should be in the new set
    assert len(set(new_model_ids) & set(model_ids)) == len(model_ids)

    # however, we should NOT re-use the random seeds (and so get new model_ids)
    # if the experiment-level seed is different
    experiment_hash = save_experiment_and_get_hash(
        config={'lorem': 'ipsum'}, 
        db_engine=db_engine
        )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=42,
        experiment_kwargs={},
        db_engine=db_engine
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine,
        model_grouper=ModelGrouper(),
        run_id=run_id,
    )
    random.seed(42) # different from above
    newer_model_ids = trainer.train_models(
        grid_config=new_grid,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    update_experiment_models(db_engine)

    # should get entirely new models now (different IDs)
    assert len(newer_model_ids) == 6
    assert len(set(new_model_ids) & set(newer_model_ids)) == 0
# Code example #22 (コード例 #22) — score: 0
def test_model_trainer(grid_config, default_model_trainer):
    """End-to-end exercise of ModelTrainer against a live database.

    Trains a 4-model grid and verifies the persisted metadata rows, cached
    model pickles, reproducibility under a fixed random seed, the
    replace=True path, metadata reuse after pickle deletion, and the
    generator interface.
    """
    trainer = default_model_trainer
    db_engine = trainer.db_engine
    project_storage = trainer.model_storage_engine.project_storage
    model_storage_engine = trainer.model_storage_engine

    def set_test_seed():
        # Reset the global RNG so each train_models() run draws identical
        # per-model random seeds.
        random.seed(5)

    set_test_seed()
    model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )

    # assert
    # 1. that the models and feature importances table entries are present
    records = [
        row for row in db_engine.execute(
            "select * from train_results.feature_importances")
    ]
    assert len(records) == 4 * 2  # 4 models x 2 rows each (entity_id apparently excluded)

    records = [
        row for row in db_engine.execute(
            "select model_hash from triage_metadata.models")
    ]
    assert len(records) == 4
    hashes = [row[0] for row in records]

    # 2. that the model groups are distinct
    records = [
        row for row in db_engine.execute(
            "select distinct model_group_id from triage_metadata.models")
    ]
    assert len(records) == 4

    # 2b. that the random seeds are distinct
    records = [
        row for row in db_engine.execute(
            "select distinct random_seed from triage_metadata.models")
    ]
    assert len(records) == 4

    # 3. that the model sizes are saved in the table and all are < 1 kB
    records = [
        row for row in db_engine.execute(
            "select model_size from triage_metadata.models")
    ]
    assert len(records) == 4
    for i in records:
        size = i[0]
        assert size < 1  # presumably recorded in kB — see the "< 1 kB" note above

    # 4. that all four models are cached
    model_pickles = [
        model_storage_engine.load(model_hash) for model_hash in hashes
    ]
    assert len(model_pickles) == 4
    assert len([x for x in model_pickles if x is not None]) == 4

    # 5. that their results can have predictions made on it
    test_matrix = pd.DataFrame.from_dict({
        "entity_id": [3, 4],
        "feature_one": [4, 4],
        "feature_two": [6, 5]
    }).set_index("entity_id")

    for model_pickle in model_pickles:
        predictions = model_pickle.predict(test_matrix)
        assert len(predictions) == 2

    # 6. when run again with the same starting seed, same models are returned
    set_test_seed()
    new_model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    assert (len([
        row for row in db_engine.execute(
            "select model_hash from triage_metadata.models")
    ]) == 4)
    assert model_ids == new_model_ids

    # 7. if replace is set, update non-unique attributes and feature importances
    max_batch_run_time = [
        row[0] for row in db_engine.execute(
            "select max(batch_run_time) from triage_metadata.models")
    ][0]
    trainer = ModelTrainer(
        experiment_hash=None,
        model_storage_engine=model_storage_engine,
        model_grouper=ModelGrouper(
            model_group_keys=["label_name", "label_timespan"]),
        db_engine=db_engine,
        replace=True,
    )
    set_test_seed()
    new_model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    assert model_ids == new_model_ids
    assert [
        row["model_id"] for row in db_engine.execute(
            "select model_id from triage_metadata.models order by 1 asc")
    ] == model_ids
    new_max_batch_run_time = [
        row[0] for row in db_engine.execute(
            "select max(batch_run_time) from triage_metadata.models")
    ][0]
    assert new_max_batch_run_time > max_batch_run_time

    records = [
        row for row in db_engine.execute(
            "select * from train_results.feature_importances")
    ]
    assert len(records) == 4 * 2  # importances were rewritten for all 4 models

    # 8. if the cache is missing but the metadata is still there, reuse the metadata
    set_test_seed()
    for row in db_engine.execute(
            "select model_hash from triage_metadata.models"):
        model_storage_engine.delete(row[0])
    new_model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    assert model_ids == sorted(new_model_ids)

    # 9. that the generator interface works the same way
    set_test_seed()
    new_model_ids = trainer.generate_trained_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    assert model_ids == sorted([model_id for model_id in new_model_ids])
# Code example #23 (コード例 #23) — score: 0 — file: test_model_trainers.py, project: snowdj/triage
def test_model_trainer(grid_config):
    """End-to-end check of ModelTrainer.

    Trains a 4-model grid and verifies the persisted metadata and feature
    importances, the cached pickles, idempotent re-runs, the replace=True
    path, metadata reuse after pickle deletion, and the generator interface.
    """
    with rig_engines() as (db_engine, project_storage):
        # Creates a matrix entry in the matrices table with uuid from metadata above
        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(),
            db_engine=db_engine,
        )
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )

        # assert
        # 1. that the models and feature importances table entries are present
        importance_rows = list(db_engine.execute(
            'select * from train_results.feature_importances'))
        assert len(importance_rows) == 4 * 2  # maybe exclude entity_id? yes

        hash_rows = list(db_engine.execute(
            'select model_hash from model_metadata.models'))
        assert len(hash_rows) == 4
        hashes = [r[0] for r in hash_rows]

        # 2. that the model groups are distinct
        group_rows = list(db_engine.execute(
            'select distinct model_group_id from model_metadata.models'))
        assert len(group_rows) == 4

        # 3. that the model sizes are saved in the table and all are < 1 kB
        size_rows = list(db_engine.execute(
            'select model_size from model_metadata.models'))
        assert len(size_rows) == 4
        assert all(r[0] < 1 for r in size_rows)

        # 4. that all four models are cached
        cached_models = [model_storage_engine.load(h) for h in hashes]
        assert len(cached_models) == 4
        assert all(m is not None for m in cached_models)

        # 5. that their results can have predictions made on it
        test_matrix = pandas.DataFrame.from_dict({
            'entity_id': [3, 4],
            'feature_one': [4, 4],
            'feature_two': [6, 5],
        }).set_index('entity_id')

        for cached_model in cached_models:
            assert len(cached_model.predict(test_matrix)) == 2

        # 6. when run again, same models are returned
        retrained_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage))
        assert len(list(db_engine.execute(
            'select model_hash from model_metadata.models'))) == 4
        assert model_ids == retrained_ids

        # 7. if replace is set, update non-unique attributes and feature importances
        previous_batch_time = list(db_engine.execute(
            'select max(batch_run_time) from model_metadata.models'))[0][0]
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(
                model_group_keys=['label_name', 'label_timespan']),
            db_engine=db_engine,
            replace=True)
        retrained_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage))
        assert model_ids == retrained_ids
        assert [
            row['model_id'] for row in db_engine.execute(
                'select model_id from model_metadata.models order by 1 asc')
        ] == model_ids
        current_batch_time = list(db_engine.execute(
            'select max(batch_run_time) from model_metadata.models'))[0][0]
        assert current_batch_time > previous_batch_time

        importance_rows = list(db_engine.execute(
            'select * from train_results.feature_importances'))
        assert len(importance_rows) == 4 * 2  # maybe exclude entity_id? yes

        # 8. if the cache is missing but the metadata is still there, reuse the metadata
        for row in db_engine.execute(
                'select model_hash from model_metadata.models'):
            model_storage_engine.delete(row[0])
        retrained_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage))
        assert model_ids == sorted(retrained_ids)

        # 9. that the generator interface works the same way
        generated_ids = trainer.generate_trained_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage))
        assert model_ids == sorted(generated_ids)
# Code example #24 (コード例 #24) — score: 0 — file: base.py, project: snowdj/triage
    def initialize_components(self):
        """Construct every pipeline component from ``self.config``.

        Instantiates the time chopper, state-table (cohort) generator,
        label generator, feature machinery, matrix planner/builder, and the
        model trainer/tester.  Missing cohort or label configuration falls
        back to no-op generators with a warning.
        """
        split_config = self.config['temporal_config']

        # Produces the train/test time splits from the temporal config.
        self.chopper = Timechop(**split_config)

        # Three mutually-exclusive ways of defining the cohort; no-op otherwise.
        cohort_config = self.config.get('cohort_config', {})
        if 'query' in cohort_config:
            self.state_table_generator = StateTableGeneratorFromQuery(
                experiment_hash=self.experiment_hash,
                db_engine=self.db_engine,
                query=cohort_config['query']
            )
        elif 'entities_table' in cohort_config:
            self.state_table_generator = StateTableGeneratorFromEntities(
                experiment_hash=self.experiment_hash,
                db_engine=self.db_engine,
                entities_table=cohort_config['entities_table']
            )
        elif 'dense_states' in cohort_config:
            self.state_table_generator = StateTableGeneratorFromDense(
                experiment_hash=self.experiment_hash,
                db_engine=self.db_engine,
                dense_state_table=cohort_config['dense_states']['table_name']
            )
        else:
            logging.warning('cohort_config missing or unrecognized. Without a cohort, you will not be able to make matrices or perform feature imputation.')
            self.state_table_generator = StateTableGeneratorNoOp()

        if 'label_config' in self.config:
            self.label_generator = LabelGenerator(
                label_name=self.config['label_config'].get('name', None),
                query=self.config['label_config']['query'],
                db_engine=self.db_engine,
            )
        else:
            self.label_generator = LabelGeneratorNoOp()
            logging.warning('label_config missing or unrecognized. Without labels, you will not be able to make matrices.')

        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name=self.features_schema_name,
            db_engine=self.db_engine,
        )

        self.feature_generator = FeatureGenerator(
            features_schema_name=self.features_schema_name,
            replace=self.replace,
            db_engine=self.db_engine,
            feature_start_time=split_config['feature_start_time']
        )

        self.feature_group_creator = FeatureGroupCreator(
            self.config.get('feature_group_definition', {'all': [True]})
        )

        self.feature_group_mixer = FeatureGroupMixer(
            self.config.get('feature_group_strategies', ['all'])
        )

        self.planner = Planner(
            feature_start_time=dt_from_str(split_config['feature_start_time']),
            label_names=[self.config.get('label_config', {}).get('name', DEFAULT_LABEL_NAME)],
            label_types=['binary'],
            cohort_name=self.config.get('cohort_config', {}).get('name', None),
            states=self.config.get('cohort_config', {}).get('dense_states', {})
            .get('state_filters', []),
            user_metadata=self.config.get('user_metadata', {}),
        )

        self.matrix_builder = MatrixBuilder(
            db_config={
                'features_schema_name': self.features_schema_name,
                'labels_schema_name': 'public',
                'labels_table_name': self.labels_table_name,
                # TODO: have planner/builder take state table later on, so we
                # can grab it from the StateTableGenerator instead of
                # duplicating it here
                'sparse_state_table_name': self.sparse_states_table_name,
            },
            matrix_storage_engine=self.matrix_storage_engine,
            include_missing_labels_in_train_as=self.config.get('label_config', {})
            .get('include_missing_labels_in_train_as', None),
            engine=self.db_engine,
            replace=self.replace
        )

        self.trainer = ModelTrainer(
            experiment_hash=self.experiment_hash,
            model_storage_engine=self.model_storage_engine,
            model_grouper=ModelGrouper(self.config.get('model_group_keys', [])),
            db_engine=self.db_engine,
            replace=self.replace
        )

        self.tester = ModelTester(
            model_storage_engine=self.model_storage_engine,
            matrix_storage_engine=self.matrix_storage_engine,
            replace=self.replace,
            db_engine=self.db_engine,
            individual_importance_config=self.config.get('individual_importance', {}),
            evaluator_config=self.config.get('scoring', {})
        )
# Code example #25 (コード例 #25) — score: 0
    def initialize_components(self):
        """Wire up every pipeline component from ``self.config``.

        Builds the time chopper, cohort and label generators (no-op versions
        with a warning when the relevant config sections are absent), the
        feature machinery, the matrix planner/builder, and the model
        trainer/tester.
        """
        temporal_config = self.config["temporal_config"]

        self.chopper = Timechop(**temporal_config)

        cohort_settings = self.config.get("cohort_config", {})
        if "query" not in cohort_settings:
            logging.warning(
                "cohort_config missing or unrecognized. Without a cohort, "
                "you will not be able to make matrices or perform feature imputation."
            )
            self.cohort_table_generator = CohortTableGeneratorNoOp()
        else:
            self.cohort_table_generator = CohortTableGenerator(
                cohort_table_name=self.cohort_table_name,
                db_engine=self.db_engine,
                query=cohort_settings["query"],
                replace=self.replace,
            )

        if "label_config" not in self.config:
            self.label_generator = LabelGeneratorNoOp()
            logging.warning(
                "label_config missing or unrecognized. Without labels, "
                "you will not be able to make matrices."
            )
        else:
            label_settings = self.config["label_config"]
            self.label_generator = LabelGenerator(
                label_name=label_settings.get("name", None),
                query=label_settings["query"],
                replace=self.replace,
                db_engine=self.db_engine,
            )

        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name=self.features_schema_name,
            db_engine=self.db_engine,
        )

        self.feature_generator = FeatureGenerator(
            features_schema_name=self.features_schema_name,
            replace=self.replace,
            db_engine=self.db_engine,
            feature_start_time=temporal_config["feature_start_time"],
            materialize_subquery_fromobjs=self.materialize_subquery_fromobjs,
        )

        self.feature_group_creator = FeatureGroupCreator(
            self.config.get("feature_group_definition", {"all": [True]})
        )

        self.feature_group_mixer = FeatureGroupMixer(
            self.config.get("feature_group_strategies", ["all"])
        )

        # Hoist the repeated config lookups used by the planner and builder.
        label_config = self.config.get("label_config", {})
        self.planner = Planner(
            feature_start_time=dt_from_str(temporal_config["feature_start_time"]),
            label_names=[label_config.get("name", DEFAULT_LABEL_NAME)],
            label_types=["binary"],
            cohort_names=[cohort_settings.get("name", None)],
            user_metadata=self.config.get("user_metadata", {}),
        )

        self.matrix_builder = MatrixBuilder(
            db_config={
                "features_schema_name": self.features_schema_name,
                "labels_schema_name": "public",
                "labels_table_name": self.labels_table_name,
                "cohort_table_name": self.cohort_table_name,
            },
            matrix_storage_engine=self.matrix_storage_engine,
            experiment_hash=self.experiment_hash,
            include_missing_labels_in_train_as=label_config.get(
                "include_missing_labels_in_train_as", None
            ),
            engine=self.db_engine,
            replace=self.replace,
        )

        self.trainer = ModelTrainer(
            experiment_hash=self.experiment_hash,
            model_storage_engine=self.model_storage_engine,
            model_grouper=ModelGrouper(self.config.get("model_group_keys", [])),
            db_engine=self.db_engine,
            replace=self.replace,
        )

        self.tester = ModelTester(
            model_storage_engine=self.model_storage_engine,
            matrix_storage_engine=self.matrix_storage_engine,
            replace=self.replace,
            db_engine=self.db_engine,
            individual_importance_config=self.config.get("individual_importance", {}),
            evaluator_config=self.config.get("scoring", {}),
        )