Esempio n. 1
0
    def test_hdf_matrix(self):
        """Build a 'test' matrix with HDFMatrixStore as the storage class.

        The stored matrix is expected to contain 5 rows.
        """
        with testing.postgresql.Postgresql() as postgresql:
            # throwaway database populated with fake feature data
            engine = create_engine(postgresql.url())
            ensure_db(engine)
            create_schemas(
                engine=engine,
                features_tables=features_tables,
                labels=labels,
                states=states,
            )

            with get_matrix_storage_engine() as matrix_storage_engine:
                # swap the storage class before the builder is created
                matrix_storage_engine.matrix_storage_class = HDFMatrixStore
                builder = MatrixBuilder(
                    db_config=db_config,
                    matrix_storage_engine=matrix_storage_engine,
                    engine=engine,
                )

                uuid = metta.generate_uuid(self.good_metadata)
                build_kwargs = dict(
                    as_of_times=self.good_dates,
                    label_name='booking',
                    label_type='binary',
                    feature_dictionary=self.good_feature_dictionary,
                    matrix_metadata=self.good_metadata,
                    matrix_uuid=uuid,
                    matrix_type='test',
                )
                builder.build_matrix(**build_kwargs)

                stored_matrix = matrix_storage_engine.get_store(uuid).matrix
                assert len(stored_matrix) == 5
Esempio n. 2
0
    def test_retry_max(self):
        """Training against a vanished database should retry, then raise.

        The trainer is created while a temporary Postgres server is alive.
        ``train_models`` is only called after that server's ``with`` block has
        exited, and is expected to raise ``sqlalchemy.exc.OperationalError``
        after sleeping between attempts more than 5 times.
        """
        db_engine = None
        trainer = None
        # set up a basic model training run
        # TODO abstract the setup of a basic model training run where
        # we don't worry about the specific values used? it would make
        # tests like this require a bit less noise to read past
        with testing.postgresql.Postgresql() as postgresql:
            db_engine = create_engine(postgresql.url())
            ensure_db(db_engine)
            init_engine(db_engine)
            storage_engine = InMemoryModelStorageEngine(project_path='')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=storage_engine,
                db_engine=db_engine,
                model_grouper=ModelGrouper(),
            )

        # the postgres server went out of scope above and no longer exists
        with patch('time.sleep') as sleep_mock:
            with self.assertRaises(sqlalchemy.exc.OperationalError):
                trainer.train_models(
                    grid_config(),
                    dict(),
                    sample_matrix_store(),
                )
            # only check that the retrying module is being used sanely,
            # not the exact number of calls specified by the code
            sleep_count = len(sleep_mock.mock_calls)
            assert sleep_count > 5
Esempio n. 3
0
def test_model_grouping_default_config(sample_metadata):
    """Exercise ModelGrouper with its default model group keys.

    The assertions expect ids 1, 2 and 3 as distinct groups are requested
    against a freshly prepared database.
    """
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        model_grouper = ModelGrouper()

        def group_id_for(class_path, metadata):
            # every lookup uses the same hyperparameters; a fresh dict is
            # passed on each call
            return model_grouper.get_model_group_id(
                class_path, {"param1": "val1"}, metadata, engine)

        # get the basic first model group with our default matrix
        assert group_id_for("module.Classifier", sample_metadata) == 1

        # the end time is not by default a model group key so changing it
        # should still get us the same group
        metadata_new_end_time = copy(sample_metadata)
        metadata_new_end_time["end_time"] = datetime.date(2017, 3, 20)
        assert group_id_for("module.Classifier", metadata_new_end_time) == 1

        # max_training_history is a default key,
        # so it should trigger a new group
        metadata_train_history = copy(sample_metadata)
        metadata_train_history["max_training_history"] = "3y"
        assert group_id_for("module.Classifier", metadata_train_history) == 2

        # classifier is of course a default key as well
        assert group_id_for("module.OtherClassifier", sample_metadata) == 3
Esempio n. 4
0
def test_n_jobs_not_new_model():
    """Check that varying ``n_jobs`` does not create extra tasks or model groups.

    The grid lists two ``n_jobs`` values for the random forest; the test
    expects 35 train tasks (32 forest + 3 AdaBoost) and no ``n_jobs`` entry in
    the stored model group parameters.
    """
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {
            'n_estimators': [10, 100, 1000],
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'n_jobs': [12, 24],
        },
    }

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            storage_engine = S3ModelStorageEngine('econ-dev/inspections')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=storage_engine,
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan'],
            )

            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad'],
            })
            matrix_metadata = {
                'label_timespan': '1d',
                'end_time': datetime.datetime.now(),
                'feature_start_time': datetime.date(2012, 12, 20),
                'label_name': 'label',
                'metta-uuid': '1234',
                'feature_names': ['ft1', 'ft2'],
                'indices': ['entity_id'],
            }
            matrix_store = InMemoryMatrixStore(matrix, matrix_metadata)
            train_tasks = trainer.generate_train_tasks(
                grid_config, dict(), matrix_store)

            # 32+3, would be (32*2)+3 if we didn't remove
            assert len(train_tasks) == 35
            tasks_with_n_jobs = sum(
                1 for task in train_tasks if 'n_jobs' in task['parameters'])
            assert tasks_with_n_jobs == 32

            for train_task in train_tasks:
                trainer.process_train_task(**train_task)

            stored_parameters = engine.execute(
                'select model_parameters from results.model_groups')
            for row in stored_parameters:
                assert 'n_jobs' not in row[0]
Esempio n. 5
0
def test_uniform_distribution_entity_id_index():
    """Run ``uniform_distribution`` on a two-entity matrix and check each result.

    Ten feature importances are created for one model; every feature column in
    the matrix holds the constant 0.5 for both entities.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(
                model_rel=model,
                feature='feature_{}'.format(idx),
            )
            for idx in range(10)
        ]
        data_dict = {'entity_id': [1, 2]}
        data_dict.update(
            {imp.feature: [0.5, 0.5] for imp in feature_importances})
        test_store = InMemoryMatrixStore(
            matrix=pandas.DataFrame.from_dict(data_dict),
            metadata=sample_metadata(),
        )
        session.commit()
        results = uniform_distribution(
            db_engine,
            model_id=model.model_id,
            as_of_date='2016-01-01',
            test_matrix_store=test_store,
            n_ranks=5,
        )

        assert len(results) == 10  # 5 features x 2 entities
        required_keys = ('entity_id', 'feature_name', 'score', 'feature_value')
        for result in results:
            for key in required_keys:
                assert key in result
            assert result['feature_value'] == 0.5
            assert 0 <= result['score'] <= 1
            assert isinstance(result['feature_name'], str)
            assert result['entity_id'] in [1, 2]
Esempio n. 6
0
def test_custom_groups(sample_matrix_store, grid_config):
    """Train with a ModelGrouper built from ``['class_path']`` only.

    After training, the models table is expected to reference exactly one
    distinct model group id.
    """
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        init_engine(engine)

        with mock_s3():
            boto3.resource('s3').create_bucket(Bucket='econ-dev')

            MatrixFactory(matrix_uuid="1234")
            session.commit()
            # create training set
            project_path = 'econ-dev/inspections'
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=S3ModelStorageEngine(project_path),
                model_grouper=ModelGrouper(['class_path']),
                db_engine=engine,
            )
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store,
            )
            # expect only one model group now
            group_rows = engine.execute(
                'select distinct model_group_id from model_metadata.models')
            records = []
            for row in group_rows:
                records.append(row[0])
            assert len(records) == 1
            assert records[0] == model_ids[0]
Esempio n. 7
0
    def test_test_matrix(self):
        """Build a matrix with ``matrix_type="test"`` and check its size.

        The stored design matrix is expected to contain 5 rows.
        """
        with testing.postgresql.Postgresql() as postgresql:
            # throwaway database populated with fake feature data
            engine = create_engine(postgresql.url())
            ensure_db(engine)
            create_schemas(engine=engine,
                           features_tables=features_tables,
                           labels=labels,
                           states=states)

            with get_matrix_storage_engine() as matrix_storage_engine:
                builder = MatrixBuilder(db_config=db_config,
                                        matrix_storage_engine=matrix_storage_engine,
                                        experiment_hash=experiment_hash,
                                        engine=engine)

                uuid = filename_friendly_hash(self.good_metadata)
                build_kwargs = dict(
                    as_of_times=self.good_dates,
                    label_name="booking",
                    label_type="binary",
                    feature_dictionary=self.good_feature_dictionary,
                    matrix_metadata=self.good_metadata,
                    matrix_uuid=uuid,
                    matrix_type="test",
                )
                builder.build_matrix(**build_kwargs)

                design_matrix = matrix_storage_engine.get_store(uuid).design_matrix
                assert len(design_matrix) == 5
Esempio n. 8
0
def test_restart_experiment(experiment_class):
    """Run an experiment, then run it again with ``replace=False``.

    On the second run ``make_entity_date_table`` is replaced by a mock and is
    expected not to be called.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        populate_source_data(db_engine)
        with TemporaryDirectory() as temp_dir:
            # both runs share the database, storage class and project path
            shared_kwargs = dict(
                db_engine=db_engine,
                model_storage_class=FSModelStorageEngine,
                project_path=os.path.join(temp_dir, 'inspections'),
            )

            first_run = experiment_class(config=sample_config(),
                                         **shared_kwargs)
            first_run.run()

            evaluations = num_linked_evaluations(db_engine)
            assert evaluations > 0

            second_run = experiment_class(config=sample_config(),
                                          replace=False,
                                          **shared_kwargs)
            second_run.make_entity_date_table = mock.Mock()
            second_run.run()
            assert not second_run.make_entity_date_table.called
Esempio n. 9
0
def test_model_grouping_custom_config(sample_metadata):
    """Exercise ModelGrouper with custom model group keys.

    The grouper is keyed on ``feature_names`` and ``as_of_date_frequency``;
    the assertions expect ids 1, 2 and 3 as distinct groups are requested.
    """
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        model_grouper = ModelGrouper(
            model_group_keys=["feature_names", "as_of_date_frequency"])

        def group_id_for(class_path, metadata):
            # every lookup uses the same hyperparameters; a fresh dict is
            # passed on each call
            return model_grouper.get_model_group_id(
                class_path, {"param1": "val1"}, metadata, engine)

        # get the basic first model group with our default matrix
        assert group_id_for("module.Classifier", sample_metadata) == 1

        # classifier is now not a key, so changing it should not get a new id
        assert group_id_for("module.OtherClassifier", sample_metadata) == 1

        # as_of_date_frequency is a key,
        # so it should trigger a new group
        metadata_frequency = copy(sample_metadata)
        metadata_frequency["as_of_date_frequency"] = "2w"
        assert group_id_for("module.Classifier", metadata_frequency) == 2

        # testing feature names may seem redundant but it is on a separate
        # code path so make sure its logic works
        metadata_features = copy(sample_metadata)
        metadata_features["feature_names"] = ["ft1", "ft3"]
        assert group_id_for("module.Classifier", metadata_features) == 3
Esempio n. 10
0
    def filter_same_train_end_times(self, engine):
        """Seed four model groups and run ``model_groups_filter`` over them.

        Groups 1 and 3 get a model for each of the four requested train end
        times; group 2 gets one model and group 4 gets two.

        Returns whatever ``model_groups_filter`` returns for the seeded data.
        """
        ensure_db(engine)
        init_engine(engine)
        groups = {
            group_id: ModelGroupFactory(model_group_id=group_id,
                                        model_type=model_type)
            for group_id, model_type in (
                (1, 'modelType1'),
                (2, 'modelType2'),
                (3, 'modelType3'),
                (4, 'modelType4'),
            )
        }
        every_year = (2014, 2015, 2016, 2017)

        # model group 1: all four train end times
        for year in every_year:
            ModelFactory(model_group_rel=groups[1],
                         train_end_time=datetime(year, 1, 1))
        # model group 2 only has one of the four timestamps, should not pass
        ModelFactory(model_group_rel=groups[2],
                     train_end_time=datetime(2014, 1, 1))
        # model group 3: all four train end times
        for year in every_year:
            ModelFactory(model_group_rel=groups[3],
                         train_end_time=datetime(year, 1, 1))
        # model group 4 only has two of the four timestamps, should not pass
        for year in (2015, 2016):
            ModelFactory(model_group_rel=groups[4],
                         train_end_time=datetime(year, 1, 1))

        session.commit()
        return model_groups_filter(
            train_end_times=[
                '2014-01-01', '2015-01-01', '2016-01-01', '2017-01-01'
            ],
            initial_model_group_ids=[1, 2, 3, 4],
            models_table='models',
            db_engine=engine,
        )
Esempio n. 11
0
def test_baseline_exception_handling(sample_matrix_store):
    """Process two baseline train tasks and expect the second to yield None.

    The grid asks PercentileRankOneFeature for 'feature_one' and
    'feature_three'; the resulting model ids are expected to be ``[1, None]``.
    """
    grid_config = {
        'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature': {
            'feature': ['feature_one', 'feature_three'],
        },
    }
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        project_path = 'econ-dev/inspections'
        model_storage_engine = S3ModelStorageEngine(project_path)
        ensure_db(db_engine)
        init_engine(db_engine)
        with mock_s3():
            boto3.resource('s3').create_bucket(Bucket='econ-dev')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=db_engine,
                model_grouper=ModelGrouper(),
            )

            train_tasks = trainer.generate_train_tasks(
                grid_config, dict(), sample_matrix_store)
            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            model_ids = [
                trainer.process_train_task(**train_task)
                for train_task in train_tasks
            ]
            assert model_ids == [1, None]
Esempio n. 12
0
def test_build_error_cleanup_timeout(_clean_up_mock, experiment_class):
    """A build error followed by a cleanup timeout should surface both errors.

    ``generate_matrices`` is patched to raise ``RuntimeError``; calling the
    experiment is expected to raise ``TimeoutError`` whose ``__context__`` is
    that original error.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)

        with TemporaryDirectory() as temp_dir:
            project_path = os.path.join(temp_dir, 'inspections')
            experiment = experiment_class(
                config=sample_config(),
                db_engine=db_engine,
                model_storage_class=FSModelStorageEngine,
                project_path=project_path,
                cleanup=True,
                cleanup_timeout=0.02,  # Set short timeout
            )

            build_patch = mock.patch.object(experiment, 'generate_matrices')
            with build_patch as build_mock:
                build_error = RuntimeError('boom!')
                build_mock.side_effect = build_error

                with pytest.raises(TimeoutError) as exc_info:
                    experiment()

    # Last exception is TimeoutError, but earlier error is preserved in
    # __context__, and will be noted as well in any standard traceback:
    assert exc_info.value.__context__ is build_mock.side_effect
Esempio n. 13
0
def test_model_grouping_custom_config(sample_metadata):
    """Exercise ModelGrouper with custom model group keys.

    The grouper is keyed on ``feature_names`` and ``as_of_date_frequency``;
    the assertions expect ids 1, 2 and 3 as distinct groups are requested.
    """
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        model_grouper = ModelGrouper(
            model_group_keys=['feature_names', 'as_of_date_frequency'])

        def lookup(class_path, metadata):
            # hyperparameters are identical for every lookup; a fresh dict
            # is passed on each call
            return model_grouper.get_model_group_id(
                class_path, {'param1': 'val1'}, metadata, engine)

        # get the basic first model group with our default matrix
        assert lookup('module.Classifier', sample_metadata) == 1

        # classifier is now not a key, so changing it should not get a new id
        assert lookup('module.OtherClassifier', sample_metadata) == 1

        # as_of_date_frequency is a key,
        # so it should trigger a new group
        metadata_frequency = copy(sample_metadata)
        metadata_frequency['as_of_date_frequency'] = '2w'
        assert lookup('module.Classifier', metadata_frequency) == 2

        # testing feature names may seem redundant but it is on a separate
        # code path so make sure its logic works
        metadata_features = copy(sample_metadata)
        metadata_features['feature_names'] = ['ft1', 'ft3']
        assert lookup('module.Classifier', metadata_features) == 3
Esempio n. 14
0
    def test_replace_false_rerun(self):
        """Build a matrix twice with ``replace=False``.

        Before the second build ``make_entity_date_table`` is swapped for a
        mock, which is expected not to be called.
        """
        with testing.postgresql.Postgresql() as postgresql:
            # throwaway database populated with fake feature data
            engine = create_engine(postgresql.url())
            ensure_db(engine)
            create_schemas(
                engine=engine,
                features_tables=features_tables,
                labels=labels,
                states=states,
            )

            dates = [
                datetime.datetime(2016, month, 1, 0, 0)
                for month in (1, 2, 3)
            ]

            with get_matrix_storage_engine() as matrix_storage_engine:
                builder = MatrixBuilder(
                    db_config=db_config,
                    matrix_storage_engine=matrix_storage_engine,
                    engine=engine,
                    replace=False,
                )

                feature_dictionary = {
                    'features0': ['f1', 'f2'],
                    'features1': ['f3', 'f4'],
                }
                matrix_metadata = {
                    'matrix_id': 'hi',
                    'state': 'state_one AND state_two',
                    'label_name': 'booking',
                    'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                    'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                    'label_timespan': '1 month',
                    'test_duration': '1 month',
                    'indices': ['entity_id', 'as_of_date'],
                }
                uuid = metta.generate_uuid(matrix_metadata)
                # identical arguments are used for the first build and the rerun
                build_args = dict(
                    as_of_times=dates,
                    label_name='booking',
                    label_type='binary',
                    feature_dictionary=feature_dictionary,
                    matrix_metadata=matrix_metadata,
                    matrix_uuid=uuid,
                    matrix_type='test',
                )
                builder.build_matrix(**build_args)

                stored_matrix = matrix_storage_engine.get_store(uuid).matrix
                assert len(stored_matrix) == 5
                # rerun
                builder.make_entity_date_table = Mock()
                builder.build_matrix(**build_args)
                assert not builder.make_entity_date_table.called
Esempio n. 15
0
def test_experiment_validator():
    """Run ExperimentValidator on the "query" and "filepath" sample configs.

    ``triage.util.conf.open`` is patched with ``open_side_effect`` while the
    validators run; the test passes if neither run raises.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        populate_source_data(db_engine)
        open_patch = mock.patch("triage.util.conf.open",
                                side_effect=open_side_effect)
        with open_patch:
            for config_kind in ("query", "filepath"):
                validator = ExperimentValidator(db_engine)
                validator.run(sample_config(config_kind))
Esempio n. 16
0
        def replace_db(arg):
            """Start a new Postgres server on ``port`` and seed a matrix row.

            ``arg`` is accepted but not used. The new server is stored on
            ``self.new_server``.
            """
            replacement_server = testing.postgresql.Postgresql(port=port)
            self.new_server = replacement_server
            fresh_engine = create_engine(replacement_server.url())
            ensure_db(fresh_engine)
            init_engine(fresh_engine)

            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()
Esempio n. 17
0
def test_save_experiment_and_get_hash():
    """Saving the same config twice should return the same string hash."""
    # no reason to make assertions on the config itself, use a basic dict
    experiment_config = {"one": "two"}
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)

        first_hash = save_experiment_and_get_hash(experiment_config, engine)
        assert isinstance(first_hash, str)

        # saving the identical config again must not produce a different hash
        second_hash = save_experiment_and_get_hash(experiment_config, engine)
        assert second_hash == first_hash
Esempio n. 18
0
    def test_replace_true_rerun(self):
        """Build the same matrix twice with ``replace=True``.

        After each build the stored design matrix is expected to have 5 rows
        and a ``Matrix`` record for the uuid is expected to exist.
        """
        with testing.postgresql.Postgresql() as postgresql:
            # throwaway database populated with fake feature data
            engine = create_engine(postgresql.url())
            ensure_db(engine)
            create_schemas(engine=engine,
                           features_tables=features_tables,
                           labels=labels,
                           states=states)
            matrix_metadata = matrix_metadata_creator(
                state="active",
                test_duration="1month",
                label_name="booking",
            )

            dates = [
                datetime.datetime(2016, month, 1, 0, 0)
                for month in (1, 2, 3)
            ]

            feature_dictionary = {
                "features0": ["f1", "f2"],
                "features1": ["f3", "f4"],
            }
            uuid = filename_friendly_hash(matrix_metadata)
            build_args = {
                "as_of_times": dates,
                "label_name": "booking",
                "label_type": "binary",
                "feature_dictionary": feature_dictionary,
                "matrix_metadata": matrix_metadata,
                "matrix_uuid": uuid,
                "matrix_type": "test",
            }

            with get_matrix_storage_engine() as matrix_storage_engine:
                builder = MatrixBuilder(
                    db_config=db_config,
                    matrix_storage_engine=matrix_storage_engine,
                    experiment_hash=experiment_hash,
                    engine=engine,
                    replace=True,
                )

                # first pass is the initial build, second pass is the rerun;
                # the same checks apply after each
                for _ in range(2):
                    builder.build_matrix(**build_args)

                    design_matrix = matrix_storage_engine.get_store(
                        uuid).design_matrix
                    assert len(design_matrix) == 5
                    assert builder.sessionmaker().query(Matrix).get(uuid)
Esempio n. 19
0
def update_ranks_test(predictor,
                      entities_scores_labels,
                      rank_col,
                      expected_result,
                      model_random_seed=12345,
                      need_seed_data=True):
    """Not a test in itself but rather a utility called by many of the ranking tests

    Optionally seeds one matrix, one model and its predictions, asks the
    predictor to write ranks, and then asserts on both the rank column and
    the prediction metadata row.

    Args:
        predictor: object providing ``db_engine``, ``rank_order`` and
            ``update_db_with_ranks``
        entities_scores_labels: iterable of (entity_id, score, label) tuples
            used to create predictions when seeding
        rank_col: name of the rank column to select and order by
        expected_result: expected tuple of (entity_id, rank) rows, ordered by
            ``rank_col`` ascending
        model_random_seed: random seed given to the seeded model and compared
            against what is read back from the database
        need_seed_data: when falsy, no matrix/model/prediction rows are created
            here (presumably the caller has already created them)
    """
    ensure_db(predictor.db_engine)
    init_engine(predictor.db_engine)
    model_id = 5
    matrix_uuid = "4567"
    matrix_type = "test"
    as_of_date = datetime.datetime(2012, 1, 1)
    if need_seed_data:
        matrix = MatrixFactory(matrix_uuid=matrix_uuid)
        model = ModelFactory(model_id=model_id, random_seed=model_random_seed)
        for entity_id, score, label in entities_scores_labels:
            PredictionFactory(model_rel=model,
                              matrix_rel=matrix,
                              as_of_date=as_of_date,
                              entity_id=entity_id,
                              score=score,
                              label_value=int(label))
        factory_session.commit()
    predictor.update_db_with_ranks(
        model_id=model_id,
        matrix_uuid=matrix_uuid,
        matrix_type=TestMatrixType,
    )
    ranks = tuple(row for row in predictor.db_engine.execute(
        f'''
select entity_id, {rank_col}::float
from {matrix_type}_results.predictions
where as_of_date = %s and model_id = %s and matrix_uuid = %s order by {rank_col} asc''',
        (as_of_date, model_id, matrix_uuid)))
    assert ranks == expected_result

    # Test that the predictions metadata table is populated
    metadata_records = [
        row for row in predictor.db_engine.execute(
            f"""select tiebreaker_ordering, prediction_metadata.random_seed, models.random_seed
        from {matrix_type}_results.prediction_metadata
        join triage_metadata.models using (model_id)
        join triage_metadata.matrices using (matrix_uuid)
        """)
    ]
    assert len(metadata_records) == 1
    tiebreaker_ordering, random_seed, received_model_random_seed = metadata_records[
        0]
    if tiebreaker_ordering == 'random':
        # compare by value: an integer read back from the database is not
        # guaranteed to be the same object as the seed passed in
        assert random_seed == model_random_seed
    else:
        assert not random_seed
    assert tiebreaker_ordering == predictor.rank_order
    assert received_model_random_seed == model_random_seed
Esempio n. 20
0
def test_Audition():
    """Seed model groups, models and evaluations, then run AuditionRunner.

    Ten model groups with four train end times each are created, every model
    gets one evaluation per metric, and the run is expected to leave 6 entries
    in the output directory.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        num_model_groups = 10
        model_types = [
            "classifier type {}".format(idx)
            for idx in range(num_model_groups)
        ]
        model_groups = [ModelGroupFactory(model_type=name)
                        for name in model_types]
        train_end_times = [datetime(year, 1, 1)
                           for year in (2013, 2014, 2015, 2016)]

        # one model per (model group, train end time) pair, grouped by group
        models = []
        for model_group in model_groups:
            for train_end_time in train_end_times:
                models.append(
                    ModelFactory(model_group_rel=model_group,
                                 train_end_time=train_end_time))
        metrics = [
            ("precision@", "100_abs"),
            ("recall@", "100_abs"),
            ("precision@", "50_abs"),
            ("recall@", "50_abs"),
            ("fpr@", "10_pct"),
        ]

        class ImmediateEvalFactory(EvaluationFactory):
            # evaluation starts at the model's own train end time
            evaluation_start_time = factory.LazyAttribute(
                lambda o: o.model_rel.train_end_time)

        for model in models:
            for metric, parameter in metrics:
                ImmediateEvalFactory(
                    model_rel=model,
                    metric=metric,
                    parameter=parameter,
                )

        session.commit()

        with tempfile.TemporaryDirectory() as td, \
                mock.patch('os.getcwd') as mock_getcwd:
            # os.getcwd() reports the temporary directory while patched
            mock_getcwd.return_value = td
            runner = AuditionRunner(config_dict=config,
                                    db_engine=db_engine,
                                    directory=td)
            runner.run()
            assert len(os.listdir(os.getcwd())) == 6
Esempio n. 21
0
    def test_retry_recovery(self):
        """Training should succeed once the database comes back after one sleep.

        The trainer is created against a temporary Postgres server that is gone
        by the time training starts. The patched ``time.sleep`` starts a new
        server on the same port, and exactly one sleep call is expected.
        """
        grid_config = {
            'sklearn.ensemble.AdaBoostClassifier': {
                'n_estimators': [10],
            },
        }

        engine = None
        trainer = None
        port = None
        with testing.postgresql.Postgresql() as postgresql:
            port = postgresql.settings['port']
            engine = create_engine(postgresql.url())
            ensure_db(engine)
            storage_engine = InMemoryModelStorageEngine(project_path='')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=storage_engine,
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan'],
            )

            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad'],
            })
            matrix_metadata = {
                'label_timespan': '1d',
                'end_time': datetime.datetime.now(),
                'feature_start_time': datetime.date(2012, 12, 20),
                'label_name': 'label',
                'metta-uuid': '1234',
                'feature_names': ['ft1', 'ft2'],
                'indices': ['entity_id'],
            }
            matrix_store = InMemoryMatrixStore(matrix, matrix_metadata)

        # start without a database server
        # then bring it back up after the first sleep
        # use self so it doesn't go out of scope too early and shut down
        self.new_server = None

        def replace_db(arg):
            # bring a server back up on the port the trainer's engine uses
            self.new_server = testing.postgresql.Postgresql(port=port)
            revived_engine = create_engine(self.new_server.url())
            ensure_db(revived_engine)

        with patch('time.sleep') as sleep_mock:
            sleep_mock.side_effect = replace_db
            try:
                trainer.train_models(grid_config, dict(), matrix_store)
            finally:
                # stop the replacement server even if training failed
                if self.new_server is not None:
                    self.new_server.stop()
            assert len(sleep_mock.mock_calls) == 1
Esempio n. 22
0
def rig_engines():
    """Provision a throwaway database engine and project storage.

    A temporary Postgres server is started and an engine is created from its
    URL, then passed through ``ensure_db`` and ``init_engine``. A
    ``ProjectStorage`` is built on top of a fresh temporary directory. Both
    context managers stay open while the caller holds the yielded values and
    are exited once the generator is resumed or closed.

    Yields:
        tuple: (database engine, project storage engine)
    """
    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())
        ensure_db(engine)
        init_engine(engine)
        with tempfile.TemporaryDirectory() as storage_root:
            yield engine, ProjectStorage(storage_root)
Esempio n. 23
0
    def __init__(
        self,
        config,
        db_engine,
        model_storage_class=FSModelStorageEngine,
        project_path=None,
        replace=True,
        cleanup=False,
        cleanup_timeout=None,
    ):
        """Validate the config, wire up storage and register the experiment.

        Args:
            config: experiment configuration. It is passed to
                ``_check_config_version`` before anything else is done and
                later to ``save_experiment_and_get_hash``.
            db_engine: database engine. If it is a raw ``Engine`` instance a
                warning is logged and a new engine is created from its URL
                only; any other object is stored unchanged.
            model_storage_class: callable invoked with ``project_path=...`` to
                build ``self.model_storage_engine``. When falsy, that
                attribute is not set at all.
            project_path: root path of the project. When truthy, a
                ``matrices`` sub-directory is created beneath it and stored
                as ``self.matrices_directory``; otherwise that attribute is
                not set.
            replace: stored as ``self.replace``.
            cleanup: stored as ``self.cleanup``; here it only selects which
                informational message is logged.
            cleanup_timeout: when not ``None``, stored as
                ``self.cleanup_timeout``. When ``None``, the value already
                reachable as ``self.cleanup_timeout`` is kept (presumably a
                class-level default -- confirm against the class body).

        Raises:
            FileExistsError: if ``<project_path>/matrices`` already exists
                but is not a directory.
        """
        self._check_config_version(config)
        self.config = config

        if isinstance(db_engine, Engine):
            logging.warning(
                'Raw, unserializable SQLAlchemy engine passed. URL will be used, other options may be lost in multi-process environments'
            )
            # Rebuild from the URL alone so the stored engine does not carry
            # over the state of the one that was handed in.
            self.db_engine = create_engine(db_engine.url)
        else:
            self.db_engine = db_engine

        if model_storage_class:
            self.model_storage_engine = model_storage_class(
                project_path=project_path)
        self.matrix_store_class = CSVMatrixStore  # can't be configurable until Architect obeys
        self.project_path = project_path
        self.replace = replace
        ensure_db(self.db_engine)

        self.features_schema_name = 'features'
        if project_path:
            self.matrices_directory = os.path.join(self.project_path,
                                                   'matrices')
            # exist_ok=True replaces the former exists()-then-makedirs() pair,
            # which could raise if another process created the directory
            # between the check and the creation.
            os.makedirs(self.matrices_directory, exist_ok=True)

        self.experiment_hash = save_experiment_and_get_hash(
            self.config, self.db_engine)
        self.labels_table_name = 'labels_{}'.format(self.experiment_hash)
        self.initialize_components()

        # NOTE: cleanup settings are assigned only after
        # initialize_components() has returned.
        self.cleanup = cleanup
        if self.cleanup:
            logging.info(
                'cleanup is set to True, so intermediate tables (labels and states) will be removed after matrix creation'
            )
        else:
            logging.info(
                'cleanup is set to False, so intermediate tables (labels and states) will not be removed after matrix creation'
            )
        # Reading self.cleanup_timeout on the right-hand side resolves to
        # whatever the instance/class already provides when no override is
        # given.
        self.cleanup_timeout = (self.cleanup_timeout if cleanup_timeout is None
                                else cleanup_timeout)
Esempio n. 24
0
    def test_train_matrix(self):
        """Build a 'train' matrix with HighMemoryCSVBuilder and count CSV rows.

        Fake feature, label and state tables are created in a temporary
        Postgres database, a matrix is built for three monthly as-of dates,
        and the resulting ``<uuid>.csv`` file is expected to contain six rows.
        """
        as_of_dates = [
            datetime.datetime(2016, month, 1, 0, 0) for month in (1, 2, 3)
        ]
        metadata = {
            'matrix_id': 'hi',
            'state': 'state_one AND state_two',
            'label_name': 'booking',
            'end_time': datetime.datetime(2016, 3, 1, 0, 0),
            'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
            'label_timespan': '1 month',
            'max_training_history': '1 month'
        }
        features_by_table = {
            'features0': ['f1', 'f2'],
            'features1': ['f3', 'f4'],
        }

        with testing.postgresql.Postgresql() as pg_server:
            # create an engine and generate a table with fake feature data
            db = create_engine(pg_server.url())
            ensure_db(db)
            create_schemas(
                engine=db,
                features_tables=features_tables,
                labels=labels,
                states=states,
            )

            with TemporaryDirectory() as out_dir:
                csv_builder = builders.HighMemoryCSVBuilder(
                    db_config=db_config,
                    matrix_directory=out_dir,
                    engine=db,
                )
                feature_group = FeatureGroup(
                    name='mygroup',
                    features_by_table=features_by_table,
                )
                matrix_uuid = metta.generate_uuid(metadata)
                csv_builder.build_matrix(
                    as_of_times=as_of_dates,
                    label_name='booking',
                    label_type='binary',
                    feature_dictionary=feature_group,
                    matrix_directory=out_dir,
                    matrix_metadata=metadata,
                    matrix_uuid=matrix_uuid,
                    matrix_type='train',
                )

                # The builder is expected to have written <uuid>.csv into
                # the directory it was given.
                csv_path = os.path.join(out_dir, '{}.csv'.format(matrix_uuid))
                with open(csv_path, 'r') as handle:
                    row_count = sum(1 for _ in csv.reader(handle))
                assert row_count == 6
Esempio n. 25
0
def test_n_jobs_not_new_model(sample_matrix_store):
    """Check that ``n_jobs`` neither multiplies train tasks nor reaches model groups.

    The grid contains two ``n_jobs`` values for the random forest. The test
    expects 35 train tasks in total, 32 of which carry an ``n_jobs``
    parameter, and after processing every task it expects no row of
    ``model_metadata.model_groups`` to mention ``n_jobs`` in its parameters.
    """
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {
            'n_estimators': [10, 100, 1000]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'n_jobs': [12, 24],
        }
    }

    with testing.postgresql.Postgresql() as pg_server:
        db_engine = create_engine(pg_server.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        with mock_s3():
            boto3.resource('s3').create_bucket(Bucket='econ-dev')
            storage = S3ModelStorageEngine('econ-dev/inspections')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=storage,
                db_engine=db_engine,
                model_grouper=ModelGrouper(),
            )

            tasks = trainer.generate_train_tasks(
                grid_config, dict(), sample_matrix_store)
            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            # 32+3, would be (32*2)+3 if we didn't remove
            assert len(tasks) == 35
            tasks_with_n_jobs = [
                task for task in tasks if 'n_jobs' in task['parameters']
            ]
            assert len(tasks_with_n_jobs) == 32

            for task in tasks:
                trainer.process_train_task(**task)

            stored_groups = db_engine.execute(
                'select model_parameters from model_metadata.model_groups')
            for group_row in stored_groups:
                assert 'n_jobs' not in group_row[0]
Esempio n. 26
0
def prepare_experiment(config):
    """Yield a SingleThreadedExperiment on temporary infrastructure.

    A temporary Postgres server is started, its database is passed through
    ``ensure_db`` and ``populate_source_data``, and the experiment is created
    with an ``inspections`` project path inside a temporary directory. Both
    temporary resources remain open while the caller holds the experiment.

    Args:
        config: experiment configuration handed to the experiment as-is.

    Yields:
        SingleThreadedExperiment: experiment created with ``cleanup=False``.
    """
    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())
        ensure_db(engine)
        populate_source_data(engine)
        with TemporaryDirectory() as workdir:
            project_path = os.path.join(workdir, 'inspections')
            yield SingleThreadedExperiment(
                config=config,
                db_engine=engine,
                model_storage_class=FSModelStorageEngine,
                project_path=project_path,
                cleanup=False,
            )
Esempio n. 27
0
    def test_load_if_right_version(self):
        """An experiment is constructed when the config carries CONFIG_VERSION."""
        config = sample_config()
        config['config_version'] = CONFIG_VERSION
        with testing.postgresql.Postgresql() as pg_server:
            engine = create_engine(pg_server.url())
            ensure_db(engine)
            with TemporaryDirectory() as workdir:
                project_path = os.path.join(workdir, 'inspections')
                experiment = SingleThreadedExperiment(
                    config=config,
                    db_engine=engine,
                    model_storage_class=FSModelStorageEngine,
                    project_path=project_path,
                )

        # Checked only after the temporary server and directory are gone.
        assert isinstance(experiment, SingleThreadedExperiment)
Esempio n. 28
0
def test_cleanup_timeout(_clean_up_mock, experiment_class):
    """Running an experiment with a very short cleanup timeout raises TimeoutError.

    Args:
        _clean_up_mock: unused in the body; presumably injected by a patch
            decorator or fixture defined outside this snippet -- confirm.
        experiment_class: experiment class under test, called with the same
            keyword arguments for every implementation.
    """
    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())
        ensure_db(engine)
        populate_source_data(engine)
        with TemporaryDirectory() as workdir:
            project_path = os.path.join(workdir, 'inspections')
            run_experiment = experiment_class(
                config=sample_config(),
                db_engine=engine,
                model_storage_class=FSModelStorageEngine,
                project_path=project_path,
                cleanup_timeout=0.02,  # Set short timeout
            )
            with pytest.raises(TimeoutError):
                run_experiment()
Esempio n. 29
0
def test_custom_label_name(experiment_class):
    """A label name set in the config reaches the label generator and planner.

    Args:
        experiment_class: experiment class under test.
    """
    custom_name = 'custom_label_name'
    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())
        ensure_db(engine)
        experiment_config = sample_config()
        experiment_config['label_config']['name'] = custom_name
        with TemporaryDirectory() as workdir:
            project_path = os.path.join(workdir, 'inspections')
            experiment = experiment_class(
                config=experiment_config,
                db_engine=engine,
                model_storage_class=FSModelStorageEngine,
                project_path=project_path,
            )
            assert experiment.label_generator.label_name == custom_name
            assert experiment.planner.label_names == [custom_name]
Esempio n. 30
0
def test_prediction_ranks_multiple_dates(project_storage, db_engine):
    """Ranks must be correct when one matrix holds several as-of-dates.

    To keep everything else simple there are no ties within a date, so the
    tiebreaker logic is not exercised. The data covers two dates arranged so
    that a naive ranking method could be confused:
    - the entities appear in a different order on each date
    - each date contains an entity that the other one lacks
    """
    ensure_db(db_engine)
    init_engine(db_engine)
    predictor = Predictor(
        project_storage.model_storage_engine(),
        db_engine,
        'worst',
    )
    model_id = 5
    matrix_uuid = "4567"
    matrix_type = "test"
    first_date = datetime.datetime(2012, 1, 1)
    second_date = datetime.datetime(2013, 1, 1)
    # (entity_id, as_of_date, score)
    entities_dates_and_scores = (
        (23, first_date, 0.95),
        (34, first_date, 0.94),
        (45, second_date, 0.92),
        (23, second_date, 0.45),
    )
    # (entity_id, as_of_date, rank_abs_no_ties)
    expected_result = (
        (23, first_date, 1),
        (34, first_date, 2),
        (45, second_date, 3),
        (23, second_date, 4),
    )
    matrix = MatrixFactory(matrix_uuid=matrix_uuid)
    model = ModelFactory(model_id=model_id)
    for entity_id, as_of_date, score in entities_dates_and_scores:
        PredictionFactory(
            model_rel=model,
            matrix_rel=matrix,
            as_of_date=as_of_date,
            entity_id=entity_id,
            score=score,
        )
    factory_session.commit()
    predictor.update_db_with_ranks(
        model_id=model_id,
        matrix_uuid=matrix_uuid,
        matrix_type=TestMatrixType,
    )
    # Read the stored ranks back in rank order for comparison.
    rank_query = f'''
select entity_id, as_of_date, rank_abs_no_ties
from {matrix_type}_results.predictions
where model_id = %s and matrix_uuid = %s order by rank_abs_no_ties'''
    ranks = tuple(
        predictor.db_engine.execute(rank_query, (model_id, matrix_uuid)))
    assert ranks == expected_result