def test_custom_groups(sample_matrix_store, grid_config):
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        init_engine(engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')

            MatrixFactory(matrix_uuid="1234")
            session.commit()
            # create training set
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                model_grouper=ModelGrouper(['class_path']),
                db_engine=engine,
            )
            model_ids = trainer.train_models(grid_config=grid_config,
                                             misc_db_parameters=dict(),
                                             matrix_store=sample_matrix_store)
            # expect only one model group now
            records = [
                row[0] for row in engine.execute(
                    'select distinct model_group_id from model_metadata.models'
                )
            ]
            assert len(records) == 1
            assert records[0] == model_ids[0]
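
The grid_config fixture used by this test is not shown on this page. A minimal sketch of what it might contain, assuming a single-classifier grid (one class_path, 2 x 2 = 4 parameter combinations) so that grouping on ['class_path'] collapses everything into the single model group asserted above; the fixture name and body are illustrative:

import pytest

@pytest.fixture
def grid_config():
    # Hypothetical fixture: one classifier class with four parameter
    # combinations, mirroring the literal grid in test_integration below.
    return {
        'sklearn.linear_model.LogisticRegression': {
            'C': [0.00001, 0.0001],
            'penalty': ['l1', 'l2'],
        }
    }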
Example #2
def get_matrix_store(project_storage,
                     matrix=None,
                     metadata=None,
                     write_to_db=True):
    """Return a matrix store associated with the given project storage.
    Also adds an entry in the matrices table if it doesn't exist already

    Args:
        project_storage (triage.component.catwalk.storage.ProjectStorage) A project's storage
        matrix (dataframe, optional): A matrix to store. Defaults to the output of matrix_creator()
        metadata (dict, optional): matrix metadata.
            defaults to the output of matrix_metadata_creator()
    """
    if matrix is None:
        matrix = matrix_creator()
    if not metadata:
        metadata = matrix_metadata_creator()
    matrix["as_of_date"] = matrix["as_of_date"].apply(pd.Timestamp)
    matrix.set_index(MatrixStore.indices, inplace=True)
    matrix_store = project_storage.matrix_storage_engine().get_store(
        filename_friendly_hash(metadata))
    matrix_store.metadata = metadata
    new_matrix = matrix.copy()
    labels = new_matrix.pop(matrix_store.label_column_name)
    matrix_store.matrix_label_tuple = new_matrix, labels
    matrix_store.save()
    matrix_store.clear_cache()
    if write_to_db:
        if (session.query(Matrix).filter(
                Matrix.matrix_uuid == matrix_store.uuid).count() == 0):
            MatrixFactory(matrix_uuid=matrix_store.uuid)
            session.commit()
    return matrix_store
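
A short usage sketch for get_matrix_store, assuming a ProjectStorage built over a temporary directory; the tmp_path fixture and the test name are illustrative:

from triage.component.catwalk.storage import ProjectStorage

def test_get_matrix_store_defaults(tmp_path):
    # Illustrative caller: default matrix and metadata, skipping the db write.
    project_storage = ProjectStorage(str(tmp_path))
    store = get_matrix_store(project_storage, write_to_db=False)
    # The store's uuid is the filename-friendly hash of its metadata.
    assert store.uuid == filename_friendly_hash(store.metadata)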
def test_baseline_exception_handling(sample_matrix_store):
    grid_config = {
        'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature':
        {
            'feature': ['feature_one', 'feature_three']
        }
    }
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        project_path = 'econ-dev/inspections'
        model_storage_engine = S3ModelStorageEngine(project_path)
        ensure_db(db_engine)
        init_engine(db_engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            trainer = ModelTrainer(project_path='econ-dev/inspections',
                                   experiment_hash=None,
                                   model_storage_engine=model_storage_engine,
                                   db_engine=db_engine,
                                   model_grouper=ModelGrouper())

            train_tasks = trainer.generate_train_tasks(grid_config, dict(),
                                                       sample_matrix_store)
            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            model_ids = []
            for train_task in train_tasks:
                model_ids.append(trainer.process_train_task(**train_task))
            assert model_ids == [1, None]
        def replace_db(arg):
            # `self` and `port` are captured from the enclosing test's scope;
            # `arg` is unused and exists only to satisfy the caller's signature.
            self.new_server = testing.postgresql.Postgresql(port=port)
            db_engine = create_engine(self.new_server.url())
            ensure_db(db_engine)
            init_engine(db_engine)

            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()
Example #5
def update_ranks_test(predictor,
                      entities_scores_labels,
                      rank_col,
                      expected_result,
                      model_random_seed=12345,
                      need_seed_data=True):
    """Not a test in itself but rather a utility called by many of the ranking tests"""
    ensure_db(predictor.db_engine)
    init_engine(predictor.db_engine)
    model_id = 5
    matrix_uuid = "4567"
    matrix_type = "test"
    as_of_date = datetime.datetime(2012, 1, 1)
    if need_seed_data:
        matrix = MatrixFactory(matrix_uuid=matrix_uuid)
        model = ModelFactory(model_id=model_id, random_seed=model_random_seed)
        for entity_id, score, label in entities_scores_labels:
            PredictionFactory(model_rel=model,
                              matrix_rel=matrix,
                              as_of_date=as_of_date,
                              entity_id=entity_id,
                              score=score,
                              label_value=int(label))
        factory_session.commit()
    predictor.update_db_with_ranks(
        model_id=model_id,
        matrix_uuid=matrix_uuid,
        matrix_type=TestMatrixType,
    )
    ranks = tuple(row for row in predictor.db_engine.execute(
        f'''
select entity_id, {rank_col}::float
from {matrix_type}_results.predictions
where as_of_date = %s and model_id = %s and matrix_uuid = %s order by {rank_col} asc''',
        (as_of_date, model_id, matrix_uuid)))
    assert ranks == expected_result

    # Test that the predictions metadata table is populated
    metadata_records = [
        row for row in predictor.db_engine.execute(
            f"""select tiebreaker_ordering, prediction_metadata.random_seed, models.random_seed
        from {matrix_type}_results.prediction_metadata
        join triage_metadata.models using (model_id)
        join triage_metadata.matrices using (matrix_uuid)
        """)
    ]
    assert len(metadata_records) == 1
    tiebreaker_ordering, random_seed, received_model_random_seed = metadata_records[0]
    if tiebreaker_ordering == 'random':
        assert random_seed == model_random_seed
    else:
        assert not random_seed
    assert tiebreaker_ordering == predictor.rank_order
    assert received_model_random_seed == model_random_seed
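
Since update_ranks_test is only a helper, here is a hedged sketch of one concrete caller, assuming a predictor fixture provided elsewhere in the suite; the entity ids, scores, and expected tuples are illustrative (the helper casts the rank column to float, so the expected ranks are floats):

def test_rank_abs_no_ties(predictor):
    # Illustrative caller: three entities with distinct scores; the highest
    # score should receive absolute rank 1.
    update_ranks_test(
        predictor,
        entities_scores_labels=[(10, 0.9, 1), (20, 0.5, 0), (30, 0.1, 0)],
        rank_col='rank_abs_no_ties',
        expected_result=((10, 1.0), (20, 2.0), (30, 3.0)),
    )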
Example #6
def test_n_jobs_not_new_model(sample_matrix_store):
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {
            'n_estimators': [10, 100, 1000]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'n_jobs': [12, 24],
        }
    }

    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            trainer = ModelTrainer(project_path='econ-dev/inspections',
                                   experiment_hash=None,
                                   model_storage_engine=S3ModelStorageEngine(
                                       'econ-dev/inspections'),
                                   db_engine=db_engine,
                                   model_grouper=ModelGrouper())

            train_tasks = trainer.generate_train_tasks(
                grid_config,
                dict(),
                sample_matrix_store,
            )
            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            # 32 RandomForest combinations (2 n_estimators * 2 max_features
            # * 4 max_depth * 2 criterion) plus 3 AdaBoost ones; it would be
            # (32 * 2) + 3 if tasks differing only in n_jobs weren't collapsed.
            assert len(train_tasks) == 35
            assert len([
                task for task in train_tasks if 'n_jobs' in task['parameters']
            ]) == 32

            for train_task in train_tasks:
                trainer.process_train_task(**train_task)

            for row in db_engine.execute(
                    'select model_parameters from model_metadata.model_groups'
            ):
                assert 'n_jobs' not in row[0]
Example #7
def test_prediction_ranks_multiple_dates(project_storage, db_engine):
    """make sure that multiple as-of-dates in a single matrix are handled correctly.
    keep the other variables simple by making no within-date ties that would end up
    testing the tiebreaker logic, just data for two dates with data that could theoretically
    confound a bad ranking method:
    - a different order for entities in both dates
    - each date has some not in the other
    """
    ensure_db(db_engine)
    init_engine(db_engine)
    predictor = Predictor(project_storage.model_storage_engine(), db_engine,
                          'worst')
    model_id = 5
    matrix_uuid = "4567"
    matrix_type = "test"
    entities_dates_and_scores = (
        (23, datetime.datetime(2012, 1, 1), 0.95),
        (34, datetime.datetime(2012, 1, 1), 0.94),
        (45, datetime.datetime(2013, 1, 1), 0.92),
        (23, datetime.datetime(2013, 1, 1), 0.45),
    )
    expected_result = (
        (23, datetime.datetime(2012, 1, 1), 1),
        (34, datetime.datetime(2012, 1, 1), 2),
        (45, datetime.datetime(2013, 1, 1), 3),
        (23, datetime.datetime(2013, 1, 1), 4),
    )
    matrix = MatrixFactory(matrix_uuid=matrix_uuid)
    model = ModelFactory(model_id=model_id)
    for entity_id, as_of_date, score in entities_dates_and_scores:
        PredictionFactory(model_rel=model,
                          matrix_rel=matrix,
                          as_of_date=as_of_date,
                          entity_id=entity_id,
                          score=score)
    factory_session.commit()
    predictor.update_db_with_ranks(
        model_id=model_id,
        matrix_uuid=matrix_uuid,
        matrix_type=TestMatrixType,
    )
    ranks = tuple(row for row in predictor.db_engine.execute(
        f'''
select entity_id, as_of_date, rank_abs_no_ties
from {matrix_type}_results.predictions
where model_id = %s and matrix_uuid = %s order by rank_abs_no_ties''', (
            model_id, matrix_uuid)))
    assert ranks == expected_result
Example #8
def test_predictor_get_train_columns():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        project_path = 'econ-dev/inspections'
        with tempfile.TemporaryDirectory() as temp_dir:
            train_store, test_store = sample_metta_csv_diff_order(temp_dir)

            model_storage_engine = InMemoryModelStorageEngine(project_path)
            _, model_id = \
                fake_trained_model(
                    project_path,
                    model_storage_engine,
                    db_engine,
                    train_matrix_uuid=train_store.uuid
                )
            predictor = Predictor(project_path, model_storage_engine,
                                  db_engine)

            # fake_trained_model stored the train_store uuid; store the test_store uuid here
            MatrixFactory(matrix_uuid=test_store.uuid)
            session.commit()

            # Runs the same test for training and testing predictions
            for store, mat_type in zip((train_store, test_store),
                                       ("train", "test")):
                predict_proba = predictor.predict(
                    model_id,
                    store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=train_store.columns())
                # assert
                # 1. that we calculated predictions
                assert len(predict_proba) > 0

                # 2. that the predictions table entries are present and
                # can be linked to the original models
                records = [
                    row for row in db_engine.execute(
                        '''select entity_id, as_of_date
                    from {}_results.{}_predictions
                    join model_metadata.models using (model_id)'''.format(
                            mat_type, mat_type))
                ]
                assert len(records) > 0
Example #9
def get_matrix_store(project_storage, matrix=None, metadata=None):
    """Return a matrix store associated with the given project storage. Also adds an entry in the matrices table if it doesn't exist already

    Args:
        project_storage (triage.component.catwalk.storage.ProjectStorage) A project's storage
        matrix (dataframe, optional): A matrix to store. Defaults to the output of matrix_creator()
        metadata (dict, optional): matrix metadata. defaults to the output of matrix_metadata_creator()
    """
    if matrix is None:
        matrix = matrix_creator()
    if not metadata:
        metadata = matrix_metadata_creator()
    matrix_store = project_storage.matrix_storage_engine().get_store(metadata['metta-uuid'])
    matrix_store.matrix = matrix
    matrix_store.metadata = metadata
    matrix_store.save()
    if session.query(Matrix).filter(Matrix.matrix_uuid == matrix_store.uuid).count() == 0:
        MatrixFactory(matrix_uuid=matrix_store.uuid)
        session.commit()
    return matrix_store
Example #10
def test_model_trainer(sample_matrix_store, grid_config):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')

            # Creates a matrix entry in the matrices table with uuid from metadata above
            MatrixFactory(matrix_uuid="1234")
            session.commit()
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                model_grouper=ModelGrouper(),
                db_engine=db_engine,
            )
            model_ids = trainer.train_models(grid_config=grid_config,
                                             misc_db_parameters=dict(),
                                             matrix_store=sample_matrix_store)

            # assert
            # 1. that the models and feature importances table entries are present
            records = [
                row for row in db_engine.execute(
                    'select * from train_results.feature_importances')
            ]
            assert len(records) == 4 * 2  # 4 models x 2 features

            records = [
                row for row in db_engine.execute(
                    'select model_hash from model_metadata.models')
            ]
            assert len(records) == 4
            hashes = [row[0] for row in records]

            # 2. that the model groups are distinct
            records = [
                row for row in db_engine.execute(
                    'select distinct model_group_id from model_metadata.models'
                )
            ]
            assert len(records) == 4

            # 3. that the model sizes are saved in the table and all are < 1 kB
            records = [
                row for row in db_engine.execute(
                    'select model_size from model_metadata.models')
            ]
            assert len(records) == 4
            for i in records:
                size = i[0]
                assert size < 1

            # 4. that all four models are cached
            model_pickles = [
                model_storage_engine.get_store(model_hash).load()
                for model_hash in hashes
            ]
            assert len(model_pickles) == 4
            assert len([x for x in model_pickles if x is not None]) == 4

            # 5. that their results can have predictions made on it
            test_matrix = pandas.DataFrame.from_dict({
                'entity_id': [3, 4],
                'feature_one': [4, 4],
                'feature_two': [6, 5],
            })

            test_matrix = InMemoryMatrixStore(
                matrix=test_matrix, metadata=sample_metadata()).matrix

            for model_pickle in model_pickles:
                predictions = model_pickle.predict(test_matrix)
                assert len(predictions) == 2

            # 6. when run again, same models are returned
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store)
            assert len([
                row for row in db_engine.execute(
                    'select model_hash from model_metadata.models')
            ]) == 4
            assert model_ids == new_model_ids

            # 7. if replace is set, update non-unique attributes and feature importances
            max_batch_run_time = [
                row[0] for row in db_engine.execute(
                    'select max(batch_run_time) from model_metadata.models')
            ][0]
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                model_grouper=ModelGrouper(
                    model_group_keys=['label_name', 'label_timespan']),
                db_engine=db_engine,
                replace=True)
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store,
            )
            assert model_ids == new_model_ids
            assert [
                row['model_id'] for row in db_engine.execute(
                    'select model_id from model_metadata.models order by 1 asc'
                )
            ] == model_ids
            new_max_batch_run_time = [
                row[0] for row in db_engine.execute(
                    'select max(batch_run_time) from model_metadata.models')
            ][0]
            assert new_max_batch_run_time > max_batch_run_time

            records = [
                row for row in db_engine.execute(
                    'select * from train_results.feature_importances')
            ]
            assert len(records) == 4 * 2  # 4 models x 2 features

            # 8. if the cache is missing but the metadata is still there, reuse the metadata
            for row in db_engine.execute(
                    'select model_hash from model_metadata.models'):
                model_storage_engine.get_store(row[0]).delete()
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store)
            assert model_ids == sorted(new_model_ids)

            # 9. that the generator interface works the same way
            new_model_ids = trainer.generate_trained_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store)
            assert model_ids == sorted(new_model_ids)
Example #11
def test_integration():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'

            # create train and test matrices
            train_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            train_metadata = {
                'feature_start_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_timespan': '1y',
                'feature_names': ['ft1', 'ft2'],
                'metta-uuid': '1234',
                'indices': ['entity_id'],
                'matrix_type': 'train'
            }
            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            train_store = InMemoryMatrixStore(train_matrix, sample_metadata())

            as_of_dates = [
                datetime.date(2016, 12, 21),
                datetime.date(2017, 1, 21)
            ]

            test_stores = [
                InMemoryMatrixStore(
                    pandas.DataFrame.from_dict({
                        'entity_id': [3],
                        'feature_one': [8],
                        'feature_two': [5],
                        'label': [5]
                    }), {
                        'label_name': 'label',
                        'label_timespan': '1y',
                        'end_time': as_of_date,
                        'metta-uuid': '1234',
                        'indices': ['entity_id'],
                        'matrix_type': 'test',
                        'as_of_date_frequency': '1month'
                    }) for as_of_date in as_of_dates
            ]

            model_storage_engine = S3ModelStorageEngine(project_path)

            experiment_hash = save_experiment_and_get_hash({}, db_engine)
            # instantiate pipeline objects
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=experiment_hash,
                model_storage_engine=model_storage_engine,
                db_engine=db_engine,
            )
            predictor = Predictor(project_path, model_storage_engine,
                                  db_engine)
            model_evaluator = ModelEvaluator([{
                'metrics': ['precision@'],
                'thresholds': {
                    'top_n': [5]
                }
            }], [{}], db_engine)

            # run the pipeline
            grid_config = {
                'sklearn.linear_model.LogisticRegression': {
                    'C': [0.00001, 0.0001],
                    'penalty': ['l1', 'l2'],
                    'random_state': [2193]
                }
            }
            model_ids = trainer.train_models(grid_config=grid_config,
                                             misc_db_parameters=dict(),
                                             matrix_store=train_store)

            for model_id in model_ids:
                for as_of_date, test_store in zip(as_of_dates, test_stores):
                    predictions_proba = predictor.predict(
                        model_id,
                        test_store,
                        misc_db_parameters=dict(),
                        train_matrix_columns=['feature_one', 'feature_two'])

                    model_evaluator.evaluate(
                        predictions_proba,
                        test_store,
                        model_id,
                    )

            # assert
            # 1. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, model_id, as_of_date
                from test_results.test_predictions
                join model_metadata.models using (model_id)
                order by 3, 2''')
            ]
            assert records == [
                (3, 1, datetime.datetime(2016, 12, 21)),
                (3, 2, datetime.datetime(2016, 12, 21)),
                (3, 3, datetime.datetime(2016, 12, 21)),
                (3, 4, datetime.datetime(2016, 12, 21)),
                (3, 1, datetime.datetime(2017, 1, 21)),
                (3, 2, datetime.datetime(2017, 1, 21)),
                (3, 3, datetime.datetime(2017, 1, 21)),
                (3, 4, datetime.datetime(2017, 1, 21)),
            ]

            # that evaluations are there
            records = [
                row for row in db_engine.execute('''
                    select model_id, evaluation_start_time, metric, parameter
                    from test_results.test_evaluations order by 2, 1''')
            ]
            assert records == [
                (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            ]