def train_test_models(self, train_matrix_uuid, model_ids_generator, model_storage):
    predictor = Predictor(project_path=self.project_path,
                          model_storage_engine=model_storage,
                          db_engine=self.db_engine)

    for trained_model_id in model_ids_generator:
        ## Prediction
        log.info('Predict for model_id: {}'.format(trained_model_id))

        # Loop over testing as-of dates
        for test_date in self.temporal_split['test_as_of_dates']:
            # Load matrices
            log.info('Load test matrix for as of date: {}'.format(test_date))
            test_matrix_id = str([test_date,
                                  self.labels,
                                  self.temporal_split['prediction_window']])
            test_metadata = self._make_metadata(
                datetime.datetime.strptime(test_date, "%Y-%m-%d"),
                datetime.datetime.strptime(test_date, "%Y-%m-%d"),
                test_matrix_id,
                [test_date])
            test_df, test_uuid = self.load_store_matrix(test_metadata, [test_date])
            misc_db_parameters = {'matrix_uuid': test_uuid}

            # Remove the index columns from the data frame
            for column in test_metadata['indices']:
                if column in test_df.columns:
                    del test_df[column]

            # Store matrix: every column but the last is a feature,
            # the last column is the label
            test_matrix_store = InMemoryMatrixStore(test_df.iloc[:, :-1],
                                                    test_metadata,
                                                    test_df.iloc[:, -1])

            predictions_binary, predictions_proba = predictor.predict(
                trained_model_id, test_matrix_store, misc_db_parameters)

            ## Evaluation
            log.info('Generate evaluations for model_id: {}'.format(trained_model_id))
            self.evaluations(predictions_proba,
                             predictions_binary,
                             test_df.iloc[:, -1],
                             trained_model_id,
                             test_date)

        # Remove the trained model from memory
        predictor.delete_model(trained_model_id)

    return None
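
# A minimal, self-contained sketch (not part of the original suite) of the
# column convention train_test_models relies on: once the index columns named
# in the metadata are dropped, every remaining column but the last is a
# feature and the last column is the label.
def test_feature_label_split_sketch():
    df = pandas.DataFrame.from_dict({
        'entity_id': [1, 2],
        'feature_one': [3, 4],
        'label': [7, 8]
    })
    for column in ['entity_id']:  # stands in for test_metadata['indices']
        if column in df.columns:
            del df[column]
    features, labels = df.iloc[:, :-1], df.iloc[:, -1]
    assert list(features.columns) == ['feature_one']
    assert list(labels) == [7, 8]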
def test_predictor_composite_index():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        project_path = 'econ-dev/inspections'
        model_storage_engine = InMemoryModelStorageEngine(project_path)
        _, model_id = \
            fake_trained_model(project_path, model_storage_engine, db_engine)
        predictor = Predictor(project_path, model_storage_engine, db_engine)
        dayone = datetime.datetime(2011, 1, 1)
        daytwo = datetime.datetime(2011, 1, 2)

        # create prediction set
        matrix = pandas.DataFrame.from_dict({
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [dayone, dayone, daytwo, daytwo],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        }).set_index(['entity_id', 'as_of_date'])
        metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_window': '3month',
            'metta-uuid': '1234',
        }
        matrix_store = InMemoryMatrixStore(matrix, metadata)
        predict_proba = predictor.predict(model_id, matrix_store,
                                          misc_db_parameters=dict())

        # assert
        # 1. that the returned predictions are of the desired length
        assert len(predict_proba) == 4

        # 2. that the predictions table entries are present and
        # can be linked to the original models
        records = [
            row for row in
            db_engine.execute('''select entity_id, as_of_date
            from results.predictions
            join results.models using (model_id)''')
        ]
        assert len(records) == 4
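
# A minimal sketch (not part of the original suite) of the MultiIndex
# convention the composite-index test relies on: resetting the
# (entity_id, as_of_date) index recovers both keys as ordinary columns,
# which is presumably how each prediction row gets written back with its
# entity id and date.
def test_composite_index_roundtrip_sketch():
    df = pandas.DataFrame.from_dict({
        'entity_id': [1, 2],
        'as_of_date': [datetime.datetime(2011, 1, 1)] * 2,
        'feature_one': [3, 4]
    }).set_index(['entity_id', 'as_of_date'])
    recovered = df.reset_index()
    assert list(recovered['entity_id']) == [1, 2]
    assert list(recovered['as_of_date']) == [datetime.datetime(2011, 1, 1)] * 2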
def test_predictor():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)
            _, model_id = \
                fake_trained_model(project_path, model_storage_engine, db_engine)
            predictor = Predictor(project_path, model_storage_engine, db_engine)

            # create prediction set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE,
                'label_window': '3month',
                'metta-uuid': '1234',
            }
            matrix_store = InMemoryMatrixStore(matrix, metadata)
            predict_proba = predictor.predict(model_id, matrix_store,
                                              misc_db_parameters=dict())

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 2

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in
                db_engine.execute('''select entity_id, as_of_date
                from results.predictions
                join results.models using (model_id)''')
            ]
            assert len(records) == 2

            # 3. that the contained as_of_dates match what we sent in
            for record in records:
                assert record[1].date() == AS_OF_DATE

            # 4. that the entity ids match the given dataset
            assert sorted([record[0] for record in records]) == [1, 2]

            # 5. running with the same model_id and a different as-of date,
            # then again with the original as-of date, only replaces the
            # records that share the same date
            new_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            new_metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE + datetime.timedelta(days=1),
                'label_window': '3month',
                'metta-uuid': '1234',
            }
            new_matrix_store = InMemoryMatrixStore(new_matrix, new_metadata)
            predictor.predict(model_id, new_matrix_store, misc_db_parameters=dict())
            predictor.predict(model_id, matrix_store, misc_db_parameters=dict())
            records = [
                row for row in
                db_engine.execute('''select entity_id, as_of_date
                from results.predictions
                join results.models using (model_id)''')
            ]
            assert len(records) == 4

            # 6. that we can delete the model when done predicting with it
            predictor.delete_model(model_id)
            assert predictor.load_model(model_id) is None
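
# A minimal sketch (an assumption, not Predictor internals) of the replacement
# semantics exercised in step 5 above: predictions behave as if keyed by
# (model_id, entity_id, as_of_date), so a batch for a new as-of date adds
# rows, while re-running a batch for an existing date overwrites only the
# rows sharing that date.
def upsert_predictions_sketch(table, model_id, as_of_date, scores):
    # `table` is a plain dict standing in for results.predictions;
    # `scores` maps entity_id -> score
    for entity_id, score in scores.items():
        table[(model_id, entity_id, as_of_date)] = score
    return table

# e.g. two batches for the same model and two entities:
#   table = upsert_predictions_sketch({}, 1, dayone, {1: 0.2, 2: 0.7})
#   table = upsert_predictions_sketch(table, 1, daytwo, {1: 0.3, 2: 0.6})
# leaves len(table) == 4, matching assertion 5; re-running the daytwo batch
# still leaves len(table) == 4.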
def test_predictor_retrieve():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        project_path = 'econ-dev/inspections'
        model_storage_engine = InMemoryModelStorageEngine(project_path)
        _, model_id = \
            fake_trained_model(project_path, model_storage_engine, db_engine)
        predictor = Predictor(project_path, model_storage_engine, db_engine,
                              replace=False)
        dayone = datetime.date(2011, 1, 1).isoformat()
        daytwo = datetime.date(2011, 1, 2).isoformat()

        # create prediction set
        matrix_data = {
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [dayone, dayone, daytwo, daytwo],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        }
        matrix = pandas.DataFrame.from_dict(matrix_data)\
            .set_index(['entity_id', 'as_of_date'])
        metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_window': '3month',
            'metta-uuid': '1234',
        }
        matrix_store = InMemoryMatrixStore(matrix, metadata)
        predict_proba = predictor.predict(model_id, matrix_store,
                                          misc_db_parameters=dict())

        # When run again, the predictions retrieved from the database
        # should match.
        #
        # Some trickiness here. Let's explain:
        #
        # If we are not careful, retrieving predictions from the database and
        # presenting them as a numpy array can result in a bad ordering,
        # since the given matrix may not be 'ordered' by some criteria
        # that can be easily represented by an ORDER BY clause.
        #
        # It will sometimes work, because without ORDER BY you will get
        # the rows back in the table's physical order, which, unless something
        # has happened to the table, will be the order you inserted them in,
        # which could very well be the order in the matrix.
        # So it's not a bug that would necessarily show itself immediately,
        # but when it does go wrong your scores will be garbage.
        #
        # So we simulate a table-order mutation that can happen over time:
        # remove the first row and put it at the end.
        # If the Predictor doesn't explicitly reorder the results, this will fail.
        session = sessionmaker(bind=db_engine)()
        obj = session.query(Prediction).first()
        session.delete(obj)
        session.commit()

        make_transient(obj)
        session = sessionmaker(bind=db_engine)()
        session.add(obj)
        session.commit()

        predictor.load_model = Mock()
        new_predict_proba = predictor.predict(model_id, matrix_store,
                                              misc_db_parameters=dict())
        assert_array_equal(new_predict_proba, predict_proba)
        assert not predictor.load_model.called
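
# A minimal sketch (an assumption, not the Predictor's actual query) of the
# explicit reordering the comment above calls for: select cached scores with
# an ORDER BY over the matrix's composite index, so the returned array lines
# up with the matrix rows regardless of the table's physical order. The
# entity_id and as_of_date columns appear in queries elsewhere in these
# tests; the 'score' column name is an assumption.
def retrieve_scores_in_matrix_order_sketch(db_engine, model_id):
    return [
        row[0] for row in
        db_engine.execute(
            '''select score from results.predictions
            where model_id = %s
            order by entity_id, as_of_date''',
            model_id
        )
    ]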
def test_integration():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'

            # create train and test matrices
            train_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            train_metadata = {
                'start_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'prediction_window': '1y',
                'feature_names': ['ft1', 'ft2']
            }
            train_store = InMemoryMatrixStore(train_matrix, train_metadata)

            as_of_dates = [
                datetime.date(2016, 12, 21),
                datetime.date(2017, 1, 21)
            ]
            test_stores = [
                InMemoryMatrixStore(
                    pandas.DataFrame.from_dict({
                        'entity_id': [3],
                        'feature_one': [8],
                        'feature_two': [5],
                        'label': [5]
                    }).set_index('entity_id'),
                    {
                        'label_name': 'label',
                        'end_time': as_of_date
                    }
                )
                for as_of_date in as_of_dates
            ]

            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)

            # instantiate pipeline objects
            trainer = ModelTrainer(
                project_path=project_path,
                model_storage_engine=model_storage_engine,
                matrix_store=None,
                db_engine=db_engine,
            )
            predictor = Predictor(
                project_path,
                model_storage_engine,
                db_engine
            )
            model_scorer = ModelScorer(
                [{'metrics': ['precision@'], 'thresholds': {'top_n': [5]}}],
                db_engine
            )

            # run the pipeline
            grid_config = {
                'sklearn.linear_model.LogisticRegression': {
                    'C': [0.00001, 0.0001],
                    'penalty': ['l1', 'l2'],
                    'random_state': [2193]
                }
            }
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=train_store
            )

            for model_id in model_ids:
                for as_of_date, test_store in zip(as_of_dates, test_stores):
                    predictions, predictions_proba = predictor.predict(
                        model_id,
                        test_store,
                        misc_db_parameters=dict()
                    )
                    model_scorer.score(
                        predictions_proba,
                        predictions,
                        test_store.labels(),
                        model_id,
                        as_of_date,
                        as_of_date,
                        '6month'
                    )

            # assert
            # 1. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in
                db_engine.execute('''select entity_id, model_id, as_of_date
                from results.predictions
                join results.models using (model_id)
                order by 3, 2''')
            ]
            assert records == [
                (3, 1, datetime.datetime(2016, 12, 21)),
                (3, 2, datetime.datetime(2016, 12, 21)),
                (3, 3, datetime.datetime(2016, 12, 21)),
                (3, 4, datetime.datetime(2016, 12, 21)),
                (3, 1, datetime.datetime(2017, 1, 21)),
                (3, 2, datetime.datetime(2017, 1, 21)),
                (3, 3, datetime.datetime(2017, 1, 21)),
                (3, 4, datetime.datetime(2017, 1, 21)),
            ]

            # 2. that the evaluations are present
            records = [
                row for row in
                db_engine.execute('''
                    select model_id, evaluation_start_time, metric, parameter
                    from results.evaluations order by 2, 1''')
            ]
            assert records == [
                (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            ]
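
# A worked sketch (an assumption, not ModelScorer's implementation) of the
# metric recorded above: 'precision@' with an absolute top_n threshold of 5
# is stored with parameter '5_abs' and means precision computed over the
# five highest-scoring entities.
def precision_at_top_n_sketch(labels, scores, n=5):
    # rank entities by score, highest first, then take precision over the top n
    ranked = sorted(zip(scores, labels), key=lambda pair: pair[0], reverse=True)
    top = ranked[:n]
    return sum(label for _, label in top) / float(len(top))

# e.g. with binary labels:
#   precision_at_top_n_sketch([1, 0, 1, 0], [0.9, 0.8, 0.4, 0.1], n=2)
# takes the two highest-scoring entities (labels 1 and 0) and returns 0.5.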