def validate(pipeline_params, model, encoder, track, date_cutoff, max_date):
    """Evaluate *model* on the validation date window and persist results.

    Streams rows passing ``validate_filter``, encodes them, predicts one row
    at a time, computes r2 against the raw ``unit_sales`` column, then logs
    metrics/artifacts and writes predictions, the model file and a plot.

    Args:
        pipeline_params: pipeline configuration passed to ``stream_data``.
        model: fitted estimator exposing ``predict``.
        encoder: encoder exposing ``encode_data_stream``.
        track: experiment tracker (``log_metrics`` / ``log_artifact``).
        date_cutoff, max_date: window bounds forwarded to ``validate_filter``.
    """
    target_name = 'unit_sales'
    target = []

    def _filtered_rows():
        # Single pass over the data: capture the target value while feeding
        # rows to the encoder. The previous implementation streamed the whole
        # dataset a second time just to rebuild the target list.
        for row in stream_data(pipeline_params):
            if validate_filter(row, date_cutoff, max_date):
                target.append(row[target_name])
                yield row

    encoded_validate_stream = encoder.encode_data_stream(_filtered_rows())
    # Per-row predict keeps memory flat; the comprehension fully consumes the
    # stream, so `target` is complete once this line finishes.
    validation_predictions = [float(model.predict([row]))
                              for row in encoded_validate_stream]
    print("Calculating metrics")
    validation_metrics = {'r2_score': metrics.r2_score(
        y_true=target, y_pred=validation_predictions)}
    track.log_metrics(validation_metrics)
    fluentd_logger.log('validation_metrics', validation_metrics)
    write_predictions_and_score(validation_metrics)
    print("Evaluation done with metrics {}.".format(
        json.dumps(validation_metrics)))
    write_model(model)
    track.log_artifact(file_names['model'])
    make_validation_plot(target, validation_predictions, track)
def run_ml_model(pipeline_params, encoder, track, date_cutoff, seed=None):
    """Encode the training window, log params, and train a model.

    Args:
        pipeline_params: pipeline configuration; must contain ``model_name``
            and ``model_params``.
        encoder: encoder exposing ``encode_data_stream``.
        track: experiment tracker (``log_ml_params`` / ``log_pipeline_params``).
        date_cutoff: upper bound forwarded to ``train_filter``.
        seed: optional RNG seed forwarded to ``train_model``.

    Returns:
        Tuple ``(trained_model, params)`` as produced by ``train_model``.
    """
    target_name = 'unit_sales'
    target = []

    def _filtered_rows():
        # Single pass: collect the target while streaming rows to the
        # encoder, instead of re-reading the whole dataset a second time.
        for row in stream_data(pipeline_params):
            if train_filter(row, date_cutoff):
                target.append(row[target_name])
                yield row

    print('Encoding data')
    # batch step, read it all in; `target` fills up as a side effect
    encoded_train_data = list(encoder.encode_data_stream(_filtered_rows()))
    print('Getting target')

    model_name = pipeline_params['model_name']
    params = pipeline_params['model_params'][model_name]
    track.log_ml_params(params)
    track.log_pipeline_params(pipeline_params)
    trained_model, params = train_model(encoded_train_data, target,
                                        model_name, params, seed=seed)
    return trained_model, params
def test_get_encoder_from_stream():
    """Encoder built from a 100-row sample round-trips rows and maps unseen
    categorical levels to the UNKNOWN sentinel."""
    source = stream_data(pipeline_params)
    sample = (next(source) for _ in range(100))
    encoder = get_encoder_from_stream(sample)
    assert isinstance(encoder, OneHotEncoder)

    # Re-open the stream and check a fresh 100-row sample encodes 1:1.
    source = stream_data(pipeline_params)
    sample = (next(source) for _ in range(100))
    assert len(list(encoder.encode_data_stream(sample))) == 100

    row_in = {
        'id': '88219279',
        'date': '2016-08-16',
        'item_nbr': '103520',
        'unit_sales': '10.0',
        'family': 'GROCERY I',
        'class': '1028',
        'perishable': '0',
        'year': '2016',
        'month': '8',
        'day': '16',
        'dayofweek': '1',
        'days_til_end_of_data': '364',
        'dayoff': 'False'
    }
    decoded = encoder.decode_data(encoder.encode_data([row_in]))
    print(decoded[0].keys())
    print(row_in.keys())
    # Encoder drops these three columns; compare against the remainder.
    del row_in['date'], row_in['id'], row_in['unit_sales']
    assert decoded[0].keys() == row_in.keys()

    # A categorical level never seen during fitting.
    row_in['class'] = 'FOO'
    decoded = encoder.decode_data(encoder.encode_data([row_in]))
    print(decoded[0].keys())
    print(row_in.keys())
    assert decoded[0].keys() == row_in.keys()
    print(decoded)
    assert decoded[0]['class'] == 'UNKNOWN_CATEGORICAL_LEVEL'
def test_stream_data():
    """First streamed row is a dict with an int 'perishable' field."""
    first_row = next(stream_data(pipeline_params))
    assert isinstance(first_row, dict)
    assert 'perishable' in first_row
    assert isinstance(first_row['perishable'], int)
def get_max_date(pipeline_params):
    """Scan the full data stream and return the latest date as 'YYYY-MM-DD'.

    Batch step — consumes the entire stream (e.g. yields '2017-08-15').
    """
    print('Getting max date')
    latest = max(date_string_to_date(row["date"])
                 for row in stream_data(pipeline_params))
    print('Max date: %s' % latest)
    return latest.strftime('%Y-%m-%d')