def main(args):
    config_path = f"{args.predictor_dir}/{args.model_name}.json"
    net_path = f"{args.predictor_dir}/{args.model_name}.pt"
    predictor = BanditPredictor.predictor_from_file(config_path, net_path)
    json_input = json.dumps({"year": 2019, "country": 4})
    start = time.time()
    decisions = get_decisions(json_input, predictor, args.get_ucb_scores)
    end = time.time()
    logger.info(f"Prediction request took {round(end - start, 5)} seconds.")
    logger.info(f"Predictions: {decisions}")
    if args.get_exploration_decision:
        logger.info(
            f"Single exploitation/exploration decision: {get_single_decision(decisions)}"
        )
def main(args):
    config_path = f"{args.predictor_dir}/{args.model_name}.json"
    net_path = f"{args.predictor_dir}/{args.model_name}.pt"
    predictor = BanditPredictor.predictor_from_file(config_path, net_path)
    start = time.time()
    choices = args.choices or None
    decisions = get_decisions(args.context, choices, predictor, args.get_ucb_scores)
    end = time.time()
    logger.info(f"Prediction request took {round(end - start, 5)} seconds.")
    logger.info(f"Predictions: {decisions}")
    if args.get_exploration_decision:
        logger.info(
            f"Single exploitation/exploration decision: {get_single_decision(decisions)}"
        )
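# A minimal sketch (not part of the original source) of the argparse wiring that
# could drive the main() variants above. The flag names mirror the attributes
# those functions read (predictor_dir, model_name, context, choices,
# get_ucb_scores, get_exploration_decision); everything else here is assumed.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Score a context with a saved BanditPredictor."
    )
    parser.add_argument(
        "--predictor_dir",
        required=True,
        help="Directory holding <model_name>.json and <model_name>.pt",
    )
    parser.add_argument("--model_name", required=True)
    parser.add_argument(
        "--context", help="JSON string mapping feature name -> value"
    )
    parser.add_argument(
        "--choices", nargs="*", default=None, help="Optional subset of decision ids to score"
    )
    parser.add_argument("--get_ucb_scores", action="store_true")
    parser.add_argument("--get_exploration_decision", action="store_true")
    main(parser.parse_args())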
def test_same_predictions_country_as_categorical(self):
    raw_data = shuffle(Datasets._raw_data)
    rand_idx = 0
    test_input = raw_data.iloc[rand_idx]
    data = preprocessor.preprocess_data(
        raw_data,
        self.ml_params["data_reader"]["reward_function"],
        Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL,
        shuffle_data=False,  # don't shuffle so we can test the same observation
    )
    _X, _y = preprocessor.data_to_pytorch(data)
    X_COUNTRY_CATEG = {
        "X_train": {"X_float": _X["X_float"][: Datasets._offset]},
        "y_train": _y[: Datasets._offset],
        "X_test": {"X_float": _X["X_float"][Datasets._offset :]},
        "y_test": _y[Datasets._offset :],
    }
    net_spec, pytorch_net = train_bandit.build_pytorch_net(
        feature_specs=Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL[
            "features"
        ],
        product_sets=Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL[
            "product_sets"
        ],
        float_feature_order=Datasets.DATA_COUNTRY_CATEG["final_float_feature_order"],
        id_feature_order=Datasets.DATA_COUNTRY_CATEG["final_id_feature_order"],
        layers=self.ml_params["model"]["layers"],
        activations=self.ml_params["model"]["activations"],
        input_dim=train_bandit.num_float_dim(Datasets.DATA_COUNTRY_CATEG),
    )
    pre_serialized_predictor = BanditPredictor(
        experiment_params=Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL,
        float_feature_order=Datasets.DATA_COUNTRY_CATEG["float_feature_order"],
        id_feature_order=Datasets.DATA_COUNTRY_CATEG["id_feature_order"],
        id_feature_str_to_int_map=Datasets.DATA_COUNTRY_CATEG[
            "id_feature_str_to_int_map"
        ],
        transforms=Datasets.DATA_COUNTRY_CATEG["transforms"],
        imputers=Datasets.DATA_COUNTRY_CATEG["imputers"],
        net=pytorch_net,
        net_spec=net_spec,
    )
    skorch_net = train_bandit.fit_custom_pytorch_module_w_skorch(
        module=pre_serialized_predictor.net,
        X=X_COUNTRY_CATEG["X_train"],
        y=X_COUNTRY_CATEG["y_train"],
        hyperparams=self.ml_params,
    )
    pre_serialized_predictor.config_to_file(self.tmp_config_path)
    pre_serialized_predictor.net_to_file(self.tmp_net_path)
    post_serialized_predictor = BanditPredictor.predictor_from_file(
        self.tmp_config_path, self.tmp_net_path
    )
    pre_pred = pre_serialized_predictor.predict(json.loads(test_input.context))
    post_pred = post_serialized_predictor.predict(json.loads(test_input.context))
    assert np.allclose(pre_pred["scores"], post_pred["scores"], self.tol)
    assert pre_pred["ids"] == post_pred["ids"]
def train(
    ml_params: Dict,
    experiment_params: Dict,
    predictor_save_dir: str = None,
    s3_bucket_to_write_to: str = None,
):
    logger.info("Initializing data reader...")
    data_reader = BigQueryReader(
        credential_path=ml_params["data_reader"]["credential_path"],
        bq_project=ml_params["data_reader"]["bq_project"],
        bq_dataset=ml_params["data_reader"]["bq_dataset"],
        decisions_ds_start=ml_params["data_reader"]["decisions_ds_start"],
        decisions_ds_end=ml_params["data_reader"]["decisions_ds_end"],
        rewards_ds_end=ml_params["data_reader"]["rewards_ds_end"],
        reward_function=ml_params["data_reader"]["reward_function"],
        experiment_id=experiment_params["experiment_id"],
    )
    raw_data = data_reader.get_training_data()

    if len(raw_data) == 0:
        logger.error("Got no rows of training data. Training aborted.")
        sys.exit()

    logger.info(f"Got {len(raw_data)} rows of training data.")
    logger.info(raw_data.head())

    utils.fancy_print("Kicking off data preprocessing")

    # always add decision as a feature to use if not using all features
    features_to_use = ml_params["data_reader"].get("features_to_use", ["*"])
    if features_to_use != ["*"]:
        features_to_use.append(preprocessor.DECISION_FEATURE_NAME)
    features_to_use = list(set(features_to_use))
    dense_features_to_use = ml_params["data_reader"].get("dense_features_to_use", ["*"])

    data = preprocessor.preprocess_data(
        raw_data,
        experiment_params,
        ml_params["reward_type"],
        features_to_use,
        dense_features_to_use,
    )
    X, y = preprocessor.data_to_pytorch(data)

    model_type = ml_params["model_type"]
    model_params = ml_params["model_params"][model_type]
    reward_type = ml_params["reward_type"]

    feature_importance_params = ml_params.get("feature_importance", {})
    if feature_importance_params.get("calc_feature_importance", False):
        # calculate feature importances - only works on non id list features at this time
        utils.fancy_print("Calculating feature importances")
        feature_scores = feature_importance.calculate_feature_importance(
            reward_type=reward_type,
            feature_names=data["final_float_feature_order"],
            X=X,
            y=y,
        )
        feature_importance.display_feature_importances(feature_scores)

        # TODO: Make keeping the top "n" features work in predictor. Right now
        # using this feature breaks predictor, so don't use it in a final model,
        # just use it to experiment in seeing how model performance is.
        if feature_importance_params.get("keep_only_top_n", False):
            utils.fancy_print("Keeping only top N features")
            X, final_float_feature_order = feature_importance.keep_top_n_features(
                n=feature_importance_params["n"],
                X=X,
                feature_order=data["final_float_feature_order"],
                feature_scores=feature_scores,
            )
            data["final_float_feature_order"] = final_float_feature_order
            logger.info(f"Keeping top {feature_importance_params['n']} features:")
            logger.info(final_float_feature_order)

    utils.fancy_print("Starting training")

    # build the model
    if model_type == "neural_bandit":
        model_spec, model = model_constructors.build_pytorch_net(
            feature_specs=experiment_params["features"],
            product_sets=experiment_params["product_sets"],
            float_feature_order=data["final_float_feature_order"],
            id_feature_order=data["final_id_feature_order"],
            reward_type=reward_type,
            layers=model_params["layers"],
            activations=model_params["activations"],
            dropout_ratio=model_params["dropout_ratio"],
            input_dim=num_float_dim(data),
        )
        logger.info(f"Initialized model: {model}")
    elif model_type == "linear_bandit":
        assert utils.pset_features_have_dense(experiment_params["features"]), (
            "Linear models require that product set features have associated "
            "dense representations."
        )
        model = model_constructors.build_linear_model(
            reward_type=reward_type,
            penalty=model_params.get("penalty"),
            alpha=model_params.get("alpha"),
        )
        model_spec = None
    elif model_type == "gbdt_bandit":
        assert utils.pset_features_have_dense(experiment_params["features"]), (
            "GBDT models require that product set features have associated "
            "dense representations."
        )
        model = model_constructors.build_gbdt(
            reward_type=reward_type,
            learning_rate=model_params["learning_rate"],
            n_estimators=model_params["n_estimators"],
            max_depth=model_params["max_depth"],
        )
        model_spec = None
    elif model_type == "random_forest_bandit":
        assert utils.pset_features_have_dense(experiment_params["features"]), (
            "Random forest models require that product set features have associated "
            "dense representations."
        )
        model = model_constructors.build_random_forest(
            reward_type=reward_type,
            n_estimators=model_params["n_estimators"],
            max_depth=model_params["max_depth"],
        )
        model_spec = None

    # build the predictor
    predictor = BanditPredictor(
        experiment_params=experiment_params,
        float_feature_order=data["float_feature_order"],
        id_feature_order=data["id_feature_order"],
        id_feature_str_to_int_map=data["id_feature_str_to_int_map"],
        transforms=data["transforms"],
        imputers=data["imputers"],
        model=model,
        model_type=model_type,
        reward_type=reward_type,
        model_spec=model_spec,
        dense_features_to_use=dense_features_to_use,
    )

    # train the model
    if model_type == "neural_bandit":
        logger.info(f"Training {model_type} for {model_params['max_epochs']} epochs")
        skorch_net = model_trainers.fit_custom_pytorch_module_w_skorch(
            reward_type=reward_type,
            model=predictor.model,
            X=X,
            y=y,
            hyperparams=model_params,
            train_percent=ml_params["train_percent"],
        )
    elif model_type in ("gbdt_bandit", "random_forest_bandit", "linear_bandit"):
        logger.info(f"Training {model_type}")
        sklearn_model, _ = model_trainers.fit_sklearn_model(
            reward_type=reward_type,
            model=model,
            X=X,
            y=y,
            train_percent=ml_params["train_percent"],
        )

    if predictor_save_dir is not None:
        logger.info("Saving predictor artifacts to disk...")
        experiment_id = experiment_params.get("experiment_id", "test")
        model_name = ml_params.get("model_name", "model")
        save_dir = f"{predictor_save_dir}/{experiment_id}"
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        predictor_net_path = f"{save_dir}/{model_name}.pt"
        predictor_config_path = f"{save_dir}/{model_name}.json"
        predictor.config_to_file(predictor_config_path)
        predictor.model_to_file(predictor_net_path)

        if s3_bucket_to_write_to is not None:
            logger.info("Writing predictor artifacts to s3...")
            # Assumes aws credentials stored in ~/.aws/credentials that looks like:
            # [default]
            # aws_access_key_id = YOUR_ACCESS_KEY
            # aws_secret_access_key = YOUR_SECRET_KEY
            dir_to_zip = save_dir
            output_path = save_dir
            shutil.make_archive(output_path, "zip", dir_to_zip)
            s3_client = boto3.client("s3")
            s3_client.upload_file(
                Filename=f"{output_path}.zip",
                Bucket=s3_bucket_to_write_to,
                Key=f"{experiment_id}.zip",
            )
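# Illustrative only: a hypothetical ml_params dict showing the keys the train()
# above reads. The values are placeholders, not recommendations, and the exact
# layer/activation format is whatever model_constructors.build_pytorch_net expects.
EXAMPLE_ML_PARAMS = {
    "data_reader": {
        "credential_path": "path/to/bq_credentials.json",
        "bq_project": "my-project",
        "bq_dataset": "my_dataset",
        "decisions_ds_start": "2020-01-01",
        "decisions_ds_end": "2020-02-01",
        "rewards_ds_end": "2020-02-08",
        "reward_function": {"reward": 1.0},
        "features_to_use": ["*"],
        "dense_features_to_use": ["*"],
    },
    "reward_type": "regression",
    "model_type": "neural_bandit",
    "model_params": {
        "neural_bandit": {
            "layers": [32, 16],
            "activations": ["relu", "relu"],
            "dropout_ratio": 0.0,
            "max_epochs": 50,
        }
    },
    "feature_importance": {"calc_feature_importance": False},
    "train_percent": 0.8,
    "model_name": "model",
}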
def train(
    ml_params: Dict,
    experiment_params: Dict,
    model_name: str = None,
    predictor_save_dir: str = None,
    s3_bucket_to_write_to: str = None,
):
    logger.info("Initializing data reader...")
    data_reader = BigQueryReader(
        credential_path=ml_params["data_reader"]["credential_path"],
        bq_project=ml_params["data_reader"]["bq_project"],
        bq_dataset=ml_params["data_reader"]["bq_dataset"],
        decisions_table=ml_params["data_reader"]["decisions_table"],
        rewards_table=ml_params["data_reader"]["rewards_table"],
        decisions_ds_start=ml_params["data_reader"]["decisions_ds_start"],
        decisions_ds_end=ml_params["data_reader"]["decisions_ds_end"],
        rewards_ds_end=ml_params["data_reader"]["rewards_ds_end"],
        experiment_id=experiment_params["experiment_id"],
    )
    raw_data = data_reader.get_training_data()

    if len(raw_data) == 0:
        logger.error("Got no rows of training data. Training aborted.")
        sys.exit()

    logger.info(f"Got {len(raw_data)} rows of training data.")
    logger.info(raw_data.head())

    logger.info("Kicking off data preprocessing...")
    data = preprocessor.preprocess_data(
        raw_data, ml_params["data_reader"]["reward_function"], experiment_params
    )
    X, y = preprocessor.data_to_pytorch(data)

    model_type = ml_params["model_type"]
    model_params = ml_params["model_params"][model_type]
    reward_type = ml_params["reward_type"]

    # build the model
    if model_type == "neural_bandit":
        model_spec, model = model_constructors.build_pytorch_net(
            feature_specs=experiment_params["features"],
            product_sets=experiment_params["product_sets"],
            float_feature_order=data["final_float_feature_order"],
            id_feature_order=data["final_id_feature_order"],
            reward_type=reward_type,
            layers=model_params["layers"],
            activations=model_params["activations"],
            dropout_ratio=model_params["dropout_ratio"],
            input_dim=num_float_dim(data),
        )
        logger.info(f"Initialized model: {model}")
    elif model_type == "gbdt_bandit":
        assert utils.pset_features_have_dense(experiment_params["features"]), (
            "GBDT models require that product set features have associated "
            "dense representations."
        )
        model = model_constructors.build_gbdt(
            reward_type=reward_type,
            learning_rate=model_params["learning_rate"],
            n_estimators=model_params["n_estimators"],
            max_depth=model_params["max_depth"],
        )
        model_spec = None
    elif model_type == "random_forest_bandit":
        assert utils.pset_features_have_dense(experiment_params["features"]), (
            "Random forest models require that product set features have associated "
            "dense representations."
        )
        model = model_constructors.build_random_forest(
            reward_type=reward_type,
            n_estimators=model_params["n_estimators"],
            max_depth=model_params["max_depth"],
        )
        model_spec = None

    # build the predictor
    predictor = BanditPredictor(
        experiment_params=experiment_params,
        float_feature_order=data["float_feature_order"],
        id_feature_order=data["id_feature_order"],
        id_feature_str_to_int_map=data["id_feature_str_to_int_map"],
        transforms=data["transforms"],
        imputers=data["imputers"],
        model=model,
        reward_type=reward_type,
        model_spec=model_spec,
    )

    # train the model
    if model_type == "neural_bandit":
        logger.info(f"Starting training: {model_params['max_epochs']} epochs")
        skorch_net = model_trainers.fit_custom_pytorch_module_w_skorch(
            reward_type=reward_type,
            model=predictor.model,
            X=X,
            y=y,
            hyperparams=model_params,
            train_percent=ml_params["train_percent"],
        )
    elif model_type in ("gbdt_bandit", "random_forest_bandit"):
        logger.info(f"Starting training: {model_type}")
        sklearn_model, _ = model_trainers.fit_sklearn_model(
            reward_type=reward_type,
            model=model,
            X=X,
            y=y,
            train_percent=ml_params["train_percent"],
        )

    if predictor_save_dir is not None:
        logger.info("Saving predictor artifacts to disk...")
        experiment_id = experiment_params.get("experiment_id", "test")
        model_name = model_name or experiment_params.get("model_name", "model")
        save_dir = f"{predictor_save_dir}/{experiment_id}"
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        predictor_net_path = f"{save_dir}/{model_name}.pt"
        predictor_config_path = f"{save_dir}/{model_name}.json"
        predictor.config_to_file(predictor_config_path)
        predictor.model_to_file(predictor_net_path)

        if s3_bucket_to_write_to is not None:
            logger.info("Writing predictor artifacts to s3...")
            # Assumes aws credentials stored in ~/.aws/credentials that looks like:
            # [default]
            # aws_access_key_id = YOUR_ACCESS_KEY
            # aws_secret_access_key = YOUR_SECRET_KEY
            dir_to_zip = save_dir
            output_path = save_dir
            shutil.make_archive(output_path, "zip", dir_to_zip)
            s3_client = boto3.client("s3")
            s3_client.upload_file(
                Filename=f"{output_path}.zip",
                Bucket=s3_bucket_to_write_to,
                Key=f"{experiment_id}.zip",
            )
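# A small sketch (assumed usage, not from the original source) of the artifact
# round trip that train() sets up: config_to_file()/model_to_file() write
# <model_name>.json and <model_name>.pt under save_dir, and
# BanditPredictor.predictor_from_file() rebuilds the predictor from those same
# two paths for serving. load_predictor is a hypothetical helper name.
def load_predictor(save_dir: str, model_name: str):
    config_path = f"{save_dir}/{model_name}.json"
    net_path = f"{save_dir}/{model_name}.pt"
    return BanditPredictor.predictor_from_file(config_path, net_path)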
def test_same_predictions_country_as_categorical_sklearn_model_binary_reward(self):
    """
    Tests sklearn Linear model + binary reward.
    """
    reward_type = "binary"
    raw_data = shuffle(Datasets._raw_data_binary_reward)
    rand_idx = 0
    test_input = raw_data.iloc[rand_idx]
    data = preprocessor.preprocess_data(
        raw_data,
        Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL,
        reward_type,
        shuffle_data=False,  # don't shuffle so we can test the same observation
    )
    _X, _y = preprocessor.data_to_pytorch(data)
    X_COUNTRY_CATEG_BINARY_REWARD = {
        "X_train": {"X_float": _X["X_float"][: Datasets._offset_binary_reward]},
        "y_train": _y[: Datasets._offset_binary_reward],
        "X_test": {"X_float": _X["X_float"][Datasets._offset_binary_reward :]},
        "y_test": _y[Datasets._offset_binary_reward :],
    }
    model = model_constructors.build_linear_model(reward_type=reward_type)
    pre_serialized_predictor = BanditPredictor(
        experiment_params=Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL,
        float_feature_order=Datasets.DATA_COUNTRY_CATEG_BINARY_REWARD[
            "float_feature_order"
        ],
        id_feature_order=Datasets.DATA_COUNTRY_CATEG_BINARY_REWARD["id_feature_order"],
        id_feature_str_to_int_map=Datasets.DATA_COUNTRY_CATEG_BINARY_REWARD[
            "id_feature_str_to_int_map"
        ],
        transforms=Datasets.DATA_COUNTRY_CATEG_BINARY_REWARD["transforms"],
        imputers=Datasets.DATA_COUNTRY_CATEG_BINARY_REWARD["imputers"],
        model=model,
        model_type="linear_bandit",
        reward_type=reward_type,
        model_spec=None,
    )
    sklearn_model = model_trainers.fit_sklearn_model(
        reward_type=reward_type,
        model=model,
        X=X_COUNTRY_CATEG_BINARY_REWARD["X_train"],
        y=X_COUNTRY_CATEG_BINARY_REWARD["y_train"],
    )
    pre_serialized_predictor.config_to_file(self.tmp_config_path)
    pre_serialized_predictor.model_to_file(self.tmp_net_path)
    post_serialized_predictor = BanditPredictor.predictor_from_file(
        self.tmp_config_path, self.tmp_net_path
    )
    pre_pred = pre_serialized_predictor.predict(json.loads(test_input.context))
    post_pred = post_serialized_predictor.predict(json.loads(test_input.context))
    assert np.allclose(pre_pred["scores"], post_pred["scores"], self.tol)
    assert pre_pred["ids"] == post_pred["ids"]
def test_same_predictions_country_as_categorical_binary_reward(self):
    reward_type = "binary"
    raw_data = shuffle(Datasets._raw_data_binary_reward)
    rand_idx = 0
    test_input = raw_data.iloc[rand_idx]
    data = preprocessor.preprocess_data(
        raw_data,
        Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL,
        reward_type,
        shuffle_data=False,  # don't shuffle so we can test the same observation
    )
    _X, _y = preprocessor.data_to_pytorch(data)
    X_COUNTRY_CATEG_BINARY_REWARD = {
        "X_train": {"X_float": _X["X_float"][: Datasets._offset_binary_reward]},
        "y_train": _y[: Datasets._offset_binary_reward],
        "X_test": {"X_float": _X["X_float"][Datasets._offset_binary_reward :]},
        "y_test": _y[Datasets._offset_binary_reward :],
    }
    model_spec, pytorch_net = model_constructors.build_pytorch_net(
        feature_specs=Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL[
            "features"
        ],
        product_sets=Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL[
            "product_sets"
        ],
        float_feature_order=Datasets.DATA_COUNTRY_CATEG_BINARY_REWARD[
            "final_float_feature_order"
        ],
        id_feature_order=Datasets.DATA_COUNTRY_CATEG_BINARY_REWARD[
            "final_id_feature_order"
        ],
        reward_type=reward_type,
        layers=self.model_params["layers"],
        activations=self.model_params["activations"],
        input_dim=train_bandit.num_float_dim(Datasets.DATA_COUNTRY_CATEG_BINARY_REWARD),
    )
    pre_serialized_predictor = BanditPredictor(
        experiment_params=Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL,
        float_feature_order=Datasets.DATA_COUNTRY_CATEG_BINARY_REWARD[
            "float_feature_order"
        ],
        id_feature_order=Datasets.DATA_COUNTRY_CATEG_BINARY_REWARD["id_feature_order"],
        id_feature_str_to_int_map=Datasets.DATA_COUNTRY_CATEG_BINARY_REWARD[
            "id_feature_str_to_int_map"
        ],
        transforms=Datasets.DATA_COUNTRY_CATEG_BINARY_REWARD["transforms"],
        imputers=Datasets.DATA_COUNTRY_CATEG_BINARY_REWARD["imputers"],
        model=pytorch_net,
        model_type=self.model_type,
        reward_type=reward_type,
        model_spec=model_spec,
    )
    skorch_net = model_trainers.fit_custom_pytorch_module_w_skorch(
        reward_type=reward_type,
        model=pre_serialized_predictor.model,
        X=X_COUNTRY_CATEG_BINARY_REWARD["X_train"],
        y=X_COUNTRY_CATEG_BINARY_REWARD["y_train"],
        hyperparams=self.model_params,
    )
    pre_serialized_predictor.config_to_file(self.tmp_config_path)
    pre_serialized_predictor.model_to_file(self.tmp_net_path)
    post_serialized_predictor = BanditPredictor.predictor_from_file(
        self.tmp_config_path, self.tmp_net_path
    )
    pre_pred = pre_serialized_predictor.predict(json.loads(test_input.context))
    post_pred = post_serialized_predictor.predict(json.loads(test_input.context))
    assert np.allclose(pre_pred["scores"], post_pred["scores"], self.tol)
    assert pre_pred["ids"] == post_pred["ids"]

    # add a test case for missing features in provided context
    pre_pred_missing_feature = pre_serialized_predictor.predict({})
    post_pred_missing_feature = post_serialized_predictor.predict({})
    assert np.allclose(
        pre_pred_missing_feature["scores"],
        post_pred_missing_feature["scores"],
        self.tol,
    )
    assert pre_pred_missing_feature["ids"] == post_pred_missing_feature["ids"]

    # add a test case for garbage feature keys provided in context
    pre_pred_garbage_feature = pre_serialized_predictor.predict({"blah": 42})
    post_pred_garbage_feature = post_serialized_predictor.predict({"blah": 42})
    assert np.allclose(
        pre_pred_garbage_feature["scores"],
        post_pred_garbage_feature["scores"],
        self.tol,
    )
    assert pre_pred_garbage_feature["ids"] == post_pred_garbage_feature["ids"]
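# Hypothetical helper (not part of the original tests) capturing the comparison
# the tests above repeat: predict() is treated as returning a dict with parallel
# "scores" and "ids" collections, and pre/post-serialization outputs must match.
def assert_predictions_match(pred_a: dict, pred_b: dict, tol: float) -> None:
    # scores compared numerically within tolerance, ids compared exactly
    assert np.allclose(pred_a["scores"], pred_b["scores"], tol)
    assert pred_a["ids"] == pred_b["ids"]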
def train(
    ml_params: Dict,
    experiment_params: Dict,
    model_name: str = None,
    predictor_save_dir: str = None,
    s3_bucket_to_write_to: str = None,
):
    logger.info("Initializing data reader...")
    data_reader = BigQueryReader(
        credential_path=ml_params["data_reader"]["credential_path"],
        bq_project=ml_params["data_reader"]["bq_project"],
        bq_dataset=ml_params["data_reader"]["bq_dataset"],
        decisions_table=ml_params["data_reader"]["decisions_table"],
        rewards_table=ml_params["data_reader"]["rewards_table"],
        decisions_ds_start=ml_params["data_reader"]["decisions_ds_start"],
        decisions_ds_end=ml_params["data_reader"]["decisions_ds_end"],
        rewards_ds_end=ml_params["data_reader"]["rewards_ds_end"],
        experiment_id=experiment_params["experiment_id"],
    )
    raw_data = data_reader.get_training_data()

    if len(raw_data) == 0:
        logger.error("Got no rows of training data. Training aborted.")
        sys.exit()

    logger.info(f"Got {len(raw_data)} rows of training data.")
    logger.info(raw_data.head())

    data = preprocessor.preprocess_data(
        raw_data, ml_params["data_reader"]["reward_function"], experiment_params
    )
    X, y = preprocessor.data_to_pytorch(data)

    net_spec, pytorch_net = build_pytorch_net(
        feature_specs=experiment_params["features"],
        product_sets=experiment_params["product_sets"],
        float_feature_order=data["final_float_feature_order"],
        id_feature_order=data["final_id_feature_order"],
        layers=ml_params["model"]["layers"],
        activations=ml_params["model"]["activations"],
        dropout_ratio=ml_params["model"]["dropout_ratio"],
        input_dim=num_float_dim(data),
    )
    logger.info(f"Initialized model: {pytorch_net}")
    logger.info(f"Starting training: {ml_params['max_epochs']} epochs")

    predictor = BanditPredictor(
        experiment_params=experiment_params,
        float_feature_order=data["float_feature_order"],
        id_feature_order=data["id_feature_order"],
        id_feature_str_to_int_map=data["id_feature_str_to_int_map"],
        transforms=data["transforms"],
        imputers=data["imputers"],
        net=pytorch_net,
        net_spec=net_spec,
    )
    skorch_net = fit_custom_pytorch_module_w_skorch(
        module=predictor.net, X=X, y=y, hyperparams=ml_params
    )

    if predictor_save_dir is not None:
        logger.info("Saving predictor artifacts to disk...")
        experiment_id = experiment_params.get("experiment_id", "test")
        model_name = model_name or experiment_params.get("model_name", "model")
        save_dir = f"{predictor_save_dir}/{experiment_id}"
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        predictor_net_path = f"{save_dir}/{model_name}.pt"
        predictor_config_path = f"{save_dir}/{model_name}.json"
        predictor.config_to_file(predictor_config_path)
        predictor.net_to_file(predictor_net_path)

        if s3_bucket_to_write_to is not None:
            logger.info("Writing predictor artifacts to s3...")
            # Assumes aws credentials stored in ~/.aws/credentials that looks like:
            # [default]
            # aws_access_key_id = YOUR_ACCESS_KEY
            # aws_secret_access_key = YOUR_SECRET_KEY
            dir_to_zip = save_dir
            output_path = save_dir
            shutil.make_archive(output_path, "zip", dir_to_zip)
            s3_client = boto3.client("s3")
            s3_client.upload_file(
                Filename=f"{output_path}.zip",
                Bucket=s3_bucket_to_write_to,
                Key=f"{experiment_id}.zip",
            )