Example #1
    def test_same_predictions_country_as_categorical(self):
        raw_data = shuffle(Datasets._raw_data)
        rand_idx = 0
        test_input = raw_data.iloc[rand_idx]

        data = preprocessor.preprocess_data(
            raw_data,
            self.ml_params["data_reader"]["reward_function"],
            Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL,
            shuffle_data=False,  # don't shuffle so we can test the same observation
        )

        _X, _y = preprocessor.data_to_pytorch(data)

        X_COUNTRY_CATEG = {
            "X_train": {"X_float": _X["X_float"][: Datasets._offset]},
            "y_train": _y[: Datasets._offset],
            "X_test": {"X_float": _X["X_float"][Datasets._offset :]},
            "y_test": _y[Datasets._offset :],
        }

        net_spec, pytorch_net = train_bandit.build_pytorch_net(
            feature_specs=Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL[
                "features"
            ],
            product_sets=Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL[
                "product_sets"
            ],
            float_feature_order=Datasets.DATA_COUNTRY_CATEG[
                "final_float_feature_order"
            ],
            id_feature_order=Datasets.DATA_COUNTRY_CATEG["final_id_feature_order"],
            layers=self.ml_params["model"]["layers"],
            activations=self.ml_params["model"]["activations"],
            input_dim=train_bandit.num_float_dim(Datasets.DATA_COUNTRY_CATEG),
        )

        pre_serialized_predictor = BanditPredictor(
            experiment_params=Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL,
            float_feature_order=Datasets.DATA_COUNTRY_CATEG["float_feature_order"],
            id_feature_order=Datasets.DATA_COUNTRY_CATEG["id_feature_order"],
            id_feature_str_to_int_map=Datasets.DATA_COUNTRY_CATEG[
                "id_feature_str_to_int_map"
            ],
            transforms=Datasets.DATA_COUNTRY_CATEG["transforms"],
            imputers=Datasets.DATA_COUNTRY_CATEG["imputers"],
            net=pytorch_net,
            net_spec=net_spec,
        )

        skorch_net = train_bandit.fit_custom_pytorch_module_w_skorch(
            module=pre_serialized_predictor.net,
            X=X_COUNTRY_CATEG["X_train"],
            y=X_COUNTRY_CATEG["y_train"],
            hyperparams=self.ml_params,
        )

        pre_serialized_predictor.config_to_file(self.tmp_config_path)
        pre_serialized_predictor.net_to_file(self.tmp_net_path)

        post_serialized_predictor = BanditPredictor.predictor_from_file(
            self.tmp_config_path, self.tmp_net_path
        )

        pre_pred = pre_serialized_predictor.predict(json.loads(test_input.context))
        post_pred = post_serialized_predictor.predict(json.loads(test_input.context))

        assert np.allclose(pre_pred["scores"], post_pred["scores"], self.tol)
        assert pre_pred["ids"] == post_pred["ids"]
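
A minimal usage sketch of the save/reload round trip this test exercises. This is not part of the source: the file paths and context keys are hypothetical, and predict() is assumed to return parallel "ids" and "scores" lists, as the assertions above imply.

import numpy as np

# Hypothetical artifact paths; the test uses self.tmp_config_path / self.tmp_net_path.
predictor = BanditPredictor.predictor_from_file("model.json", "model.pt")
# Hypothetical context keys for the height dataset.
prediction = predictor.predict({"country": "usa", "year": 1990})
best_idx = int(np.argmax(prediction["scores"]))  # one score per candidate id
best_id = prediction["ids"][best_idx]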
Example #2
def train(
    ml_params: Dict,
    experiment_params: Dict,
    predictor_save_dir: str = None,
    s3_bucket_to_write_to: str = None,
):

    logger.info("Initializing data reader...")
    data_reader = BigQueryReader(
        credential_path=ml_params["data_reader"]["credential_path"],
        bq_project=ml_params["data_reader"]["bq_project"],
        bq_dataset=ml_params["data_reader"]["bq_dataset"],
        decisions_ds_start=ml_params["data_reader"]["decisions_ds_start"],
        decisions_ds_end=ml_params["data_reader"]["decisions_ds_end"],
        rewards_ds_end=ml_params["data_reader"]["rewards_ds_end"],
        reward_function=ml_params["data_reader"]["reward_function"],
        experiment_id=experiment_params["experiment_id"],
    )

    raw_data = data_reader.get_training_data()

    if len(raw_data) == 0:
        logger.error(f"Got no raws of training data. Training aborted.")
        sys.exit()
    logger.info(f"Got {len(raw_data)} rows of training data.")
    logger.info(raw_data.head())

    utils.fancy_print("Kicking off data preprocessing")

    # always add decision as a feature to use if not using all features
    features_to_use = ml_params["data_reader"].get("features_to_use", ["*"])
    if features_to_use != ["*"]:
        features_to_use.append(preprocessor.DECISION_FEATURE_NAME)
    features_to_use = list(set(features_to_use))
    dense_features_to_use = ml_params["data_reader"].get("dense_features_to_use", ["*"])

    data = preprocessor.preprocess_data(
        raw_data,
        experiment_params,
        ml_params["reward_type"],
        features_to_use,
        dense_features_to_use,
    )
    X, y = preprocessor.data_to_pytorch(data)

    model_type = ml_params["model_type"]
    model_params = ml_params["model_params"][model_type]
    reward_type = ml_params["reward_type"]

    feature_importance_params = ml_params.get("feature_importance", {})
    if feature_importance_params.get("calc_feature_importance", False):
        # calculate feature importances - only works on non id list features at this time
        utils.fancy_print("Calculating feature importances")
        feature_scores = feature_importance.calculate_feature_importance(
            reward_type=reward_type,
            feature_names=data["final_float_feature_order"],
            X=X,
            y=y,
        )
        feature_importance.display_feature_importances(feature_scores)

        # TODO: Make keeping the top "n" features work in predictor. Right now
        # using this feature breaks predictor, so don't use it in a final model,
        # just use it to experiment in seeing how model performance is.
        if feature_importance_params.get("keep_only_top_n", False):
            utils.fancy_print("Keeping only top N features")
            X, final_float_feature_order = feature_importance.keep_top_n_features(
                n=feature_importance_params["n"],
                X=X,
                feature_order=data["final_float_feature_order"],
                feature_scores=feature_scores,
            )
            data["final_float_feature_order"] = final_float_feature_order
            logger.info(f"Keeping top {feature_importance_params['n']} features:")
            logger.info(final_float_feature_order)

    utils.fancy_print("Starting training")
    # build the model
    if model_type == "neural_bandit":
        model_spec, model = model_constructors.build_pytorch_net(
            feature_specs=experiment_params["features"],
            product_sets=experiment_params["product_sets"],
            float_feature_order=data["final_float_feature_order"],
            id_feature_order=data["final_id_feature_order"],
            reward_type=reward_type,
            layers=model_params["layers"],
            activations=model_params["activations"],
            dropout_ratio=model_params["dropout_ratio"],
            input_dim=num_float_dim(data),
        )
        logger.info(f"Initialized model: {model}")
    elif model_type == "linear_bandit":
        assert utils.pset_features_have_dense(experiment_params["features"]), (
            "Linear models require that product set features have associated"
            "dense representations."
        )
        model = model_constructors.build_linear_model(
            reward_type=reward_type,
            penalty=model_params.get("penalty"),
            alpha=model_params.get("alpha"),
        )
        model_spec = None
    elif model_type == "gbdt_bandit":
        assert utils.pset_features_have_dense(experiment_params["features"]), (
            "GBDT models require that product set features have associated"
            "dense representations."
        )
        model = model_constructors.build_gbdt(
            reward_type=reward_type,
            learning_rate=model_params["learning_rate"],
            n_estimators=model_params["n_estimators"],
            max_depth=model_params["max_depth"],
        )
        model_spec = None
    elif model_type == "random_forest_bandit":
        assert utils.pset_features_have_dense(experiment_params["features"]), (
            "Random forest models require that product set features have associated"
            "dense representations."
        )
        model = model_constructors.build_random_forest(
            reward_type=reward_type,
            n_estimators=model_params["n_estimators"],
            max_depth=model_params["max_depth"],
        )
        model_spec = None
    else:
        raise ValueError(f"Unsupported model_type: {model_type}")

    # build the predictor
    predictor = BanditPredictor(
        experiment_params=experiment_params,
        float_feature_order=data["float_feature_order"],
        id_feature_order=data["id_feature_order"],
        id_feature_str_to_int_map=data["id_feature_str_to_int_map"],
        transforms=data["transforms"],
        imputers=data["imputers"],
        model=model,
        model_type=model_type,
        reward_type=reward_type,
        model_spec=model_spec,
        dense_features_to_use=dense_features_to_use,
    )

    # train the model
    if model_type == "neural_bandit":
        logger.info(f"Training {model_type} for {model_params['max_epochs']} epochs")
        skorch_net = model_trainers.fit_custom_pytorch_module_w_skorch(
            reward_type=reward_type,
            model=predictor.model,
            X=X,
            y=y,
            hyperparams=model_params,
            train_percent=ml_params["train_percent"],
        )
    elif model_type in ("gbdt_bandit", "random_forest_bandit", "linear_bandit"):
        logger.info(f"Training {model_type}")
        sklearn_model, _ = model_trainers.fit_sklearn_model(
            reward_type=reward_type,
            model=model,
            X=X,
            y=y,
            train_percent=ml_params["train_percent"],
        )

    if predictor_save_dir is not None:
        logger.info("Saving predictor artifacts to disk...")
        experiment_id = experiment_params.get("experiment_id", "test")
        model_name = ml_params.get("model_name", "model")

        save_dir = f"{predictor_save_dir}/{experiment_id}"
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        predictor_net_path = f"{save_dir}/{model_name}.pt"
        predictor_config_path = f"{save_dir}/{model_name}.json"
        predictor.config_to_file(predictor_config_path)
        predictor.model_to_file(predictor_net_path)

        if s3_bucket_to_write_to is not None:
            logger.info("Writing predictor artifacts to s3...")
            # Assumes AWS credentials are stored in ~/.aws/credentials like:
            # [default]
            # aws_access_key_id = YOUR_ACCESS_KEY
            # aws_secret_access_key = YOUR_SECRET_KEY
            dir_to_zip = save_dir
            output_path = save_dir
            shutil.make_archive(output_path, "zip", dir_to_zip)
            s3_client = boto3.client("s3")
            s3_client.upload_file(
                Filename=f"{output_path}.zip",
                Bucket=s3_bucket_to_write_to,
                Key=f"{experiment_id}.zip",
            )
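
For reference, a sketch of the ml_params dict this train() entry point reads. Every key below is looked up somewhere in the function above; the values are illustrative placeholders, not a confirmed schema. experiment_params is assumed to carry "experiment_id", "features", and "product_sets", which the function accesses directly.

# Hedged config sketch: keys mirror the lookups in train() above,
# values are placeholders only.
ml_params = {
    "data_reader": {
        "credential_path": "path/to/bq_credentials.json",
        "bq_project": "my-project",
        "bq_dataset": "my_dataset",
        "decisions_ds_start": "2020-01-01",
        "decisions_ds_end": "2020-02-01",
        "rewards_ds_end": "2020-02-08",
        "reward_function": {"click": 1},  # placeholder shape
        "features_to_use": ["*"],
        "dense_features_to_use": ["*"],
    },
    "reward_type": "binary",
    "model_type": "neural_bandit",
    "model_params": {
        "neural_bandit": {
            "layers": [64, 32],  # placeholder architecture
            "activations": ["relu", "relu"],
            "dropout_ratio": 0.1,
            "max_epochs": 50,
        }
    },
    "train_percent": 0.8,
    "feature_importance": {"calc_feature_importance": False},
    "model_name": "model",
}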
Example #3
class Datasets:
    TEST_DIR = os.path.dirname(os.path.abspath(__file__))
    TEST_DATASET_DIR = "datasets"

    # continuous reward
    TEST_DATASET_FILENAME = "height_dataset.csv"
    DATASET_PATH = os.path.join(TEST_DIR, TEST_DATASET_DIR,
                                TEST_DATASET_FILENAME)

    _raw_data = pd.read_csv(DATASET_PATH)
    _offset = int(len(_raw_data) * Params.ML_PARAMS["train_percent"])

    # dataset for country as categorical variable
    DATA_COUNTRY_CATEG = preprocessor.preprocess_data(
        _raw_data,
        Params.ML_PARAMS["data_reader"]["reward_function"],
        Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL,
    )
    _X, _y = preprocessor.data_to_pytorch(DATA_COUNTRY_CATEG)
    X_COUNTRY_CATEG = {
        "X_train": {
            "X_float": _X["X_float"][:_offset]
        },
        "y_train": _y[:_offset],
        "X_test": {
            "X_float": _X["X_float"][_offset:]
        },
        "y_test": _y[_offset:],
    }

    # dataset for country as ID list variable
    DATA_COUNTRY_ID_LIST = preprocessor.preprocess_data(
        _raw_data,
        Params.ML_PARAMS["data_reader"]["reward_function"],
        Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_ID_LIST,
    )
    _X, _y = preprocessor.data_to_pytorch(DATA_COUNTRY_ID_LIST)
    X_COUNTRY_ID_LIST = {
        "X_train": {
            "X_float": _X["X_float"][:_offset],
            "X_id_list": _X["X_id_list"][:_offset],
            "X_id_list_idxs": _X["X_id_list_idxs"][:_offset],
        },
        "y_train": _y[:_offset],
        "X_test": {
            "X_float": _X["X_float"][_offset:],
            "X_id_list": _X["X_id_list"][_offset:],
            "X_id_list_idxs": _X["X_id_list_idxs"][_offset:],
        },
        "y_test": _y[_offset:],
    }

    # dataset for country as dense ID list variable
    DATA_COUNTRY_DENSE_ID_LIST = preprocessor.preprocess_data(
        _raw_data,
        Params.ML_PARAMS["data_reader"]["reward_function"],
        Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_DENSE_ID_LIST,
    )
    _X, _y = preprocessor.data_to_pytorch(DATA_COUNTRY_DENSE_ID_LIST)
    X_COUNTRY_DENSE_ID_LIST = {
        "X_train": {
            "X_float": _X["X_float"][:_offset]
        },
        "y_train": _y[:_offset],
        "X_test": {
            "X_float": _X["X_float"][_offset:]
        },
        "y_test": _y[_offset:],
    }

    # dataset for country as ID list AND decision as ID list variables
    DATA_COUNTRY_AND_DECISION_ID_LIST = preprocessor.preprocess_data(
        _raw_data,
        Params.ML_PARAMS["data_reader"]["reward_function"],
        Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AND_DECISION_AS_ID_LIST,
    )
    _X, _y = preprocessor.data_to_pytorch(DATA_COUNTRY_AND_DECISION_ID_LIST)
    X_COUNTRY_AND_DECISION_ID_LIST = {
        "X_train": {
            "X_float": _X["X_float"][:_offset],
            "X_id_list": _X["X_id_list"][:_offset],
            "X_id_list_idxs": _X["X_id_list_idxs"][:_offset],
        },
        "y_train": _y[:_offset],
        "X_test": {
            "X_float": _X["X_float"][_offset:],
            "X_id_list": _X["X_id_list"][_offset:],
            "X_id_list_idxs": _X["X_id_list_idxs"][_offset:],
        },
        "y_test": _y[_offset:],
    }

    # binary reward
    TEST_BINARY_REWARD_DATASET_FILENAME = "height_dataset_binary.csv"
    BINARY_REWARD_DATASET_PATH = os.path.join(
        TEST_DIR, TEST_DATASET_DIR, TEST_BINARY_REWARD_DATASET_FILENAME)

    _raw_data_binary_reward = pd.read_csv(BINARY_REWARD_DATASET_PATH)
    _offset_binary_reward = int(
        len(_raw_data_binary_reward) * Params.ML_PARAMS["train_percent"])

    # dataset for country as categorical variable & binary reward
    DATA_COUNTRY_CATEG_BINARY_REWARD = preprocessor.preprocess_data(
        _raw_data_binary_reward,
        Params.REWARD_FUNCTION_BINARY,
        Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL,
    )
    _X_binary_reward, _y_binary_reward = preprocessor.data_to_pytorch(
        DATA_COUNTRY_CATEG_BINARY_REWARD)
    X_COUNTRY_CATEG_BINARY_REWARD = {
        "X_train": {
            "X_float": _X_binary_reward["X_float"][:_offset_binary_reward]
        },
        "y_train": _y_binary_reward[:_offset_binary_reward],
        "X_test": {
            "X_float": _X_binary_reward["X_float"][_offset_binary_reward:]
        },
        "y_test": _y_binary_reward[_offset_binary_reward:],
    }
Example #4
def train(
    ml_params: Dict,
    experiment_params: Dict,
    model_name: str = None,
    predictor_save_dir: str = None,
    s3_bucket_to_write_to: str = None,
):

    logger.info("Initializing data reader...")
    data_reader = BigQueryReader(
        credential_path=ml_params["data_reader"]["credential_path"],
        bq_project=ml_params["data_reader"]["bq_project"],
        bq_dataset=ml_params["data_reader"]["bq_dataset"],
        decisions_table=ml_params["data_reader"]["decisions_table"],
        rewards_table=ml_params["data_reader"]["rewards_table"],
        decisions_ds_start=ml_params["data_reader"]["decisions_ds_start"],
        decisions_ds_end=ml_params["data_reader"]["decisions_ds_end"],
        rewards_ds_end=ml_params["data_reader"]["rewards_ds_end"],
        experiment_id=experiment_params["experiment_id"],
    )

    raw_data = data_reader.get_training_data()

    if len(raw_data) == 0:
        logger.error(f"Got no raws of training data. Training aborted.")
        sys.exit()
    logger.info(f"Got {len(raw_data)} rows of training data.")
    logger.info(raw_data.head())

    logger.info("Kicking off data preprocessing...")
    data = preprocessor.preprocess_data(
        raw_data, ml_params["data_reader"]["reward_function"],
        experiment_params)
    X, y = preprocessor.data_to_pytorch(data)

    model_type = ml_params["model_type"]
    model_params = ml_params["model_params"][model_type]
    reward_type = ml_params["reward_type"]

    # build the model
    if model_type == "neural_bandit":
        model_spec, model = model_constructors.build_pytorch_net(
            feature_specs=experiment_params["features"],
            product_sets=experiment_params["product_sets"],
            float_feature_order=data["final_float_feature_order"],
            id_feature_order=data["final_id_feature_order"],
            reward_type=reward_type,
            layers=model_params["layers"],
            activations=model_params["activations"],
            dropout_ratio=model_params["dropout_ratio"],
            input_dim=num_float_dim(data),
        )
        logger.info(f"Initialized model: {model}")
    elif model_type == "gbdt_bandit":
        assert utils.pset_features_have_dense(experiment_params["features"]), (
            "GBDT models require that product set features have associated"
            "dense reprenstations.")
        model = model_constructors.build_gbdt(
            reward_type=reward_type,
            learning_rate=model_params["learning_rate"],
            n_estimators=model_params["n_estimators"],
            max_depth=model_params["max_depth"],
        )
        model_spec = None
    elif model_type == "random_forest_bandit":
        assert utils.pset_features_have_dense(experiment_params["features"]), (
            "Random forest models require that product set features have associated"
            "dense reprenstations.")
        model = model_constructors.build_random_forest(
            reward_type=reward_type,
            n_estimators=model_params["n_estimators"],
            max_depth=model_params["max_depth"],
        )
        model_spec = None
    else:
        raise ValueError(f"Unsupported model_type: {model_type}")

    # build the predictor
    predictor = BanditPredictor(
        experiment_params=experiment_params,
        float_feature_order=data["float_feature_order"],
        id_feature_order=data["id_feature_order"],
        id_feature_str_to_int_map=data["id_feature_str_to_int_map"],
        transforms=data["transforms"],
        imputers=data["imputers"],
        model=model,
        reward_type=reward_type,
        model_spec=model_spec,
    )

    # train the model
    if model_type == "neural_bandit":
        logger.info(f"Starting training: {model_params} epochs")
        skorch_net = model_trainers.fit_custom_pytorch_module_w_skorch(
            reward_type=reward_type,
            model=predictor.model,
            X=X,
            y=y,
            hyperparams=model_params,
            train_percent=ml_params["train_percent"],
        )
    elif model_type in ("gbdt_bandit", "random_forest_bandit"):
        logger.info(f"Starting training: {model_type}")
        sklearn_model, _ = model_trainers.fit_sklearn_model(
            reward_type=reward_type,
            model=model,
            X=X,
            y=y,
            train_percent=ml_params["train_percent"],
        )

    if predictor_save_dir is not None:
        logger.info("Saving predictor artifacts to disk...")
        experiment_id = experiment_params.get("experiment_id", "test")
        model_name = model_name or experiment_params.get("model_name", "model")

        save_dir = f"{predictor_save_dir}/{experiment_id}"
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        predictor_net_path = f"{save_dir}/{model_name}.pt"
        predictor_config_path = f"{save_dir}/{model_name}.json"
        predictor.config_to_file(predictor_config_path)
        predictor.model_to_file(predictor_net_path)

        if s3_bucket_to_write_to is not None:
            logger.info("Writing predictor artifacts to s3...")
            # Assumes AWS credentials are stored in ~/.aws/credentials like:
            # [default]
            # aws_access_key_id = YOUR_ACCESS_KEY
            # aws_secret_access_key = YOUR_SECRET_KEY
            dir_to_zip = save_dir
            output_path = save_dir
            shutil.make_archive(output_path, "zip", dir_to_zip)
            s3_client = boto3.client("s3")
            s3_client.upload_file(
                Filename=f"{output_path}.zip",
                Bucket=s3_bucket_to_write_to,
                Key=f"{experiment_id}.zip",
            )
Example #5
class Datasets:
    TEST_DIR = os.path.dirname(os.path.abspath(__file__))
    TEST_DATASET_DIR = "datasets"
    TEST_DATASET_FILENAME = "height_dataset.csv"
    DATASET_PATH = os.path.join(TEST_DIR, TEST_DATASET_DIR, TEST_DATASET_FILENAME)

    _raw_data = pd.read_csv(DATASET_PATH)
    _offset = int(len(_raw_data) * Params.SHARED_PARAMS["train_test_split"])

    # dataset for country as categorical variable
    DATA_COUNTRY_CATEG = preprocessor.preprocess_data(
        _raw_data, Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL
    )
    _X, _y = preprocessor.data_to_pytorch(DATA_COUNTRY_CATEG)
    X_COUNTRY_CATEG = {
        "X_train": {"X_float": _X["X_float"][:_offset]},
        "y_train": _y[:_offset],
        "X_test": {"X_float": _X["X_float"][_offset:]},
        "y_test": _y[_offset:],
    }

    # dataset for country as ID list variable
    DATA_COUNTRY_ID_LIST = preprocessor.preprocess_data(
        _raw_data, Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_ID_LIST
    )
    _X, _y = preprocessor.data_to_pytorch(DATA_COUNTRY_ID_LIST)
    X_COUNTRY_ID_LIST = {
        "X_train": {
            "X_float": _X["X_float"][:_offset],
            "X_id_list": _X["X_id_list"][:_offset],
            "X_id_list_idxs": _X["X_id_list_idxs"][:_offset],
        },
        "y_train": _y[:_offset],
        "X_test": {
            "X_float": _X["X_float"][_offset:],
            "X_id_list": _X["X_id_list"][_offset:],
            "X_id_list_idxs": _X["X_id_list_idxs"][_offset:],
        },
        "y_test": _y[_offset:],
    }

    # dataset for country as dense ID list variable
    DATA_COUNTRY_DENSE_ID_LIST = preprocessor.preprocess_data(
        _raw_data, Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_DENSE_ID_LIST
    )
    _X, _y = preprocessor.data_to_pytorch(DATA_COUNTRY_DENSE_ID_LIST)
    X_COUNTRY_DENSE_ID_LIST = {
        "X_train": {"X_float": _X["X_float"][:_offset]},
        "y_train": _y[:_offset],
        "X_test": {"X_float": _X["X_float"][_offset:]},
        "y_test": _y[_offset:],
    }

    # dataset for country as ID list AND decision as ID list variables
    DATA_COUNTRY_AND_DECISION_ID_LIST = preprocessor.preprocess_data(
        _raw_data, Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AND_DECISION_AS_ID_LIST
    )
    _X, _y = preprocessor.data_to_pytorch(DATA_COUNTRY_AND_DECISION_ID_LIST)
    X_COUNTRY_AND_DECISION_ID_LIST = {
        "X_train": {
            "X_float": _X["X_float"][:_offset],
            "X_id_list": _X["X_id_list"][:_offset],
            "X_id_list_idxs": _X["X_id_list_idxs"][:_offset],
        },
        "y_train": _y[:_offset],
        "X_test": {
            "X_float": _X["X_float"][_offset:],
            "X_id_list": _X["X_id_list"][_offset:],
            "X_id_list_idxs": _X["X_id_list_idxs"][_offset:],
        },
        "y_test": _y[_offset:],
    }
Example #6
    def test_same_predictions_country_as_categorical_sklearn_model_binary_reward(
            self):
        """
        Tests sklearn Linear model + binary reward.
        """
        reward_type = "binary"

        raw_data = shuffle(Datasets._raw_data_binary_reward)
        rand_idx = 0
        test_input = raw_data.iloc[rand_idx]

        data = preprocessor.preprocess_data(
            raw_data,
            Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL,
            reward_type,
            shuffle_data=False,  # don't shuffle so we can test the same observation
        )

        _X, _y = preprocessor.data_to_pytorch(data)
        X_COUNTRY_CATEG_BINARY_REWARD = {
            "X_train": {
                "X_float": _X["X_float"][:Datasets._offset_binary_reward]
            },
            "y_train": _y[:Datasets._offset_binary_reward],
            "X_test": {
                "X_float": _X["X_float"][Datasets._offset_binary_reward:]
            },
            "y_test": _y[Datasets._offset_binary_reward:],
        }

        model = model_constructors.build_linear_model(reward_type=reward_type)

        pre_serialized_predictor = BanditPredictor(
            experiment_params=Params.
            EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL,
            float_feature_order=Datasets.
            DATA_COUNTRY_CATEG_BINARY_REWARD["float_feature_order"],
            id_feature_order=Datasets.
            DATA_COUNTRY_CATEG_BINARY_REWARD["id_feature_order"],
            id_feature_str_to_int_map=Datasets.
            DATA_COUNTRY_CATEG_BINARY_REWARD["id_feature_str_to_int_map"],
            transforms=Datasets.DATA_COUNTRY_CATEG_BINARY_REWARD["transforms"],
            imputers=Datasets.DATA_COUNTRY_CATEG_BINARY_REWARD["imputers"],
            model=model,
            model_type="linear_bandit",
            reward_type=reward_type,
            model_spec=None,
        )

        sklearn_model = model_trainers.fit_sklearn_model(
            reward_type=reward_type,
            model=model,
            X=X_COUNTRY_CATEG_BINARY_REWARD["X_train"],
            y=X_COUNTRY_CATEG_BINARY_REWARD["y_train"],
        )

        pre_serialized_predictor.config_to_file(self.tmp_config_path)
        pre_serialized_predictor.model_to_file(self.tmp_net_path)

        post_serialized_predictor = BanditPredictor.predictor_from_file(
            self.tmp_config_path, self.tmp_net_path)

        pre_pred = pre_serialized_predictor.predict(
            json.loads(test_input.context))
        post_pred = post_serialized_predictor.predict(
            json.loads(test_input.context))

        assert np.allclose(pre_pred["scores"], post_pred["scores"], self.tol)
        assert pre_pred["ids"] == post_pred["ids"]
Example #7
    def test_same_predictions_country_as_categorical_binary_reward(self):
        reward_type = "binary"

        raw_data = shuffle(Datasets._raw_data_binary_reward)
        rand_idx = 0
        test_input = raw_data.iloc[rand_idx]

        data = preprocessor.preprocess_data(
            raw_data,
            Params.EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL,
            reward_type,
            shuffle_data=False,  # don't shuffle so we can test the same observation
        )

        _X, _y = preprocessor.data_to_pytorch(data)
        X_COUNTRY_CATEG_BINARY_REWARD = {
            "X_train": {
                "X_float": _X["X_float"][:Datasets._offset_binary_reward]
            },
            "y_train": _y[:Datasets._offset_binary_reward],
            "X_test": {
                "X_float": _X["X_float"][Datasets._offset_binary_reward:]
            },
            "y_test": _y[Datasets._offset_binary_reward:],
        }

        model_spec, pytorch_net = model_constructors.build_pytorch_net(
            feature_specs=Params.
            EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL["features"],
            product_sets=Params.
            EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL["product_sets"],
            float_feature_order=Datasets.
            DATA_COUNTRY_CATEG_BINARY_REWARD["final_float_feature_order"],
            id_feature_order=Datasets.
            DATA_COUNTRY_CATEG_BINARY_REWARD["final_id_feature_order"],
            reward_type=reward_type,
            layers=self.model_params["layers"],
            activations=self.model_params["activations"],
            input_dim=train_bandit.num_float_dim(
                Datasets.DATA_COUNTRY_CATEG_BINARY_REWARD),
        )

        pre_serialized_predictor = BanditPredictor(
            experiment_params=Params.
            EXPERIMENT_SPECIFIC_PARAMS_COUNTRY_AS_CATEGORICAL,
            float_feature_order=Datasets.
            DATA_COUNTRY_CATEG_BINARY_REWARD["float_feature_order"],
            id_feature_order=Datasets.
            DATA_COUNTRY_CATEG_BINARY_REWARD["id_feature_order"],
            id_feature_str_to_int_map=Datasets.
            DATA_COUNTRY_CATEG_BINARY_REWARD["id_feature_str_to_int_map"],
            transforms=Datasets.DATA_COUNTRY_CATEG_BINARY_REWARD["transforms"],
            imputers=Datasets.DATA_COUNTRY_CATEG_BINARY_REWARD["imputers"],
            model=pytorch_net,
            model_type=self.model_type,
            reward_type=reward_type,
            model_spec=model_spec,
        )

        skorch_net = model_trainers.fit_custom_pytorch_module_w_skorch(
            reward_type=reward_type,
            model=pre_serialized_predictor.model,
            X=X_COUNTRY_CATEG_BINARY_REWARD["X_train"],
            y=X_COUNTRY_CATEG_BINARY_REWARD["y_train"],
            hyperparams=self.model_params,
        )

        pre_serialized_predictor.config_to_file(self.tmp_config_path)
        pre_serialized_predictor.model_to_file(self.tmp_net_path)

        post_serialized_predictor = BanditPredictor.predictor_from_file(
            self.tmp_config_path, self.tmp_net_path)

        pre_pred = pre_serialized_predictor.predict(
            json.loads(test_input.context))
        post_pred = post_serialized_predictor.predict(
            json.loads(test_input.context))

        assert np.allclose(pre_pred["scores"], post_pred["scores"], self.tol)
        assert pre_pred["ids"] == post_pred["ids"]

        # add a test case for missing features in provided context
        pre_pred_missing_feature = pre_serialized_predictor.predict({})
        post_pred_missing_feature = post_serialized_predictor.predict({})

        assert np.allclose(
            pre_pred_missing_feature["scores"],
            post_pred_missing_feature["scores"],
            self.tol,
        )
        assert pre_pred_missing_feature["ids"] == post_pred_missing_feature[
            "ids"]

        # add a test case for garbage feature keys provided in context
        pre_pred_garbage_feature = pre_serialized_predictor.predict(
            {"blah": 42})
        post_pred_garbage_feature = post_serialized_predictor.predict(
            {"blah": 42})

        assert np.allclose(
            pre_pred_garbage_feature["scores"],
            post_pred_garbage_feature["scores"],
            self.tol,
        )
        assert pre_pred_garbage_feature["ids"] == post_pred_garbage_feature[
            "ids"]
Example #8
def train(
    ml_params: Dict,
    experiment_params: Dict,
    model_name: str = None,
    predictor_save_dir: str = None,
    s3_bucket_to_write_to: str = None,
):

    logger.info("Initializing data reader...")
    data_reader = BigQueryReader(
        credential_path=ml_params["data_reader"]["credential_path"],
        bq_project=ml_params["data_reader"]["bq_project"],
        bq_dataset=ml_params["data_reader"]["bq_dataset"],
        decisions_table=ml_params["data_reader"]["decisions_table"],
        rewards_table=ml_params["data_reader"]["rewards_table"],
        decisions_ds_start=ml_params["data_reader"]["decisions_ds_start"],
        decisions_ds_end=ml_params["data_reader"]["decisions_ds_end"],
        rewards_ds_end=ml_params["data_reader"]["rewards_ds_end"],
        experiment_id=experiment_params["experiment_id"],
    )

    raw_data = data_reader.get_training_data()
    if len(raw_data) == 0:
        logger.error(f"Got no raws of training data. Training aborted.")
        sys.exit()
    logger.info(f"Got {len(raw_data)} rows of training data.")
    logger.info(raw_data.head())

    data = preprocessor.preprocess_data(
        raw_data, ml_params["data_reader"]["reward_function"],
        experiment_params)
    X, y = preprocessor.data_to_pytorch(data)

    net_spec, pytorch_net = build_pytorch_net(
        feature_specs=experiment_params["features"],
        product_sets=experiment_params["product_sets"],
        float_feature_order=data["final_float_feature_order"],
        id_feature_order=data["final_id_feature_order"],
        layers=ml_params["model"]["layers"],
        activations=ml_params["model"]["activations"],
        dropout_ratio=ml_params["model"]["dropout_ratio"],
        input_dim=num_float_dim(data),
    )
    logger.info(f"Initialized model: {pytorch_net}")

    logger.info(f"Starting training: {ml_params['max_epochs']} epochs")

    predictor = BanditPredictor(
        experiment_params=experiment_params,
        float_feature_order=data["float_feature_order"],
        id_feature_order=data["id_feature_order"],
        id_feature_str_to_int_map=data["id_feature_str_to_int_map"],
        transforms=data["transforms"],
        imputers=data["imputers"],
        net=pytorch_net,
        net_spec=net_spec,
    )

    skorch_net = fit_custom_pytorch_module_w_skorch(module=predictor.net,
                                                    X=X,
                                                    y=y,
                                                    hyperparams=ml_params)

    if predictor_save_dir is not None:
        logger.info("Saving predictor artifacts to disk...")
        experiment_id = experiment_params.get("experiment_id", "test")
        model_name = model_name or experiment_params.get("model_name", "model")

        save_dir = f"{predictor_save_dir}/{experiment_id}"
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        predictor_net_path = f"{save_dir}/{model_name}.pt"
        predictor_config_path = f"{save_dir}/{model_name}.json"
        predictor.config_to_file(predictor_config_path)
        predictor.net_to_file(predictor_net_path)

        if s3_bucket_to_write_to is not None:
            logger.info("Writing predictor artifacts to s3...")
            # Assumes AWS credentials are stored in ~/.aws/credentials like:
            # [default]
            # aws_access_key_id = YOUR_ACCESS_KEY
            # aws_secret_access_key = YOUR_SECRET_KEY
            dir_to_zip = save_dir
            output_path = save_dir
            shutil.make_archive(output_path, "zip", dir_to_zip)
            s3_client = boto3.client("s3")
            s3_client.upload_file(
                Filename=f"{output_path}.zip",
                Bucket=s3_bucket_to_write_to,
                Key=f"{experiment_id}.zip",
            )
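
To close the loop, a hedged sketch (not from the source) of downloading and unpacking the artifacts that the upload step above writes to S3. The bucket name, key, and local paths are placeholders; boto3's download_file and shutil.unpack_archive are standard APIs.

import shutil

import boto3

s3_client = boto3.client("s3")
# Key mirrors the f"{experiment_id}.zip" naming used by the upload above.
s3_client.download_file(
    Bucket="my-bucket", Key="my-experiment.zip", Filename="artifacts.zip"
)
shutil.unpack_archive("artifacts.zip", "artifacts")
# artifacts/ now holds {model_name}.json and {model_name}.pt, suitable for
# BanditPredictor.predictor_from_file.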