def g(f: t.Callable[[mx.VarArg(int)], t.List[int]], *xs: int) -> t.List[int]:
    return f(*xs)
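
# Illustrative usage sketch (an addition, not part of the fixture): any callable
# that accepts arbitrary positional ints satisfies the VarArg(int) parameter type,
# so a lambda collecting its arguments into a list can be forwarded through `g`.
assert g(lambda *xs: list(xs), 1, 2, 3) == [1, 2, 3]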
                return tc(*args, **kwargs)

            # This line will only be reached when a testcase was skipped.
            # Return `None` as the placeholder return value.
            return None

        setattr(wrapped, "_tbot_testcase", name)
        return typing.cast(F_tc, wrapped)

    return _named_testcase


F_lh = typing.TypeVar("F_lh", bound=typing.Callable[..., typing.Any])
F_lab = typing.Callable[
    [
        mypy.DefaultArg(typing.Optional[linux.Lab], "lab"),
        mypy.VarArg(typing.Any),
        mypy.KwArg(typing.Any),
    ],
    typing.Any,
]


def with_lab(tc: F_lh) -> F_lab:
    """
    Decorate a function to automatically supply the lab-host as an argument.

    The idea is that when using this decorator and calling the testcase
    without a lab-host, tbot will automatically acquire the default lab.

    **Example**::

        from tbot.machine import linux
def call(f: t.Callable[[mx.VarArg(str)], str], *args: str) -> str:
    return f(*args)
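
# Illustrative usage sketch (an addition, not part of the fixture): joining the
# forwarded strings is one callable matching Callable[[VarArg(str)], str].
assert call(lambda *parts: "-".join(parts), "a", "b", "c") == "a-b-c"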
def create0() -> t.Tuple[t.Callable[[t.List[int], mx.VarArg(int)], int], int, int]:
    return action0, 10, 20
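
# Illustrative usage sketch (an addition): `create0` packs a VarArg-typed callable
# together with two ints; a caller unpacks the tuple and forwards the ints as the
# variadic tail.  `action0` is defined elsewhere in the original module, so the
# call is shown only as a comment here.
#     f, x, y = create0()
#     result: int = f([1, 2, 3], x, y)
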
class NNPolicyLearner(BaseOfflinePolicyLearner):
    """Off-policy learner using a neural network whose objective function is an OPE estimator.

    Note
    --------
    The neural network is implemented in PyTorch.

    Parameters
    -----------
    n_actions: int
        Number of actions.

    len_list: int, default=1
        Length of a list of actions recommended in each impression.
        When Open Bandit Dataset is used, 3 should be set.
        Currently, len_list > 1 is not supported.

    dim_context: int
        Number of dimensions of context vectors.

    off_policy_objective: Callable[[VarArg[Any]], Tensor]
        Function returning the value of an OPE estimator.
        `BaseOffPolicyEstimator.estimate_policy_value_tensor` is supposed to be given here.

    hidden_layer_size: Tuple[int, ...], default=(100,)
        The i-th element specifies the size of the i-th layer.

    activation: str, default='relu'
        Activation function.
        Must be one of the following:

        - 'identity', the identity function, :math:`f(x) = x`.
        - 'logistic', the sigmoid function, :math:`f(x) = \\frac{1}{1 + \\exp(-x)}`.
        - 'tanh', the hyperbolic tangent function, :math:`f(x) = \\frac{\\exp(x) - \\exp(-x)}{\\exp(x) + \\exp(-x)}`.
        - 'relu', the rectified linear unit function, :math:`f(x) = \\max(0, x)`.

    solver: str, default='adam'
        Optimizer of the neural network.
        Must be one of the following:

        - 'lbfgs', L-BFGS algorithm (Liu and Nocedal 1989).
        - 'sgd', stochastic gradient descent (SGD).
        - 'adam', Adam (Kingma and Ba 2014).

    alpha: float, default=0.0001
        L2 penalty.

    batch_size: Union[int, str], default="auto"
        Batch size for SGD and Adam.
        If "auto", the smaller of 200 and the number of samples is used.
        If integer, must be positive.

    learning_rate_init: float, default=0.0001
        Initial learning rate for SGD and Adam.

    max_iter: int, default=200
        Maximum number of iterations for L-BFGS.
        Number of epochs for SGD and Adam.

    shuffle: bool, default=True
        Whether to shuffle samples in SGD and Adam.

    random_state: Optional[int], default=None
        Controls the random seed.

    tol: float, default=1e-4
        Tolerance for training.
        When the training loss is not improved by at least `tol`
        for `n_iter_no_change` consecutive iterations, training is stopped.

    momentum: float, default=0.9
        Momentum for SGD.
        Must be in the range of [0., 1.].

    nesterovs_momentum: bool, default=True
        Whether to use Nesterov momentum.

    early_stopping: bool, default=False
        Whether to use early stopping for SGD and Adam.
        If set to True, `validation_fraction` of training data is used as validation data,
        and training is stopped when the validation loss is not improved by at least `tol`
        for `n_iter_no_change` consecutive iterations.

    validation_fraction: float, default=0.1
        Fraction of validation data when early stopping is used.
        Must be in the range of (0., 1.].

    beta_1: float, default=0.9
        Coefficient used for computing the running average of the gradient for Adam.
        Must be in the range of [0., 1.].

    beta_2: float, default=0.999
        Coefficient used for computing the running average of the square of the gradient for Adam.
        Must be in the range of [0., 1.].

    epsilon: float, default=1e-8
        Term for numerical stability in Adam.

    n_iter_no_change: int, default=10
        Maximum number of non-improving epochs when early stopping is used.

    max_fun: int, default=15000
        Maximum number of function calls per step in L-BFGS.

    References
    ------------
    Dong C. Liu and Jorge Nocedal.
    "On the Limited Memory BFGS Method for Large Scale Optimization.", 1989.

    Diederik P. Kingma and Jimmy Ba.
"Adam: A Method for Stochastic Optimization.", 2014 """ dim_context: Optional[int] = None off_policy_objective: Optional[Callable[[mx.VarArg(Any)], torch.Tensor]] = None hidden_layer_size: Tuple[int, ...] = (100, ) activation: str = "relu" solver: str = "adam" alpha: float = 0.0001 batch_size: Union[int, str] = "auto" learning_rate_init: float = 0.0001 max_iter: int = 200 shuffle: bool = True random_state: Optional[int] = None tol: float = 1e-4 momentum: float = 0.9 nesterovs_momentum: bool = True early_stopping: bool = False validation_fraction: float = 0.1 beta_1: float = 0.9 beta_2: float = 0.999 epsilon: float = 1e-8 n_iter_no_change: int = 10 max_fun: int = 15000 def __post_init__(self) -> None: """Initialize class.""" super().__post_init__() if self.len_list != 1: raise NotImplementedError( "currently, len_list > 1 is not supported") if not isinstance(self.dim_context, int) or self.dim_context <= 0: raise ValueError( f"dim_context must be a positive integer, but {self.dim_context} is given" ) if not callable(self.off_policy_objective): raise ValueError( f"off_policy_objective must be callable, but {self.off_policy_objective} is given" ) if not isinstance(self.hidden_layer_size, tuple) or any( [not isinstance(h, int) or h <= 0 for h in self.hidden_layer_size]): raise ValueError( f"hidden_layer_size must be tuple of positive integers, but {self.hidden_layer_size} is given" ) if self.solver not in ("lbfgs", "sgd", "adam"): raise ValueError( f"solver must be one of 'adam', 'lbfgs', or 'sgd', but {self.solver} is given" ) if not isinstance(self.alpha, float) or self.alpha < 0.0: raise ValueError( f"alpha must be a nonnegative float, but {self.alpha} is given" ) if self.batch_size != "auto" and (not isinstance(self.batch_size, int) or self.batch_size <= 0): raise ValueError( f"batch_size must be a positive integer or 'auto', but {self.batch_size} is given" ) if (not isinstance(self.learning_rate_init, float) or self.learning_rate_init <= 0.0): raise ValueError( f"learning_rate_init must be a positive float, but {self.learning_rate_init} is given" ) if not isinstance(self.max_iter, int) or self.max_iter <= 0: raise ValueError( f"max_iter must be a positive integer, but {self.max_iter} is given" ) if not isinstance(self.shuffle, bool): raise ValueError( f"shuffle must be a bool, but {self.shuffle} is given") if not isinstance(self.tol, float) or self.tol <= 0.0: raise ValueError( f"tol must be a positive float, but {self.tol} is given") if not isinstance(self.momentum, float) or not 0.0 <= self.momentum <= 1.0: raise ValueError( f"momentum must be a float in [0., 1.], but {self.momentum} is given" ) if not isinstance(self.nesterovs_momentum, bool): raise ValueError( f"nestrovs_momentum must be a bool, but {self.nesterovs_momentum} is given" ) if not isinstance(self.early_stopping, bool): raise ValueError( f"early_stopping must be a bool, but {self.early_stopping} is given" ) if self.early_stopping and self.solver not in ("sgd", "adam"): raise ValueError( f"if early_stopping is True, solver must be one of 'sgd' or 'adam', but {self.solver} is given" ) if (not isinstance(self.validation_fraction, float) or not 0.0 < self.validation_fraction <= 1.0): raise ValueError( f"validation_fraction must be a float in (0., 1.], but {self.validation_fraction} is given" ) if not isinstance(self.beta_1, float) or not 0.0 <= self.beta_1 <= 1.0: raise ValueError( f"beta_1 must be a float in [0. 
        if not isinstance(self.beta_2, float) or not 0.0 <= self.beta_2 <= 1.0:
            raise ValueError(
                f"beta_2 must be a float in [0., 1.], but {self.beta_2} is given"
            )
        if not isinstance(self.epsilon, float) or self.epsilon < 0.0:
            raise ValueError(
                f"epsilon must be a nonnegative float, but {self.epsilon} is given"
            )
        if not isinstance(self.n_iter_no_change, int) or self.n_iter_no_change <= 0:
            raise ValueError(
                f"n_iter_no_change must be a positive integer, but {self.n_iter_no_change} is given"
            )
        if not isinstance(self.max_fun, int) or self.max_fun <= 0:
            raise ValueError(
                f"max_fun must be a positive integer, but {self.max_fun} is given"
            )
        if self.random_state is not None:
            if isinstance(self.random_state, int):
                torch.manual_seed(self.random_state)
            else:
                raise ValueError(
                    f"random_state must be None or an integer, but {self.random_state} is given"
                )

        if self.activation == "identity":
            activation_layer = nn.Identity
        elif self.activation == "logistic":
            activation_layer = nn.Sigmoid
        elif self.activation == "tanh":
            activation_layer = nn.Tanh
        elif self.activation == "relu":
            activation_layer = nn.ReLU
        else:
            raise ValueError(
                f"activation must be one of 'identity', 'logistic', 'tanh', or 'relu', but {self.activation} is given"
            )

        layer_list = []
        input_size = self.dim_context
        for i, h in enumerate(self.hidden_layer_size):
            layer_list.append(("l{}".format(i), nn.Linear(input_size, h)))
            layer_list.append(("a{}".format(i), activation_layer()))
            input_size = h
        layer_list.append(("output", nn.Linear(input_size, self.n_actions)))
        layer_list.append(("softmax", nn.Softmax(dim=1)))
        self.nn_model = nn.Sequential(OrderedDict(layer_list))

    def _create_train_data_for_opl(
        self,
        context: np.ndarray,
        action: np.ndarray,
        reward: np.ndarray,
        pscore: np.ndarray,
        estimated_rewards_by_reg_model: np.ndarray,
        position: np.ndarray,
        **kwargs,
    ) -> Tuple[torch.utils.data.DataLoader, Optional[torch.utils.data.DataLoader]]:
        """Create training data for off-policy learning.

        Parameters
        -----------
        context: array-like, shape (n_rounds, dim_context)
            Context vectors in each round, i.e., :math:`x_t`.

        action: array-like, shape (n_rounds,)
            Action sampled by a behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.

        reward: array-like, shape (n_rounds,)
            Observed rewards (or outcome) in each round, i.e., :math:`r_t`.

        pscore: array-like, shape (n_rounds,), default=None
            Propensity scores, the probability of selecting each action by the behavior policy,
            in the given logged bandit feedback.

        estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list), default=None
            Expected rewards for each round, action, and position estimated by a regression model,
            i.e., :math:`\\hat{q}(x_t,a_t)`.

        position: array-like, shape (n_rounds,), default=None
            Positions of each round in the given logged bandit feedback.
            If None is given, a learner assumes that there is only one position.

        Returns
        --------
        (training_data_loader, validation_data_loader): Tuple[DataLoader, Optional[DataLoader]]
            Training and validation data loaders in PyTorch.

        """
        if self.batch_size == "auto":
            batch_size_ = min(200, context.shape[0])
        elif isinstance(self.batch_size, int) and self.batch_size > 0:
            batch_size_ = self.batch_size
        else:
            raise ValueError("batch_size must be a positive integer or 'auto'")

        dataset = NNPolicyDataset(
            torch.from_numpy(context).float(),
            action,
            torch.from_numpy(reward).float(),
            torch.from_numpy(pscore).float(),
            torch.from_numpy(estimated_rewards_by_reg_model).float(),
            position,
        )

        if self.early_stopping:
            if context.shape[0] <= 1:
                raise ValueError(
                    f"the number of samples is too small ({context.shape[0]}) to create validation data"
                )

            validation_size = max(int(context.shape[0] * self.validation_fraction), 1)
            training_size = context.shape[0] - validation_size
            training_dataset, validation_dataset = torch.utils.data.random_split(
                dataset, [training_size, validation_size]
            )
            training_data_loader = torch.utils.data.DataLoader(
                training_dataset,
                batch_size=batch_size_,
                shuffle=self.shuffle,
            )
            validation_data_loader = torch.utils.data.DataLoader(
                validation_dataset,
                batch_size=batch_size_,
                shuffle=self.shuffle,
            )
            return training_data_loader, validation_data_loader

        data_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=batch_size_,
            shuffle=self.shuffle,
        )
        return data_loader, None

    def fit(
        self,
        context: np.ndarray,
        action: np.ndarray,
        reward: np.ndarray,
        pscore: Optional[np.ndarray] = None,
        estimated_rewards_by_reg_model: Optional[np.ndarray] = None,
        position: Optional[np.ndarray] = None,
    ) -> None:
        """Fits an offline bandit policy using the given logged bandit feedback data.

        Note
        ----------
        Given the training data :math:`\\mathcal{D}`, this policy maximizes the following objective function:

        .. math::

            \\hat{V}(\\pi_\\theta; \\mathcal{D}) - \\alpha \\Omega(\\theta)

        where :math:`\\hat{V}` is an OPE estimator and :math:`\\alpha \\Omega(\\theta)` is a regularization term.

        Parameters
        -----------
        context: array-like, shape (n_rounds, dim_context)
            Context vectors in each round, i.e., :math:`x_t`.

        action: array-like, shape (n_rounds,)
            Action sampled by a behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.

        reward: array-like, shape (n_rounds,)
            Observed rewards (or outcome) in each round, i.e., :math:`r_t`.

        pscore: array-like, shape (n_rounds,), default=None
            Action choice probabilities by a behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`.

        estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list), default=None
            Expected rewards for each round, action, and position estimated by a regression model,
            i.e., :math:`\\hat{q}(x_t,a_t)`.
            If None is given, a learner assumes that the estimated rewards are zero.

        position: array-like, shape (n_rounds,), default=None
            Positions of each round in the given logged bandit feedback.
            If None is given, a learner assumes that there is only one position.
            When `len_list` > 1, position has to be set.
            Currently, this feature is not supported.
""" check_bandit_feedback_inputs( context=context, action=action, reward=reward, pscore=pscore, position=position, ) if context.shape[1] != self.dim_context: raise ValueError( "the second dimension of context must be equal to dim_context") if pscore is None: pscore = np.ones_like(action) / self.n_actions if estimated_rewards_by_reg_model is None: estimated_rewards_by_reg_model = np.zeros( (context.shape[0], self.n_actions, self.len_list)) if self.len_list == 1: position = np.zeros_like(action, dtype=int) else: raise NotImplementedError( "currently, len_list > 1 is not supported") if self.solver == "lbfgs": optimizer = optim.LBFGS( self.nn_model.parameters(), lr=self.learning_rate_init, max_iter=self.max_iter, max_eval=self.max_fun, ) elif self.solver == "sgd": optimizer = optim.SGD( self.nn_model.parameters(), lr=self.learning_rate_init, momentum=self.momentum, weight_decay=self.alpha, nesterov=self.nesterovs_momentum, ) elif self.solver == "adam": optimizer = optim.Adam( self.nn_model.parameters(), lr=self.learning_rate_init, betas=(self.beta_1, self.beta_2), eps=self.epsilon, weight_decay=self.alpha, ) else: raise NotImplementedError( "solver must be one of 'adam', 'lbfgs', or 'sgd'") training_data_loader, validation_data_loader = self._create_train_data_for_opl( context, action, reward, pscore, estimated_rewards_by_reg_model, position) if self.solver == "lbfgs": for x, a, r, p, q_hat, pos in training_data_loader: def closure(): optimizer.zero_grad() action_dist = self.nn_model(x).unsqueeze(-1) loss = -1.0 * self.off_policy_objective( reward=r, action=a, pscore=p, action_dist=action_dist, estimated_rewards_by_reg_model=q_hat, position=pos, ) loss.backward() return loss optimizer.step(closure) if self.solver in ("sgd", "adam"): n_not_improving_training = 0 previous_training_loss = None n_not_improving_validation = 0 previous_validation_loss = None for _ in np.arange(self.max_iter): self.nn_model.train() for x, a, r, p, q_hat, pos in training_data_loader: optimizer.zero_grad() action_dist = self.nn_model(x).unsqueeze(-1) loss = -1.0 * self.off_policy_objective( reward=r, action=a, pscore=p, action_dist=action_dist, estimated_rewards_by_reg_model=q_hat, position=pos, ) loss.backward() optimizer.step() loss_value = loss.item() if previous_training_loss is not None: if loss_value - previous_training_loss < self.tol: n_not_improving_training += 1 else: n_not_improving_training = 0 if n_not_improving_training >= self.n_iter_no_change: break previous_training_loss = loss_value if self.early_stopping: self.nn_model.eval() for x, a, r, p, q_hat, pos in validation_data_loader: action_dist = self.nn_model(x).unsqueeze(-1) loss = -1.0 * self.off_policy_objective( reward=r, action=a, pscore=p, action_dist=action_dist, estimated_rewards_by_reg_model=q_hat, position=pos, ) loss_value = loss.item() if previous_validation_loss is not None: if loss_value - previous_validation_loss < self.tol: n_not_improving_validation += 1 else: n_not_improving_validation = 0 if n_not_improving_validation > self.n_iter_no_change: break previous_validation_loss = loss_value def predict(self, context: np.ndarray) -> np.ndarray: """Predict best actions for new data. Note -------- Action set predicted by this `predict` method can contain duplicate items. If you want a non-repetitive action set, then please use the `sample_action` method. Parameters ----------- context: array-like, shape (n_rounds_of_new_data, dim_context) Context vectors for new data. 

        Returns
        -----------
        action_dist: array-like, shape (n_rounds_of_new_data, n_actions, len_list)
            Action choices by a classifier, which can contain duplicate items.
            If you want a non-repetitive action set, please use the `sample_action` method.

        """
        if not isinstance(context, np.ndarray) or context.ndim != 2:
            raise ValueError("context must be 2-dimensional ndarray")
        if context.shape[1] != self.dim_context:
            raise ValueError(
                "the second dimension of context must be equal to dim_context"
            )

        self.nn_model.eval()
        x = torch.from_numpy(context).float()
        y = self.nn_model(x).detach().numpy()
        predicted_actions = np.argmax(y, axis=1)
        n_rounds = context.shape[0]
        action_dist = np.zeros((n_rounds, self.n_actions, 1))
        action_dist[np.arange(n_rounds), predicted_actions, 0] = 1
        return action_dist

    def sample_action(
        self,
        context: np.ndarray,
        random_state: Optional[int] = None,
    ) -> np.ndarray:
        """Sample (non-repetitive) actions based on action choice probabilities.

        Parameters
        ----------------
        context: array-like, shape (n_rounds_of_new_data, dim_context)
            Context vectors for new data.

        random_state: int, default=None
            Controls the random seed in sampling actions.

        Returns
        -----------
        action: array-like, shape (n_rounds_of_new_data, n_actions, len_list)
            Action sampled by a trained classifier.

        """
        if not isinstance(context, np.ndarray) or context.ndim != 2:
            raise ValueError("context must be 2-dimensional ndarray")
        if context.shape[1] != self.dim_context:
            raise ValueError(
                "the second dimension of context must be equal to dim_context"
            )

        n_rounds = context.shape[0]
        random_ = check_random_state(random_state)
        action = np.zeros((n_rounds, self.n_actions, self.len_list))
        score_predicted = self.predict_proba(context=context)
        for i in tqdm(np.arange(n_rounds), desc="[sample_action]", total=n_rounds):
            action_set = np.arange(self.n_actions)
            for position_ in np.arange(self.len_list):
                score_ = score_predicted[i, action_set, position_]
                action_sampled = random_.choice(action_set, p=score_, replace=False)
                action[i, action_sampled, position_] = 1
                action_set = np.delete(action_set, action_set == action_sampled)
        return action

    def predict_proba(
        self,
        context: np.ndarray,
    ) -> np.ndarray:
        """Obtains action choice probabilities for new data.

        Note
        --------
        This policy uses a multi-layer perceptron (MLP) with the softmax function as the last layer.
        This is a stochastic policy and is represented as follows:

        .. math::

            \\pi_\\theta (a \\mid x) = \\frac{\\exp(f_\\theta(x, a))}{\\sum_{a' \\in \\mathcal{A}} \\exp(f_\\theta(x, a'))}

        where :math:`f_\\theta(x, a)` is the MLP with parameter :math:`\\theta`.

        Parameters
        ----------------
        context: array-like, shape (n_rounds_of_new_data, dim_context)
            Context vectors for new data.

        Returns
        -----------
        choice_prob: array-like, shape (n_rounds_of_new_data, n_actions, len_list)
            Action choice probabilities obtained by a trained classifier.

        """
        if not isinstance(context, np.ndarray) or context.ndim != 2:
            raise ValueError("context must be 2-dimensional ndarray")
        if context.shape[1] != self.dim_context:
            raise ValueError(
                "the second dimension of context must be equal to dim_context"
            )

        self.nn_model.eval()
        x = torch.from_numpy(context).float()
        y = self.nn_model(x).detach().numpy()
        return y[:, :, np.newaxis]
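

if __name__ == "__main__":
    # Illustrative, self-contained demo on synthetic data (an addition, not part of
    # the original module).  The hand-rolled `ipw_objective` below only mimics the
    # keyword interface that `fit` uses when calling `off_policy_objective`; in real
    # use, an OPE estimator's `estimate_policy_value_tensor` method is the intended
    # argument, as noted in the class docstring of `NNPolicyLearner`.

    def ipw_objective(
        reward: torch.Tensor,
        action: torch.Tensor,
        pscore: torch.Tensor,
        action_dist: torch.Tensor,
        **kwargs: Any,
    ) -> torch.Tensor:
        """Inverse-probability-weighted policy value, differentiable w.r.t. action_dist."""
        # pi_theta(a_t | x_t) of the logged actions; position is fixed to 0 here.
        pi_of_logged_action = action_dist[torch.arange(action.shape[0]), action, 0]
        return (reward * pi_of_logged_action / pscore).mean()

    # Synthetic logged bandit feedback generated by a uniform behavior policy.
    n_rounds, n_actions, dim_context = 1000, 4, 5
    rng = np.random.default_rng(12345)
    context = rng.normal(size=(n_rounds, dim_context))
    action = rng.integers(n_actions, size=n_rounds)
    reward = rng.binomial(1, 0.5, size=n_rounds).astype(np.float64)
    pscore = np.full(n_rounds, 1.0 / n_actions)

    learner = NNPolicyLearner(
        n_actions=n_actions,
        dim_context=dim_context,
        off_policy_objective=ipw_objective,
        solver="adam",
        max_iter=20,
        random_state=12345,
    )
    learner.fit(context=context, action=action, reward=reward, pscore=pscore)

    action_dist = learner.predict(context=context)  # one-hot argmax choices
    sampled = learner.sample_action(context=context, random_state=0)
    choice_prob = learner.predict_proba(context=context)  # (n_rounds, n_actions, 1)
    print(action_dist.shape, sampled.shape, choice_prob.shape)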