def handle(self, tdp: TrainingDataPage) -> None:
    if not self.trainer.calc_cpe_in_training:
        return
    if isinstance(tdp, TrainingDataPage):
        if isinstance(self.trainer, DQNTrainer):
            # This is required until we get rid of TrainingDataPage
            if self.trainer.maxq_learning:
                edp = EvaluationDataPage.create_from_training_batch(
                    tdp.as_discrete_maxq_training_batch(), self.trainer
                )
            else:
                edp = EvaluationDataPage.create_from_training_batch(
                    tdp.as_discrete_sarsa_training_batch(), self.trainer
                )
        else:
            edp = EvaluationDataPage.create_from_tdp(tdp, self.trainer)
    elif isinstance(tdp, TrainingBatch):
        if isinstance(self.trainer, SACTrainer):
            # TODO: Implement CPE for continuous algos
            edp = None
        else:
            edp = EvaluationDataPage.create_from_training_batch(tdp, self.trainer)
    if self.evaluation_data is None:
        self.evaluation_data = edp
    else:
        self.evaluation_data = self.evaluation_data.append(edp)
def handle(self, tdp: TrainingBatch) -> None:
    if not self.trainer.calc_cpe_in_training:
        return
    # TODO: Perhaps we can make an RLTrainer param to check if continuous?
    if isinstance(self.trainer, SACTrainer):
        # TODO: Implement CPE for continuous algos
        edp = None
    else:
        # DQNTrainer and the other discrete trainers share the same code path;
        # max-Q and SARSA batches are handled identically here.
        edp = EvaluationDataPage.create_from_training_batch(tdp, self.trainer)
    if self.evaluation_data is None:
        self.evaluation_data = edp
    else:
        self.evaluation_data = self.evaluation_data.append(edp)
def evaluate_post_training(self, edp: EvaluationDataPage) -> CpeDetails:
    cpe_details = CpeDetails()

    self.score_cpe("Reward", edp, cpe_details.reward_estimates)

    if (
        self.metrics_to_score is not None
        and edp.logged_metrics is not None
        and self.action_names is not None
    ):
        for i, metric in enumerate(self.metrics_to_score):
            logger.info(
                "--------- Running CPE on metric: {} ---------".format(metric)
            )

            metric_reward_edp = edp.set_metric_as_reward(i, len(self.action_names))

            cpe_details.metric_estimates[metric] = CpeEstimateSet()
            self.score_cpe(
                metric, metric_reward_edp, cpe_details.metric_estimates[metric]
            )

    # Compute MC Loss on Aggregate Reward
    cpe_details.mc_loss = float(
        torch.mean(torch.abs(edp.logged_values - edp.model_values))
    )

    return cpe_details
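A minimal sketch of the MC loss computed above, assuming `logged_values` and `model_values` are plain float tensors of matching shape; the loss is simply the mean absolute difference between logged and model-predicted values. The numbers are illustrative only.

import torch

# Hypothetical toy tensors; real EvaluationDataPage columns are (N, 1) tensors.
logged_values = torch.tensor([[1.0], [2.0], [3.0]])
model_values = torch.tensor([[1.5], [1.5], [3.5]])

# Mean absolute error between logged and model values, as in evaluate_post_training.
mc_loss = float(torch.mean(torch.abs(logged_values - model_values)))
assert abs(mc_loss - 0.5) < 1e-6  # (0.5 + 0.5 + 0.5) / 3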
def create_edp(self, environment, samples, epsilon_model):
    """Generate an EvaluationDataPage such that the model policy is
    epsilon-greedy with parameter epsilon_model. The true values of this
    policy are used for the model_values* data.
    """
    tdp = environment.preprocess_samples(
        samples, len(samples.mdp_ids), do_shuffle=False
    )[0]

    # compute rewards, probs, values for all actions of each sampled state
    model_rewards = environment.true_rewards_all_actions_for_sample(samples.states)
    model_propensities = environment.policy_probabilities_for_sample(
        samples.states, epsilon_model
    )
    model_values = environment.true_epsilon_values_all_actions_for_sample(
        samples.states, epsilon_model
    )
    # compute rewards for logged action
    model_rewards_logged_action = environment.true_rewards_for_sample(
        samples.states, samples.actions
    )

    edp = EvaluationDataPage(
        mdp_id=np.array(samples.mdp_ids).reshape(-1, 1),
        sequence_number=torch.tensor(samples.sequence_numbers, dtype=torch.int),
        logged_propensities=tdp.propensities,
        logged_rewards=tdp.rewards,
        action_mask=tdp.actions,
        model_propensities=torch.tensor(model_propensities, dtype=torch.float32),
        model_rewards=torch.tensor(model_rewards, dtype=torch.float32),
        model_rewards_for_logged_action=torch.tensor(
            model_rewards_logged_action, dtype=torch.float32
        ),
        model_values=torch.tensor(model_values, dtype=torch.float32),
        model_values_for_logged_action=None,
        possible_actions_mask=tdp.possible_actions_mask,
    )
    return edp
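A hedged usage sketch of create_edp: `environment`, `samples`, and the 0.05 epsilon are assumptions for illustration; only create_edp itself and DoublyRobustEstimator (used the same way in the seq2slate test below) come from this code.

# Hypothetical call site: build an epsilon-greedy EvaluationDataPage and score it
# with the doubly robust estimator. All inputs here are placeholders.
edp = self.create_edp(environment, samples, epsilon_model=0.05)
direct_method, inverse_propensity, doubly_robust = DoublyRobustEstimator().estimate(edp)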
def handle(self, tdp: TrainingDataPage) -> None:
    if not self.trainer.calc_cpe_in_training:
        return
    if isinstance(tdp, TrainingDataPage):
        edp = EvaluationDataPage.create_from_tdp(tdp, self.trainer)
    elif isinstance(tdp, TrainingBatch):
        if isinstance(self.trainer, (_DQNTrainer, SACTrainer)):
            # TODO: Implement CPE for modular DQNTrainer & continuous algos
            edp = None
        else:
            edp = EvaluationDataPage.create_from_training_batch(tdp, self.trainer)
    if self.evaluation_data is None:
        self.evaluation_data = edp
    else:
        self.evaluation_data = self.evaluation_data.append(edp)
def evaluate(self, eval_tdp: PreprocessedTrainingBatch):
    seq2slate_net = self.trainer.seq2slate_net
    baseline_net = self.trainer.baseline_net

    seq2slate_net_prev_mode = seq2slate_net.training
    baseline_net_prev_mode = baseline_net.training
    seq2slate_net.eval()
    baseline_net.eval()

    log_prob = (
        seq2slate_net(
            eval_tdp.training_input, mode=Seq2SlateMode.PER_SEQ_LOG_PROB_MODE
        )
        .log_probs.detach()
        .flatten()
        .cpu()
        .numpy()
    )

    b = baseline_net(eval_tdp.training_input).squeeze().detach()
    advantage = (eval_tdp.training_input.slate_reward - b).flatten().cpu().numpy()
    self.baseline_loss.append(
        F.mse_loss(b, eval_tdp.training_input.slate_reward).item()
    )

    self.advantages.append(advantage)
    self.log_probs.append(log_prob)

    seq2slate_net.train(seq2slate_net_prev_mode)
    baseline_net.train(baseline_net_prev_mode)

    if not self.calc_cpe:
        return

    edp = EvaluationDataPage.create_from_training_batch(
        eval_tdp, self.trainer, self.reward_network
    )
    if self.eval_data_pages is None:
        self.eval_data_pages = edp
    else:
        self.eval_data_pages = self.eval_data_pages.append(edp)
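A toy numeric check of the baseline/advantage bookkeeping above, assuming the slate rewards and baseline outputs are simple 1-D tensors: the baseline prediction is subtracted from the slate reward to form the advantage, and the baseline itself is scored with MSE against that reward. All values are illustrative.

import torch
import torch.nn.functional as F

slate_reward = torch.tensor([4.0, 5.0, 7.0])
b = torch.tensor([5.0, 5.0, 5.0])  # hypothetical baseline predictions

advantage = (slate_reward - b).flatten().numpy()      # [-1., 0., 2.]
baseline_loss = F.mse_loss(b, slate_reward).item()    # (1 + 0 + 4) / 3
assert abs(baseline_loss - 5.0 / 3.0) < 1e-6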
def handle(self, tdp: PreprocessedTrainingBatch) -> None:
    if not self.trainer.calc_cpe_in_training:
        return
    # TODO: Perhaps we can make an RLTrainer param to check if continuous?
    if isinstance(self.trainer, (SACTrainer, TD3Trainer)):
        # TODO: Implement CPE for continuous algos
        edp = None
    else:
        edp = EvaluationDataPage.create_from_training_batch(tdp, self.trainer)
    if self.evaluation_data is None:
        self.evaluation_data = edp
    else:
        self.evaluation_data = self.evaluation_data.append(edp)
def evaluate_post_training(self, edp: EvaluationDataPage) -> CpeDetails:
    cpe_details = CpeDetails()

    cpe_details.reward_estimates = self.score_cpe("Reward", edp)

    if (
        self.metrics_to_score is not None
        and edp.logged_metrics is not None
        and self.action_names is not None
    ):
        for i, metric in enumerate(self.metrics_to_score):
            logger.info(
                "--------- Running CPE on metric: {} ---------".format(metric)
            )

            metric_reward_edp = edp.set_metric_as_reward(i, len(self.action_names))

            cpe_details.metric_estimates[metric] = self.score_cpe(
                metric, metric_reward_edp
            )

    if self.action_names is not None:
        if edp.optimal_q_values is not None:
            value_means = edp.optimal_q_values.mean(dim=0)
            cpe_details.q_value_means = {
                action: float(value_means[i])
                for i, action in enumerate(self.action_names)
            }
            value_stds = edp.optimal_q_values.std(dim=0)  # type: ignore
            cpe_details.q_value_stds = {
                action: float(value_stds[i])
                for i, action in enumerate(self.action_names)
            }
        if edp.eval_action_idxs is not None:
            cpe_details.action_distribution = {
                action: float((edp.eval_action_idxs == i).sum())  # type: ignore
                / edp.eval_action_idxs.shape[0]
                for i, action in enumerate(self.action_names)
            }

    # Compute MC Loss on Aggregate Reward
    cpe_details.mc_loss = float(
        torch.mean(torch.abs(edp.logged_values - edp.model_values))  # type: ignore
    )

    return cpe_details
def evaluate_post_training(self, edp: EvaluationDataPage) -> CpeDetails: cpe_details = CpeDetails() self.score_cpe("Reward", edp, cpe_details.reward_estimates) if self.metrics_to_score is not None: for i, metric in enumerate(self.metrics_to_score): logger.info( "--------- Running CPE on metric: {} ---------".format( metric)) metric_reward_edp = edp.set_metric_as_reward( i, len(self.action_names)) cpe_details.metric_estimates[metric] = CpeEstimateSet() self.score_cpe(metric, metric_reward_edp, cpe_details.metric_estimates[metric]) # Compute MC Loss on Aggregate Reward cpe_details.mc_loss = float( torch.mean(torch.abs(edp.logged_values - edp.model_values))) return cpe_details
def train_network(params):
    writer = None
    if params["model_output_path"] is not None:
        writer = SummaryWriter(log_dir=params["model_output_path"])

    logger.info("Running DQN workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"]
    )

    action_names = np.array(params["actions"])
    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])

    trainer_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    eval_dataset = JSONDataset(params["eval_data_path"], batch_size=16)
    state_normalization = read_norm_file(params["state_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    trainer = DQNTrainer(
        trainer_params,
        state_normalization,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    preprocessor = Preprocessor(state_normalization, False)

    evaluator = Evaluator(
        trainer_params.actions,
        trainer_params.rl.gamma,
        trainer,
        metrics_to_score=trainer.metrics_to_score,
    )

    start_time = time.time()
    for epoch in range(int(params["epochs"])):
        dataset.reset_iterator()
        for batch_idx in range(num_batches):
            report_training_status(
                batch_idx, num_batches, epoch, int(params["epochs"])
            )
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(preprocessor, batch, action_names)
            tdp.set_type(trainer.dtype)
            trainer.train(tdp)

        eval_dataset.reset_iterator()
        accumulated_edp = None
        while True:
            batch = eval_dataset.read_batch(batch_idx)
            if batch is None:
                break
            tdp = preprocess_batch_for_training(preprocessor, batch, action_names)
            edp = EvaluationDataPage.create_from_tdp(tdp, trainer)
            if accumulated_edp is None:
                accumulated_edp = edp
            else:
                accumulated_edp = accumulated_edp.append(edp)
        accumulated_edp = accumulated_edp.compute_values(trainer.gamma)

        cpe_start_time = time.time()
        details = evaluator.evaluate_post_training(accumulated_edp)
        details.log()
        logger.info(
            "CPE evaluation took {} seconds.".format(time.time() - cpe_start_time)
        )

    through_put = (len(dataset) * int(params["epochs"])) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(through_put))
    )

    if writer is not None:
        writer.close()

    return export_trainer_and_predictor(trainer, params["model_output_path"])
def train_network(params): logger.info("Running Parametric DQN workflow with params:") logger.info(params) # Set minibatch size based on # of devices being used to train params["training"]["minibatch_size"] *= minibatch_size_multiplier( params["use_gpu"], params["use_all_avail_gpus"]) rl_parameters = RLParameters(**params["rl"]) training_parameters = TrainingParameters(**params["training"]) rainbow_parameters = RainbowDQNParameters(**params["rainbow"]) if params["in_training_cpe"] is not None: in_training_cpe_parameters = InTrainingCPEParameters( **params["in_training_cpe"]) else: in_training_cpe_parameters = None trainer_params = ContinuousActionModelParameters( rl=rl_parameters, training=training_parameters, rainbow=rainbow_parameters, in_training_cpe=in_training_cpe_parameters, ) dataset = JSONDataset(params["training_data_path"], batch_size=training_parameters.minibatch_size) eval_dataset = JSONDataset(params["eval_data_path"], batch_size=training_parameters.minibatch_size) state_normalization = read_norm_file(params["state_norm_data_path"]) action_normalization = read_norm_file(params["action_norm_data_path"]) num_batches = int(len(dataset) / training_parameters.minibatch_size) logger.info("Read in batch data set {} of size {} examples. Data split " "into {} batches of size {}.".format( params["training_data_path"], len(dataset), num_batches, training_parameters.minibatch_size, )) trainer = ParametricDQNTrainer( trainer_params, state_normalization, action_normalization, use_gpu=params["use_gpu"], use_all_avail_gpus=params["use_all_avail_gpus"], ) trainer = update_model_for_warm_start(trainer) state_preprocessor = Preprocessor(state_normalization, False) action_preprocessor = Preprocessor(action_normalization, False) if trainer_params.in_training_cpe is not None: evaluator = Evaluator( None, trainer_params.rl.gamma, trainer, trainer_params.in_training_cpe.mdp_sampled_rate, metrics_to_score=trainer.metrics_to_score, ) else: evaluator = Evaluator( None, trainer_params.rl.gamma, trainer, float(DEFAULT_NUM_SAMPLES_FOR_CPE) / len(dataset), metrics_to_score=trainer.metrics_to_score, ) start_time = time.time() for epoch in range(params["epochs"]): dataset.reset_iterator() for batch_idx in range(num_batches): report_training_status(batch_idx, num_batches, epoch, params["epochs"]) batch = dataset.read_batch(batch_idx) tdp = preprocess_batch_for_training( state_preprocessor, batch, action_preprocessor=action_preprocessor) tdp.set_type(trainer.dtype) trainer.train(tdp) eval_dataset.reset_iterator() accumulated_edp = None for batch_idx in range(num_batches): batch = eval_dataset.read_batch(batch_idx) tdp = preprocess_batch_for_training( state_preprocessor, batch, action_preprocessor=action_preprocessor) edp = EvaluationDataPage.create_from_tdp(tdp, trainer) if accumulated_edp is None: accumulated_edp = edp else: accumulated_edp = accumulated_edp.append(edp) accumulated_edp = accumulated_edp.compute_values(trainer.gamma) cpe_start_time = time.time() details = evaluator.evaluate_post_training(accumulated_edp) details.log() logger.info("CPE evaluation took {} seconds.".format(time.time() - cpe_start_time)) through_put = (len(dataset) * params["epochs"]) / (time.time() - start_time) logger.info("Training finished. Processed ~{} examples / s.".format( round(through_put))) return export_trainer_and_predictor(trainer, params["model_output_path"])
def test_seq2slate_eval_data_page(self):
    """
    Create 3 slate ranking logs and evaluate using Direct Method, Inverse
    Propensity Scores, and Doubly Robust.

    The logs are as follows:
    state: [1, 0, 0], [0, 1, 0], [0, 0, 1]
    indices in logged slates: [3, 2], [3, 2], [3, 2]
    model output indices: [2, 3], [3, 2], [2, 3]
    logged reward: 4, 5, 7
    logged propensities: 0.2, 0.5, 0.4
    predicted rewards on logged slates: 2, 4, 6
    predicted rewards on model outputted slates: 1, 4, 5

    Direct Method uses the predicted rewards on model outputted slates.
    Thus the result is expected to be (1 + 4 + 5) / 3

    Inverse Propensity Scores would scale the reward by 1.0 / logged propensities
    whenever the model output slate matches with the logged slate.
    Since only the second log matches with the model output, the IPS result
    is expected to be 5 / 0.5 / 3

    Doubly Robust is the sum of the direct method result and the
    propensity-scaled reward difference; the latter is defined as:
    1.0 / logged_propensities * (logged reward - predicted reward on logged slate)
     * Indicator(model slate == logged slate)
    Since only the second logged slate matches with the model outputted slate,
    the DR result is expected to be (1 + 4 + 5) / 3 + 1.0 / 0.5 * (5 - 4) / 3
    """
    batch_size = 3
    state_dim = 3
    src_seq_len = 2
    tgt_seq_len = 2
    candidate_dim = 2

    reward_net = FakeSeq2SlateRewardNetwork()
    seq2slate_net = FakeSeq2SlateTransformerNet()
    baseline_net = nn.Linear(1, 1)
    trainer = Seq2SlateTrainer(
        seq2slate_net,
        baseline_net,
        parameters=None,
        minibatch_size=3,
        use_gpu=False,
    )

    src_seq = torch.eye(candidate_dim).repeat(batch_size, 1, 1)
    tgt_out_idx = torch.LongTensor([[3, 2], [3, 2], [3, 2]])
    tgt_out_seq = src_seq[
        torch.arange(batch_size).repeat_interleave(tgt_seq_len),  # type: ignore
        tgt_out_idx.flatten() - 2,
    ].reshape(batch_size, tgt_seq_len, candidate_dim)

    ptb = rlt.PreprocessedTrainingBatch(
        training_input=rlt.PreprocessedRankingInput(
            state=rlt.PreprocessedFeatureVector(float_features=torch.eye(state_dim)),
            src_seq=rlt.PreprocessedFeatureVector(float_features=src_seq),
            tgt_out_seq=rlt.PreprocessedFeatureVector(float_features=tgt_out_seq),
            src_src_mask=torch.ones(batch_size, src_seq_len, src_seq_len),
            tgt_out_idx=tgt_out_idx,
            tgt_out_probs=torch.tensor([0.2, 0.5, 0.4]),
            slate_reward=torch.tensor([4.0, 5.0, 7.0]),
        ),
        extras=rlt.ExtraData(
            sequence_number=torch.tensor([0, 0, 0]),
            mdp_id=np.array(["0", "1", "2"]),
        ),
    )
    edp = EvaluationDataPage.create_from_training_batch(ptb, trainer, reward_net)
    doubly_robust_estimator = DoublyRobustEstimator()
    direct_method, inverse_propensity, doubly_robust = doubly_robust_estimator.estimate(
        edp
    )
    logger.info(f"{direct_method}, {inverse_propensity}, {doubly_robust}")

    avg_logged_reward = (4 + 5 + 7) / 3
    self.assertAlmostEqual(direct_method.raw, (1 + 4 + 5) / 3, delta=1e-6)
    self.assertAlmostEqual(
        direct_method.normalized, direct_method.raw / avg_logged_reward, delta=1e-6
    )
    self.assertAlmostEqual(inverse_propensity.raw, 5 / 0.5 / 3, delta=1e-6)
    self.assertAlmostEqual(
        inverse_propensity.normalized,
        inverse_propensity.raw / avg_logged_reward,
        delta=1e-6,
    )
    self.assertAlmostEqual(
        doubly_robust.raw, direct_method.raw + 1 / 0.5 * (5 - 4) / 3, delta=1e-6
    )
    self.assertAlmostEqual(
        doubly_robust.normalized, doubly_robust.raw / avg_logged_reward, delta=1e-6
    )
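A self-contained arithmetic sketch of the three estimators described in the docstring above, using plain Python rather than the real DoublyRobustEstimator (which operates on an EvaluationDataPage); the list names are illustrative only. It reproduces the expected DM, IPS, and DR values from the test.

logged_rewards = [4.0, 5.0, 7.0]
logged_propensities = [0.2, 0.5, 0.4]
pred_reward_logged_slate = [2.0, 4.0, 6.0]
pred_reward_model_slate = [1.0, 4.0, 5.0]
model_matches_logged = [False, True, False]  # only the second slate matches

n = len(logged_rewards)
# Direct Method: average predicted reward on the model's slates.
direct_method = sum(pred_reward_model_slate) / n
# IPS: propensity-scaled logged reward, counted only where the slates match.
ips = sum(
    (r / p if m else 0.0)
    for r, p, m in zip(logged_rewards, logged_propensities, model_matches_logged)
) / n
# DR: Direct Method plus the propensity-scaled reward residual on matching slates.
doubly_robust = direct_method + sum(
    ((r - q) / p if m else 0.0)
    for r, q, p, m in zip(
        logged_rewards,
        pred_reward_logged_slate,
        logged_propensities,
        model_matches_logged,
    )
) / n

assert abs(direct_method - (1 + 4 + 5) / 3) < 1e-6
assert abs(ips - 5 / 0.5 / 3) < 1e-6
assert abs(doubly_robust - ((1 + 4 + 5) / 3 + 1 / 0.5 * (5 - 4) / 3)) < 1e-6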
def train_network(params):
    writer = None
    if params["model_output_path"] is not None:
        writer = SummaryWriter(log_dir=params["model_output_path"])

    logger.info("Running DQN workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"]
    )

    action_names = np.array(params["actions"])

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])

    trainer_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    eval_dataset = JSONDataset(params["eval_data_path"], batch_size=16)
    state_normalization = read_norm_file(params["state_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    trainer = DQNTrainer(
        trainer_params,
        state_normalization,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    preprocessor = Preprocessor(state_normalization, False)

    evaluator = Evaluator(
        trainer_params.actions,
        trainer_params.rl.gamma,
        trainer,
        metrics_to_score=trainer.metrics_to_score,
    )

    start_time = time.time()
    for epoch in range(int(params["epochs"])):
        dataset.reset_iterator()
        batch_idx = -1
        while True:
            batch_idx += 1
            report_training_status(
                batch_idx, num_batches, epoch, int(params["epochs"])
            )
            batch = dataset.read_batch()
            if batch is None:
                break
            tdp = preprocess_batch_for_training(preprocessor, batch, action_names)
            tdp.set_type(trainer.dtype)
            trainer.train(tdp)

        eval_dataset.reset_iterator()
        accumulated_edp = None
        while True:
            batch = eval_dataset.read_batch()
            if batch is None:
                break
            tdp = preprocess_batch_for_training(preprocessor, batch, action_names)
            tdp.set_type(trainer.dtype)
            edp = EvaluationDataPage.create_from_tdp(tdp, trainer)
            if accumulated_edp is None:
                accumulated_edp = edp
            else:
                accumulated_edp = accumulated_edp.append(edp)
        accumulated_edp = accumulated_edp.compute_values(trainer.gamma)

        cpe_start_time = time.time()
        details = evaluator.evaluate_post_training(accumulated_edp)
        details.log()
        logger.info(
            "CPE evaluation took {} seconds.".format(time.time() - cpe_start_time)
        )

    through_put = (len(dataset) * int(params["epochs"])) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(through_put))
    )

    if writer is not None:
        writer.close()

    return export_trainer_and_predictor(trainer, params["model_output_path"])
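A hedged sketch of the params dict this DQN workflow reads, keyed only by the fields actually accessed above; every value shown (paths, action names, hyperparameter sub-dicts) is a placeholder, not a recommended or required setting.

# Hypothetical params for train_network; keys mirror the accesses in the
# function above, values are placeholders.
example_params = {
    "model_output_path": "/tmp/dqn_model",        # or None to skip SummaryWriter
    "training_data_path": "/tmp/training_data.json",
    "eval_data_path": "/tmp/eval_data.json",
    "state_norm_data_path": "/tmp/state_norm.json",
    "actions": ["action_1", "action_2"],
    "epochs": 1,
    "use_gpu": False,
    "use_all_avail_gpus": False,
    "rl": {},                               # kwargs for RLParameters
    "training": {"minibatch_size": 1024},   # kwargs for TrainingParameters
    "rainbow": {},                          # kwargs for RainbowDQNParameters
}

# train_network(example_params)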
def evaluate(self, eval_tdp: PreprocessedTrainingBatch):
    seq2slate_net = self.trainer.seq2slate_net
    baseline_net = self.trainer.baseline_net

    seq2slate_net_prev_mode = seq2slate_net.training
    baseline_net_prev_mode = baseline_net.training
    seq2slate_net.eval()
    baseline_net.eval()

    logged_slate_log_prob = (
        seq2slate_net(
            eval_tdp.training_input, mode=Seq2SlateMode.PER_SEQ_LOG_PROB_MODE
        )
        .log_probs.detach()
        .flatten()
        .cpu()
        .numpy()
    )

    b = baseline_net(eval_tdp.training_input).squeeze().detach()
    advantage = (eval_tdp.training_input.slate_reward - b).flatten().cpu().numpy()
    self.baseline_loss.append(
        F.mse_loss(b, eval_tdp.training_input.slate_reward).item()
    )

    self.advantages.append(advantage)
    self.logged_slate_log_probs.append(logged_slate_log_prob)

    ranked_slate_output = seq2slate_net(
        eval_tdp.training_input, Seq2SlateMode.RANK_MODE, greedy=True
    )
    ranked_slate_prob = (
        torch.prod(
            torch.gather(
                ranked_slate_output.ranked_tgt_out_probs,
                2,
                ranked_slate_output.ranked_tgt_out_idx.unsqueeze(-1),
            ).squeeze(),
            -1,
        )
        .cpu()
        .numpy()
    )
    self.ranked_slate_probs.append(ranked_slate_prob)

    seq2slate_net.train(seq2slate_net_prev_mode)
    baseline_net.train(baseline_net_prev_mode)

    if not self.calc_cpe:
        return

    edp_g = EvaluationDataPage.create_from_tensors_seq2slate(
        seq2slate_net,
        self.reward_network,
        eval_tdp.training_input,
        eval_greedy=True,
    )
    if self.eval_data_pages_g is None:
        self.eval_data_pages_g = edp_g
    else:
        self.eval_data_pages_g = self.eval_data_pages_g.append(edp_g)

    edp_ng = EvaluationDataPage.create_from_tensors_seq2slate(
        seq2slate_net,
        self.reward_network,
        eval_tdp.training_input,
        eval_greedy=False,
    )
    if self.eval_data_pages_ng is None:
        self.eval_data_pages_ng = edp_ng
    else:
        self.eval_data_pages_ng = self.eval_data_pages_ng.append(edp_ng)
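A small numeric sketch of the gather-then-prod step that produces ranked_slate_prob above, assuming per-position probabilities of shape (batch, seq_len, num_candidates) and chosen indices of shape (batch, seq_len): the whole-slate probability is the product over positions of the probability of each chosen candidate. The tensors here are toy values.

import torch

# probs[b, t, c] = probability of candidate c at slate position t for example b.
probs = torch.tensor(
    [
        [[0.1, 0.9], [0.6, 0.4]],  # slate 1 picks prob 0.9, then 0.6
        [[0.5, 0.5], [0.2, 0.8]],  # slate 2 picks prob 0.5, then 0.8
    ]
)
idx = torch.tensor([[1, 0], [0, 1]])  # chosen candidate index at each position

# Gather the chosen per-position probabilities, then multiply across positions.
slate_prob = torch.prod(torch.gather(probs, 2, idx.unsqueeze(-1)).squeeze(-1), -1)
assert torch.allclose(slate_prob, torch.tensor([0.9 * 0.6, 0.5 * 0.8]))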