def __init__(
    self,
    q_network,
    q_network_target,
    reward_network,
    parameters: DiscreteActionModelParameters,
    use_gpu=False,
    q_network_cpe=None,
    q_network_cpe_target=None,
    metrics_to_score=None,
    imitator=None,
    env=None,
) -> None:
    self.env = env
    DQNTrainer.__init__(
        self,
        q_network,
        q_network_target,
        reward_network,
        parameters,
        use_gpu,
        q_network_cpe,
        q_network_cpe_target,
        metrics_to_score,
        imitator,
    )
def test_trainer_maxq(self):
    env = Env(self.state_dims, self.action_dims)
    env.seed(42)
    maxq_parameters = DiscreteActionModelParameters(
        actions=env.actions,
        rl=RLParameters(
            gamma=0.99,
            target_update_rate=0.9,
            reward_burnin=100,
            maxq_learning=True,
        ),
        rainbow=RainbowDQNParameters(
            double_q_learning=True, dueling_architecture=False
        ),
        training=TrainingParameters(
            layers=self.layers,
            activations=self.activations,
            minibatch_size=self.minibatch_size,
            learning_rate=0.25,
            optimizer="ADAM",
        ),
    )
    maxq_trainer = DQNTrainer(maxq_parameters, env.normalization)

    logger.info("Generating constant_reward MDPs..")
    (
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_actions,
        possible_next_actions,
    ) = env.generate_samples_discrete(self.num_samples)

    logger.info("Preprocessing constant_reward MDPs..")
    for epoch in range(self.epochs):
        tdps = env.preprocess_samples_discrete(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_actions,
            possible_next_actions,
            self.minibatch_size,
        )
        logger.info("Training.. " + str(epoch))
        for tdp in tdps:
            maxq_trainer.train(tdp)
        logger.info(
            " ".join(
                [
                    "Training epoch",
                    str(epoch),
                    "average q values",
                    str(torch.mean(maxq_trainer.all_action_scores)),
                ]
            )
        )

    # Q value should converge to very close to 100
    avg_q_value_after_training = torch.mean(maxq_trainer.all_action_scores)
    self.assertLess(avg_q_value_after_training, 101)
    self.assertGreater(avg_q_value_after_training, 99)
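# Sketch only: where the "close to 100" target in the assertions above comes from,
# assuming the constant_reward MDPs emit a reward of 1.0 on every step (an assumption
# about generate_samples_discrete, not something stated in the test itself).
gamma = 0.99
constant_reward = 1.0
expected_q = constant_reward / (1.0 - gamma)  # geometric series 1 + gamma + gamma^2 + ...
print(expected_q)  # 100.0, bracketed by assertGreater(99) / assertLess(101)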
def get_sarsa_trainer_reward_boost(
    self, environment, reward_shape, dueling, use_gpu=False, use_all_avail_gpus=False
):
    rl_parameters = RLParameters(
        gamma=DISCOUNT,
        target_update_rate=1.0,
        reward_burnin=10,
        maxq_learning=False,
        reward_boost=reward_shape,
    )
    training_parameters = TrainingParameters(
        layers=[-1, 128, -1] if dueling else [-1, -1],
        activations=["relu", "linear"] if dueling else ["linear"],
        minibatch_size=self.minibatch_size,
        learning_rate=0.05,
        optimizer="ADAM",
    )
    return DQNTrainer(
        DiscreteActionModelParameters(
            actions=environment.ACTIONS,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=RainbowDQNParameters(
                double_q_learning=True, dueling_architecture=dueling
            ),
            in_training_cpe=InTrainingCPEParameters(mdp_sampled_rate=0.1),
        ),
        environment.normalization,
        use_gpu=use_gpu,
        use_all_avail_gpus=use_all_avail_gpus,
    )
def __init__(
    self,
    model_params: DiscreteActionModelParameters,
    preprocess_handler: PreprocessHandler,
    state_normalization: Dict[int, NormalizationParameters],
    use_gpu: bool,
    use_all_avail_gpus: bool,
):
    logger.info("Running DQN workflow with params:")
    logger.info(model_params)

    trainer = DQNTrainer(
        model_params,
        state_normalization,
        use_gpu=use_gpu,
        use_all_avail_gpus=use_all_avail_gpus,
    )
    trainer = update_model_for_warm_start(trainer)
    assert type(trainer) == DQNTrainer, "Warm started wrong model type: " + str(
        type(trainer)
    )

    evaluator = Evaluator(
        model_params.actions,
        model_params.rl.gamma,
        trainer,
        metrics_to_score=trainer.metrics_to_score,
    )

    super(DqnWorkflow, self).__init__(
        preprocess_handler, trainer, evaluator, model_params.training.minibatch_size
    )
def train_network(params):
    logger.info("Running DQN workflow with params:")
    logger.info(params)

    action_names = np.array(params["actions"])
    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])

    trainer_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    norm_data = JSONDataset(params["state_norm_data_path"])
    state_normalization = read_norm_params(norm_data.read_all())

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    trainer = DQNTrainer(trainer_params, state_normalization, params["use_gpu"])

    for epoch in range(params["epochs"]):
        for batch_idx in range(num_batches):
            helpers.report_training_status(
                batch_idx, num_batches, epoch, params["epochs"]
            )
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(action_names, batch, state_normalization)
            trainer.train(tdp)

    logger.info(
        "Training finished. Saving PyTorch model to {}".format(
            params["pytorch_output_path"]
        )
    )
    helpers.save_model_to_file(trainer, params["pytorch_output_path"])
def get_modular_sarsa_trainer_reward_boost(
    self,
    environment,
    reward_shape,
    dueling,
    use_gpu=False,
    use_all_avail_gpus=False,
    clip_grad_norm=None,
):
    parameters = self.get_sarsa_parameters(
        environment, reward_shape, dueling, clip_grad_norm
    )
    q_network = FullyConnectedDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=len(environment.ACTIONS),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )
    reward_network = FullyConnectedDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=len(environment.ACTIONS),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )
    q_network_cpe = FullyConnectedDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=len(environment.ACTIONS),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )

    if use_gpu:
        q_network = q_network.cuda()
        reward_network = reward_network.cuda()
        q_network_cpe = q_network_cpe.cuda()
        if use_all_avail_gpus:
            q_network = q_network.get_distributed_data_parallel_model()
            reward_network = reward_network.get_distributed_data_parallel_model()
            q_network_cpe = q_network_cpe.get_distributed_data_parallel_model()

    q_network_target = q_network.get_target_network()
    q_network_cpe_target = q_network_cpe.get_target_network()
    trainer = DQNTrainer(
        q_network,
        q_network_target,
        reward_network,
        parameters,
        use_gpu,
        q_network_cpe=q_network_cpe,
        q_network_cpe_target=q_network_cpe_target,
    )
    return trainer
def build_trainer(self) -> DQNTrainer:
    net_builder = self.net_builder.value
    q_network = net_builder.build_q_network(
        self.state_feature_config,
        self.state_normalization_parameters,
        len(self.action_names),
    )

    if self.use_gpu:
        q_network = q_network.cuda()

    q_network_target = q_network.get_target_network()

    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    if self.trainer_param.evaluation.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(self.metrics_to_score) + 1) * len(
            self.trainer_param.actions
        )

        cpe_net_builder = self.cpe_net_builder.value
        reward_network = cpe_net_builder.build_q_network(
            self.state_feature_config,
            self.state_normalization_parameters,
            num_output_nodes,
        )
        q_network_cpe = cpe_net_builder.build_q_network(
            self.state_feature_config,
            self.state_normalization_parameters,
            num_output_nodes,
        )

        if self.use_gpu:
            reward_network.cuda()
            q_network_cpe.cuda()

        q_network_cpe_target = q_network_cpe.get_target_network()

    self._q_network = q_network
    trainer = DQNTrainer(
        q_network,
        q_network_target,
        reward_network,
        self.trainer_param,
        self.use_gpu,
        q_network_cpe=q_network_cpe,
        q_network_cpe_target=q_network_cpe_target,
        metrics_to_score=self.metrics_to_score,
        loss_reporter=NoOpLossReporter(),
    )
    return trainer
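# Illustration only (not from the original file): the width of the CPE reward and
# Q networks above is one output per action for the reward plus one per action for
# each extra metric being scored; the names and counts below are hypothetical.
metrics_to_score = ["metric_a", "metric_b"]   # 2 auxiliary metrics
actions = ["up", "down", "left", "right"]     # 4 discrete actions
num_output_nodes = (len(metrics_to_score) + 1) * len(actions)
print(num_output_nodes)  # 12: columns 0-3 score reward, 4-7 metric_a, 8-11 metric_b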
def internal_prediction(self, input):
    """ Only used by Gym """
    unmasked_q_values = DQNTrainer.internal_prediction(self, input)
    masked_q_values = np.zeros(unmasked_q_values.shape)
    # Map Q-values of invalid actions to -infinity instead of zero
    # (supports negative Q-values).
    if self.action_mask is not None:
        for i in range(len(self.action_mask)):
            if self.action_mask[i]:
                masked_q_values[0][i] = unmasked_q_values[0][i]
            else:
                masked_q_values[0][i] = -np.inf
    else:
        masked_q_values = unmasked_q_values
    return masked_q_values
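# Sketch only (not part of the original class): why invalid actions are masked with
# -inf rather than 0. With a zero mask, an invalid action can out-rank every valid
# action whenever all valid Q-values are negative.
import numpy as np

q = np.array([[-2.0, -0.5, -3.0]])      # arbitrary Q-values, all negative
mask = np.array([1, 1, 0])              # third action is invalid

masked_zero = np.where(mask, q, 0.0)
masked_inf = np.where(mask, q, -np.inf)
print(masked_zero.argmax())  # 2 -- the invalid action wins, which is wrong
print(masked_inf.argmax())   # 1 -- the best valid action wins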
def get_sarsa_trainer_reward_boost(
    self,
    environment,
    reward_shape,
    dueling,
    use_gpu=False,
    use_all_avail_gpus=False,
    clip_grad_norm=None,
):
    parameters = self.get_sarsa_parameters(
        environment, reward_shape, dueling, clip_grad_norm
    )
    trainer = DQNTrainer(
        parameters,
        environment.normalization,
        use_gpu=use_gpu,
        use_all_avail_gpus=use_all_avail_gpus,
    )
    return trainer
def get_sarsa_trainer_reward_boost(self, environment, reward_shape):
    rl_parameters = RLParameters(
        gamma=DISCOUNT,
        target_update_rate=1.0,
        reward_burnin=10,
        maxq_learning=False,
        reward_boost=reward_shape,
    )
    training_parameters = TrainingParameters(
        layers=[-1, -1],
        activations=["linear"],
        minibatch_size=self.minibatch_size,
        learning_rate=0.25,
        optimizer="ADAM",
    )
    return DQNTrainer(
        DiscreteActionModelParameters(
            actions=environment.ACTIONS,
            rl=rl_parameters,
            training=training_parameters,
        ),
        environment.normalization,
    )
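# Hypothetical value for the reward_shape argument used by the helpers above. The
# assumed format is a mapping from logged action name to an additive bonus that
# RLParameters.reward_boost applies to that action's reward; the names and amounts
# here are made up.
example_reward_boost = {"L": 0.1, "R": 0.5}
# trainer = self.get_sarsa_trainer_reward_boost(environment, example_reward_boost)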
def train_network(params):
    writer = None
    if params["model_output_path"] is not None:
        writer = SummaryWriter(log_dir=params["model_output_path"])

    logger.info("Running DQN workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"]
    )

    action_names = np.array(params["actions"])
    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])

    trainer_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    eval_dataset = JSONDataset(params["eval_data_path"], batch_size=16)
    state_normalization = read_norm_file(params["state_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    trainer = DQNTrainer(
        trainer_params,
        state_normalization,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    preprocessor = Preprocessor(state_normalization, False)

    evaluator = Evaluator(
        trainer_params.actions,
        trainer_params.rl.gamma,
        trainer,
        metrics_to_score=trainer.metrics_to_score,
    )

    start_time = time.time()
    for epoch in range(int(params["epochs"])):
        dataset.reset_iterator()
        batch_idx = -1
        while True:
            batch_idx += 1
            report_training_status(batch_idx, num_batches, epoch, int(params["epochs"]))
            batch = dataset.read_batch()
            if batch is None:
                break
            tdp = preprocess_batch_for_training(preprocessor, batch, action_names)
            tdp.set_type(trainer.dtype)
            trainer.train(tdp)

        eval_dataset.reset_iterator()
        accumulated_edp = None
        while True:
            batch = eval_dataset.read_batch()
            if batch is None:
                break
            tdp = preprocess_batch_for_training(preprocessor, batch, action_names)
            tdp.set_type(trainer.dtype)
            edp = EvaluationDataPage.create_from_tdp(tdp, trainer)
            if accumulated_edp is None:
                accumulated_edp = edp
            else:
                accumulated_edp = accumulated_edp.append(edp)
        accumulated_edp = accumulated_edp.compute_values(trainer.gamma)

        cpe_start_time = time.time()
        details = evaluator.evaluate_post_training(accumulated_edp)
        details.log()
        logger.info(
            "CPE evaluation took {} seconds.".format(time.time() - cpe_start_time)
        )

    through_put = (len(dataset) * int(params["epochs"])) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(through_put))
    )

    if writer is not None:
        writer.close()

    return export_trainer_and_predictor(trainer, params["model_output_path"])
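# Illustration only: the shape of the flat `params` dict train_network expects,
# using just the keys read above. The nested values are placeholders, not defaults
# taken from the library, and the data paths are left unspecified.
example_params = {
    "actions": ["action_A", "action_B"],
    "rl": {"gamma": 0.99, "target_update_rate": 0.2, "maxq_learning": True},
    "training": {
        "layers": [-1, 128, 64, -1],
        "activations": ["relu", "relu", "linear"],
        "minibatch_size": 1024,
        "learning_rate": 0.001,
        "optimizer": "ADAM",
    },
    "rainbow": {"double_q_learning": True, "dueling_architecture": False},
    "training_data_path": "...",    # point at the training JSON dataset
    "eval_data_path": "...",        # point at the evaluation JSON dataset
    "state_norm_data_path": "...",  # point at the state normalization file
    "model_output_path": None,      # a directory enables TensorBoard output and export
    "use_gpu": False,
    "use_all_avail_gpus": False,
    "epochs": 1,
}
# trainer_and_predictor = train_network(example_params)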
def create_trainer(model_type, params, rl_parameters, use_gpu, env):
    c2_device = core.DeviceOption(caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU)

    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels
            )
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=rainbow_parameters,
        )
        trainer = DQNTrainer(trainer_params, env.normalization, use_gpu)
    elif model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_parameters = params["training"]
            if isinstance(training_parameters, dict):
                training_parameters = TrainingParameters(**training_parameters)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
                training_parameters.cnn_parameters.input_height = env.height
                training_parameters.cnn_parameters.input_width = env.width
                training_parameters.cnn_parameters.num_input_channels = (
                    env.num_input_channels
                )
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions, rl=rl_parameters, training=training_parameters
            )
            trainer = DiscreteActionTrainer(trainer_params, env.normalization)
    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters,
            training=training_parameters,
            knn=KnnParameters(model_type="DQN"),
            rainbow=rainbow_parameters,
        )
        trainer = ParametricDQNTrainer(
            trainer_params, env.normalization, env.normalization_action, use_gpu
        )
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_parameters = params["training"]
            if isinstance(training_parameters, dict):
                training_parameters = TrainingParameters(**training_parameters)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = ContinuousActionModelParameters(
                rl=rl_parameters,
                training=training_parameters,
                knn=KnnParameters(model_type="DQN"),
            )
            trainer = ContinuousActionDQNTrainer(
                trainer_params, env.normalization, env.normalization_action
            )
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_parameters = params["shared_training"]
        if isinstance(training_parameters, dict):
            training_parameters = DDPGTrainingParameters(**training_parameters)
        actor_parameters = params["actor_training"]
        if isinstance(actor_parameters, dict):
            actor_parameters = DDPGNetworkParameters(**actor_parameters)
        critic_parameters = params["critic_training"]
        if isinstance(critic_parameters, dict):
            critic_parameters = DDPGNetworkParameters(**critic_parameters)
        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=training_parameters,
            actor_training=actor_parameters,
            critic_training=critic_parameters,
        )
        action_range_low = env.action_space.low.astype(np.float32)
        action_range_high = env.action_space.high.astype(np.float32)
        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            torch.from_numpy(action_range_low).unsqueeze(dim=0),
            torch.from_numpy(action_range_high).unsqueeze(dim=0),
            use_gpu,
        )
    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return trainer
def create_dqn_trainer_from_params(
    model: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
    metrics_to_score=None,
):
    metrics_to_score = metrics_to_score or []

    if model.rainbow.quantile:
        q_network = QuantileDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
    elif model.rainbow.categorical:
        q_network = CategoricalDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            qmin=model.rainbow.qmin,
            qmax=model.rainbow.qmax,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
            use_gpu=use_gpu,
        )
    elif model.rainbow.dueling_architecture:
        q_network = DuelingQNetwork(  # type: ignore
            layers=[get_num_output_features(normalization_parameters)]
            + model.training.layers[1:-1]
            + [len(model.actions)],
            activations=model.training.activations,
        )
    else:
        q_network = FullyConnectedDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()

    q_network_target = q_network.get_target_network()

    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    if model.evaluation.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(metrics_to_score) + 1) * len(model.actions)
        reward_network = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
        q_network_cpe = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

        if use_gpu and torch.cuda.is_available():
            reward_network.cuda()
            q_network_cpe.cuda()

        q_network_cpe_target = q_network_cpe.get_target_network()

    if (
        use_all_avail_gpus
        and not model.rainbow.categorical
        and not model.rainbow.quantile
    ):
        q_network = q_network.get_distributed_data_parallel_model()
        reward_network = (
            reward_network.get_distributed_data_parallel_model()
            if reward_network
            else None
        )
        q_network_cpe = (
            q_network_cpe.get_distributed_data_parallel_model()
            if q_network_cpe
            else None
        )

    if model.rainbow.quantile:
        assert (
            not use_all_avail_gpus
        ), "use_all_avail_gpus not implemented for distributional RL"
        return QRDQNTrainer(
            q_network, q_network_target, model, use_gpu, metrics_to_score=metrics_to_score
        )
    elif model.rainbow.categorical:
        assert (
            not use_all_avail_gpus
        ), "use_all_avail_gpus not implemented for distributional RL"
        return C51Trainer(
            q_network, q_network_target, model, use_gpu, metrics_to_score=metrics_to_score
        )
    else:
        return DQNTrainer(
            q_network,
            q_network_target,
            reward_network,
            model,
            use_gpu,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
            metrics_to_score=metrics_to_score,
        )
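# Illustration only: a condensed restatement of the dispatch performed by
# create_dqn_trainer_from_params above; it adds no behaviour of its own.
def trainer_class_for(rainbow):
    if rainbow.quantile:
        return QRDQNTrainer   # built on a QuantileDQN network
    if rainbow.categorical:
        return C51Trainer     # built on a CategoricalDQN network
    return DQNTrainer         # DuelingQNetwork or FullyConnectedDQN network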
def create_from_tensors_dqn(
    cls,
    trainer: DQNTrainer,
    mdp_ids: np.ndarray,
    sequence_numbers: torch.Tensor,
    states: rlt.PreprocessedFeatureVector,
    actions: rlt.PreprocessedFeatureVector,
    propensities: torch.Tensor,
    rewards: torch.Tensor,
    possible_actions_mask: torch.Tensor,
    metrics: Optional[torch.Tensor] = None,
):
    old_q_train_state = trainer.q_network.training
    old_reward_train_state = trainer.reward_network.training
    old_q_cpe_train_state = trainer.q_network_cpe.training
    trainer.q_network.train(False)
    trainer.reward_network.train(False)
    trainer.q_network_cpe.train(False)

    num_actions = trainer.num_actions
    action_mask = actions.float()  # type: ignore

    rewards = trainer.boost_rewards(rewards, actions)  # type: ignore
    model_values = trainer.q_network_cpe(
        rlt.PreprocessedState(state=states)
    ).q_values[:, 0:num_actions]
    optimal_q_values, _ = trainer.get_detached_q_values(states)  # type: ignore
    eval_action_idxs = trainer.get_max_q_values(  # type: ignore
        optimal_q_values, possible_actions_mask
    )[1]
    model_propensities = masked_softmax(
        optimal_q_values, possible_actions_mask, trainer.rl_temperature
    )
    assert model_values.shape == actions.shape, (  # type: ignore
        "Invalid shape: " + str(model_values.shape) + " != " + str(actions.shape)
    )
    assert model_values.shape == possible_actions_mask.shape, (  # type: ignore
        "Invalid shape: "
        + str(model_values.shape)
        + " != "
        + str(possible_actions_mask.shape)
    )
    model_values_for_logged_action = torch.sum(
        model_values * action_mask, dim=1, keepdim=True
    )

    rewards_and_metric_rewards = trainer.reward_network(
        rlt.PreprocessedState(state=states)
    )
    # In case we reuse the modular for Q-network
    if hasattr(rewards_and_metric_rewards, "q_values"):
        rewards_and_metric_rewards = rewards_and_metric_rewards.q_values

    model_rewards = rewards_and_metric_rewards[:, 0:num_actions]
    assert model_rewards.shape == actions.shape, (  # type: ignore
        "Invalid shape: " + str(model_rewards.shape) + " != " + str(actions.shape)
    )
    model_rewards_for_logged_action = torch.sum(
        model_rewards * action_mask, dim=1, keepdim=True
    )

    model_metrics = rewards_and_metric_rewards[:, num_actions:]
    assert model_metrics.shape[1] % num_actions == 0, (
        "Invalid metrics shape: " + str(model_metrics.shape) + " " + str(num_actions)
    )
    num_metrics = model_metrics.shape[1] // num_actions

    if num_metrics == 0:
        model_metrics_values = None
        model_metrics_for_logged_action = None
        model_metrics_values_for_logged_action = None
    else:
        model_metrics_values = trainer.q_network_cpe(
            rlt.PreprocessedState(state=states)
        )
        # Backward compatibility
        if hasattr(model_metrics_values, "q_values"):
            model_metrics_values = model_metrics_values.q_values
        model_metrics_values = model_metrics_values[:, num_actions:]
        assert model_metrics_values.shape[1] == num_actions * num_metrics, (  # type: ignore
            "Invalid shape: "
            + str(model_metrics_values.shape[1])
            + " != "
            + str(actions.shape[1] * num_metrics)  # type: ignore
        )

        model_metrics_for_logged_action_list = []
        model_metrics_values_for_logged_action_list = []
        for metric_index in range(num_metrics):
            metric_start = metric_index * num_actions
            metric_end = (metric_index + 1) * num_actions
            model_metrics_for_logged_action_list.append(
                torch.sum(
                    model_metrics[:, metric_start:metric_end] * action_mask,
                    dim=1,
                    keepdim=True,
                )
            )
            model_metrics_values_for_logged_action_list.append(
                torch.sum(
                    model_metrics_values[:, metric_start:metric_end] * action_mask,
                    dim=1,
                    keepdim=True,
                )
            )
        model_metrics_for_logged_action = torch.cat(
            model_metrics_for_logged_action_list, dim=1
        )
        model_metrics_values_for_logged_action = torch.cat(
            model_metrics_values_for_logged_action_list, dim=1
        )

    trainer.q_network_cpe.train(old_q_cpe_train_state)  # type: ignore
    trainer.q_network.train(old_q_train_state)  # type: ignore
    trainer.reward_network.train(old_reward_train_state)  # type: ignore

    return cls(
        mdp_id=mdp_ids,
        sequence_number=sequence_numbers,
        logged_propensities=propensities,
        logged_rewards=rewards,
        action_mask=action_mask,
        model_rewards=model_rewards,
        model_rewards_for_logged_action=model_rewards_for_logged_action,
        model_values=model_values,
        model_values_for_logged_action=model_values_for_logged_action,
        model_metrics_values=model_metrics_values,
        model_metrics_values_for_logged_action=model_metrics_values_for_logged_action,
        model_propensities=model_propensities,
        logged_metrics=metrics,
        model_metrics=model_metrics,
        model_metrics_for_logged_action=model_metrics_for_logged_action,
        # Will compute later
        logged_values=None,
        logged_metrics_values=None,
        possible_actions_mask=possible_actions_mask,
        optimal_q_values=optimal_q_values,
        eval_action_idxs=eval_action_idxs,
    )
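# Sketch only (values and shapes made up): the column layout assumed by the slicing
# above. The reward/CPE network emits num_actions reward columns followed by
# num_actions columns per metric, and the logged-action value of each block is
# recovered by masking with action_mask and summing over the action dimension.
import torch

num_actions = 2
out = torch.tensor([[0.5, 0.1, 2.0, 3.0, 7.0, 9.0]])  # [reward | metric_0 | metric_1]
action_mask = torch.tensor([[0.0, 1.0]])              # logged action was the second one

model_rewards = out[:, 0:num_actions]                 # [[0.5, 0.1]]
model_metrics = out[:, num_actions:]                  # [[2.0, 3.0, 7.0, 9.0]]
num_metrics = model_metrics.shape[1] // num_actions   # 2
logged_per_metric = [
    (model_metrics[:, i * num_actions : (i + 1) * num_actions] * action_mask).sum(
        dim=1, keepdim=True
    )
    for i in range(num_metrics)
]
print(torch.cat(logged_per_metric, dim=1))  # tensor([[3., 9.]])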
def create_from_tensors(
    cls,
    trainer: DQNTrainer,
    mdp_ids: np.ndarray,
    sequence_numbers: torch.Tensor,
    states: Union[mt.State, torch.Tensor],
    actions: Union[mt.Action, torch.Tensor],
    propensities: torch.Tensor,
    rewards: torch.Tensor,
    possible_actions_mask: torch.Tensor,
    possible_actions: Optional[mt.FeatureVector] = None,
    max_num_actions: Optional[int] = None,
    metrics: Optional[torch.Tensor] = None,
):
    # Switch to evaluation mode for the network
    old_q_train_state = trainer.q_network.training
    old_reward_train_state = trainer.reward_network.training
    trainer.q_network.train(False)
    trainer.reward_network.train(False)

    if max_num_actions:
        # Parametric model CPE
        state_action_pairs = mt.StateAction(state=states, action=actions)  # type: ignore
        tiled_state = mt.FeatureVector(
            states.float_features.repeat(1, max_num_actions).reshape(  # type: ignore
                -1, states.float_features.shape[1]  # type: ignore
            )
        )
        # Get Q-value of action taken
        possible_actions_state_concat = mt.StateAction(  # type: ignore
            state=tiled_state, action=possible_actions  # type: ignore
        )

        # Parametric actions
        # FIXME: model_values and model_propensities should be calculated
        # as in the discrete dqn model
        model_values = trainer.q_network(possible_actions_state_concat).q_value  # type: ignore
        optimal_q_values = model_values
        eval_action_idxs = None

        assert (
            model_values.shape[0] * model_values.shape[1]
            == possible_actions_mask.shape[0] * possible_actions_mask.shape[1]
        ), (
            "Invalid shapes: "
            + str(model_values.shape)
            + " != "
            + str(possible_actions_mask.shape)
        )
        model_values = model_values.reshape(possible_actions_mask.shape)
        model_propensities = masked_softmax(
            model_values, possible_actions_mask, trainer.rl_temperature
        )

        model_rewards = trainer.reward_network(possible_actions_state_concat).q_value  # type: ignore
        assert (
            model_rewards.shape[0] * model_rewards.shape[1]
            == possible_actions_mask.shape[0] * possible_actions_mask.shape[1]
        ), (
            "Invalid shapes: "
            + str(model_rewards.shape)
            + " != "
            + str(possible_actions_mask.shape)
        )
        model_rewards = model_rewards.reshape(possible_actions_mask.shape)

        model_values_for_logged_action = trainer.q_network(state_action_pairs).q_value
        model_rewards_for_logged_action = trainer.reward_network(
            state_action_pairs
        ).q_value

        action_mask = (
            torch.abs(model_values - model_values_for_logged_action) < 1e-3
        ).float()

        model_metrics = None
        model_metrics_for_logged_action = None
        model_metrics_values = None
        model_metrics_values_for_logged_action = None
    else:
        if isinstance(states, mt.State):
            states = mt.StateInput(state=states)  # type: ignore

        num_actions = trainer.num_actions
        action_mask = actions.float()  # type: ignore

        # Switch to evaluation mode for the network
        old_q_cpe_train_state = trainer.q_network_cpe.training
        trainer.q_network_cpe.train(False)

        # Discrete actions
        rewards = trainer.boost_rewards(rewards, actions)  # type: ignore
        model_values = trainer.q_network_cpe(states).q_values[:, 0:num_actions]
        optimal_q_values = trainer.get_detached_q_values(states.state)[0]  # type: ignore
        eval_action_idxs = trainer.get_max_q_values(  # type: ignore
            optimal_q_values, possible_actions_mask
        )[1]
        model_propensities = masked_softmax(
            optimal_q_values, possible_actions_mask, trainer.rl_temperature
        )
        assert model_values.shape == actions.shape, (  # type: ignore
            "Invalid shape: " + str(model_values.shape) + " != " + str(actions.shape)
        )
        assert model_values.shape == possible_actions_mask.shape, (  # type: ignore
            "Invalid shape: "
            + str(model_values.shape)
            + " != "
            + str(possible_actions_mask.shape)
        )
        model_values_for_logged_action = torch.sum(
            model_values * action_mask, dim=1, keepdim=True
        )

        rewards_and_metric_rewards = trainer.reward_network(states)
        # In case we reuse the modular for Q-network
        if hasattr(rewards_and_metric_rewards, "q_values"):
            rewards_and_metric_rewards = rewards_and_metric_rewards.q_values

        model_rewards = rewards_and_metric_rewards[:, 0:num_actions]
        assert model_rewards.shape == actions.shape, (  # type: ignore
            "Invalid shape: " + str(model_rewards.shape) + " != " + str(actions.shape)
        )
        model_rewards_for_logged_action = torch.sum(
            model_rewards * action_mask, dim=1, keepdim=True
        )

        model_metrics = rewards_and_metric_rewards[:, num_actions:]
        assert model_metrics.shape[1] % num_actions == 0, (
            "Invalid metrics shape: "
            + str(model_metrics.shape)
            + " "
            + str(num_actions)
        )
        num_metrics = model_metrics.shape[1] // num_actions

        if num_metrics == 0:
            model_metrics_values = None
            model_metrics_for_logged_action = None
            model_metrics_values_for_logged_action = None
        else:
            model_metrics_values = trainer.q_network_cpe(states)
            # Backward compatibility
            if hasattr(model_metrics_values, "q_values"):
                model_metrics_values = model_metrics_values.q_values
            model_metrics_values = model_metrics_values[:, num_actions:]
            assert model_metrics_values.shape[1] == num_actions * num_metrics, (  # type: ignore
                "Invalid shape: "
                + str(model_metrics_values.shape[1])
                + " != "
                + str(actions.shape[1] * num_metrics)  # type: ignore
            )

            model_metrics_for_logged_action_list = []
            model_metrics_values_for_logged_action_list = []
            for metric_index in range(num_metrics):
                metric_start = metric_index * num_actions
                metric_end = (metric_index + 1) * num_actions
                model_metrics_for_logged_action_list.append(
                    torch.sum(
                        model_metrics[:, metric_start:metric_end] * action_mask,
                        dim=1,
                        keepdim=True,
                    )
                )
                model_metrics_values_for_logged_action_list.append(
                    torch.sum(
                        model_metrics_values[:, metric_start:metric_end] * action_mask,
                        dim=1,
                        keepdim=True,
                    )
                )
            model_metrics_for_logged_action = torch.cat(
                model_metrics_for_logged_action_list, dim=1
            )
            model_metrics_values_for_logged_action = torch.cat(
                model_metrics_values_for_logged_action_list, dim=1
            )

        # Switch back to the old mode
        trainer.q_network_cpe.train(old_q_cpe_train_state)  # type: ignore

    # Switch back to the old mode
    trainer.q_network.train(old_q_train_state)  # type: ignore
    trainer.reward_network.train(old_reward_train_state)  # type: ignore

    return cls(
        mdp_id=mdp_ids,
        sequence_number=sequence_numbers,
        logged_propensities=propensities,
        logged_rewards=rewards,
        action_mask=action_mask,
        model_rewards=model_rewards,
        model_rewards_for_logged_action=model_rewards_for_logged_action,
        model_values=model_values,
        model_values_for_logged_action=model_values_for_logged_action,
        model_metrics_values=model_metrics_values,
        model_metrics_values_for_logged_action=model_metrics_values_for_logged_action,
        model_propensities=model_propensities,
        logged_metrics=metrics,
        model_metrics=model_metrics,
        model_metrics_for_logged_action=model_metrics_for_logged_action,
        # Will compute later
        logged_values=None,
        logged_metrics_values=None,
        possible_actions_mask=possible_actions_mask,
        optimal_q_values=optimal_q_values,
        eval_action_idxs=eval_action_idxs,
    )
def train_network(params):
    writer = None
    if params["model_output_path"] is not None:
        writer = SummaryWriter(log_dir=params["model_output_path"])

    logger.info("Running DQN workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"]
    )

    action_names = np.array(params["actions"])
    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])

    trainer_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    eval_dataset = JSONDataset(params["eval_data_path"], batch_size=16)
    state_normalization = read_norm_file(params["state_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    trainer = DQNTrainer(
        trainer_params,
        state_normalization,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    preprocessor = Preprocessor(state_normalization, False)

    evaluator = Evaluator(
        trainer_params.actions,
        trainer_params.rl.gamma,
        trainer,
        metrics_to_score=trainer.metrics_to_score,
    )

    start_time = time.time()
    for epoch in range(int(params["epochs"])):
        dataset.reset_iterator()
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch, int(params["epochs"]))
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(preprocessor, batch, action_names)
            tdp.set_type(trainer.dtype)
            trainer.train(tdp)

        eval_dataset.reset_iterator()
        accumulated_edp = None
        while True:
            batch = eval_dataset.read_batch(batch_idx)
            if batch is None:
                break
            tdp = preprocess_batch_for_training(preprocessor, batch, action_names)
            edp = EvaluationDataPage.create_from_tdp(tdp, trainer)
            if accumulated_edp is None:
                accumulated_edp = edp
            else:
                accumulated_edp = accumulated_edp.append(edp)
        accumulated_edp = accumulated_edp.compute_values(trainer.gamma)

        cpe_start_time = time.time()
        details = evaluator.evaluate_post_training(accumulated_edp)
        details.log()
        logger.info(
            "CPE evaluation took {} seconds.".format(time.time() - cpe_start_time)
        )

    through_put = (len(dataset) * int(params["epochs"])) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(through_put))
    )

    if writer is not None:
        writer.close()

    return export_trainer_and_predictor(trainer, params["model_output_path"])
def train_network(params):
    writer = None
    if params["model_output_path"] is not None:
        writer = SummaryWriter(
            log_dir=os.path.join(
                os.path.expanduser(params["model_output_path"]), "training_data"
            )
        )

    logger.info("Running DQN workflow with params:")
    logger.info(params)

    action_names = np.array(params["actions"])
    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])
    if params["in_training_cpe"] is not None:
        in_training_cpe_parameters = InTrainingCPEParameters(
            **params["in_training_cpe"]
        )
    else:
        in_training_cpe_parameters = None

    trainer_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
        in_training_cpe=in_training_cpe_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    state_normalization = read_norm_file(params["state_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    trainer = DQNTrainer(trainer_params, state_normalization, params["use_gpu"])
    trainer = update_model_for_warm_start(trainer)
    preprocessor = Preprocessor(state_normalization, params["use_gpu"])

    if trainer_params.in_training_cpe is not None:
        evaluator = Evaluator(
            trainer_params.actions,
            10,
            trainer_params.rl.gamma,
            trainer,
            trainer_params.in_training_cpe.mdp_sampled_rate,
        )
    else:
        evaluator = Evaluator(
            trainer_params.actions,
            10,
            trainer_params.rl.gamma,
            trainer,
            float(DEFAULT_NUM_SAMPLES_FOR_CPE) / len(dataset),
        )

    start_time = time.time()
    for epoch in range(int(params["epochs"])):
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch, int(params["epochs"]))
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(preprocessor, batch, action_names)

            trainer.train(tdp)

            trainer.evaluate(evaluator, tdp.actions, None, tdp.rewards, tdp.episode_values)

            evaluator.collect_discrete_action_samples(
                mdp_ids=tdp.mdp_ids,
                sequence_numbers=tdp.sequence_numbers.cpu().numpy(),
                states=tdp.states.cpu().numpy(),
                logged_actions=tdp.actions.cpu().numpy(),
                logged_rewards=tdp.rewards.cpu().numpy(),
                logged_propensities=tdp.propensities.cpu().numpy(),
                logged_terminals=np.invert(
                    tdp.not_terminals.cpu().numpy().astype(np.bool)
                ),
            )

        cpe_start_time = time.time()
        evaluator.recover_samples_to_be_unshuffled()
        evaluator.score_cpe()
        if writer is not None:
            evaluator.log_to_tensorboard(writer, epoch)
        evaluator.clear_collected_samples()
        logger.info(
            "CPE evaluation took {} seconds.".format(time.time() - cpe_start_time)
        )

    through_put = (len(dataset) * int(params["epochs"])) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(through_put))
    )

    if writer is not None:
        writer.close()

    return export_trainer_and_predictor(trainer, params["model_output_path"])
def get_modular_sarsa_trainer_reward_boost(
    self,
    environment,
    reward_shape,
    dueling,
    categorical,
    quantile,
    use_gpu=False,
    use_all_avail_gpus=False,
    clip_grad_norm=None,
):
    assert not quantile or not categorical
    parameters = self.get_sarsa_parameters(
        environment, reward_shape, dueling, categorical, quantile, clip_grad_norm
    )

    if quantile:
        if dueling:
            q_network = DuelingQuantileDQN(
                layers=[get_num_output_features(environment.normalization)]
                + parameters.training.layers[1:-1]
                + [len(environment.ACTIONS)],
                activations=parameters.training.activations,
                num_atoms=parameters.rainbow.num_atoms,
            )
        else:
            q_network = QuantileDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                num_atoms=parameters.rainbow.num_atoms,
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )
    elif categorical:
        assert not dueling
        q_network = CategoricalDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            num_atoms=parameters.rainbow.num_atoms,
            qmin=-100,
            qmax=200,
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
    else:
        if dueling:
            q_network = DuelingQNetwork(
                layers=[get_num_output_features(environment.normalization)]
                + parameters.training.layers[1:-1]
                + [len(environment.ACTIONS)],
                activations=parameters.training.activations,
            )
        else:
            q_network = FullyConnectedDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )

    q_network_cpe, q_network_cpe_target, reward_network = None, None, None
    if parameters.evaluation and parameters.evaluation.calc_cpe_in_training:
        q_network_cpe = FullyConnectedDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        q_network_cpe_target = q_network_cpe.get_target_network()
        reward_network = FullyConnectedDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )

    if use_gpu:
        q_network = q_network.cuda()
        if parameters.evaluation.calc_cpe_in_training:
            reward_network = reward_network.cuda()
            q_network_cpe = q_network_cpe.cuda()
            q_network_cpe_target = q_network_cpe_target.cuda()
        if use_all_avail_gpus and not categorical:
            q_network = q_network.get_distributed_data_parallel_model()
            reward_network = reward_network.get_distributed_data_parallel_model()
            q_network_cpe = q_network_cpe.get_distributed_data_parallel_model()
            q_network_cpe_target = (
                q_network_cpe_target.get_distributed_data_parallel_model()
            )

    if quantile:
        trainer = QRDQNTrainer(
            q_network,
            q_network.get_target_network(),
            parameters,
            use_gpu,
            reward_network=reward_network,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
        )
    elif categorical:
        trainer = C51Trainer(q_network, q_network.get_target_network(), parameters, use_gpu)
    else:
        parameters = DQNTrainerParameters.from_discrete_action_model_parameters(parameters)
        trainer = DQNTrainer(
            q_network,
            q_network.get_target_network(),
            reward_network,
            parameters,
            use_gpu,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
        )
    return trainer
def run_gym(
    params,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
    batch_rl_file_path=None,
):
    # Caffe2 core uses the min of caffe2_log_level and minloglevel
    # to determine loglevel. See caffe2/caffe2/core/logging.cc for more info.
    core.GlobalInit(["caffe2", "--caffe2_log_level=2", "--minloglevel=2"])

    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    env = OpenAIGymEnvironment(
        env_type,
        rl_parameters.epsilon,
        rl_parameters.softmax_policy,
        params["max_replay_memory_size"],
        rl_parameters.gamma,
    )
    model_type = params["model_type"]
    c2_device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA, gpu_id
    )
    use_gpu = gpu_id != USE_CPU

    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_settings = params["training"]
        training_parameters = TrainingParameters(**training_settings)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters = CNNParameters(
                **training_settings["cnn_parameters"]
            )
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels
            )
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions, rl=rl_parameters, training=training_parameters
        )
        trainer = DQNTrainer(trainer_params, env.normalization, use_gpu)
    elif model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"]
                )
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
                training_parameters.cnn_parameters.input_height = env.height
                training_parameters.cnn_parameters.input_width = env.width
                training_parameters.cnn_parameters.num_input_channels = (
                    env.num_input_channels
                )
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions, rl=rl_parameters, training=training_parameters
            )
            trainer = DiscreteActionTrainer(trainer_params, env.normalization)
    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_settings = params["training"]
        training_parameters = TrainingParameters(**training_settings)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters = CNNParameters(
                **training_settings["cnn_parameters"]
            )
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters,
            training=training_parameters,
            knn=KnnParameters(model_type="DQN"),
        )
        trainer = ParametricDQNTrainer(
            trainer_params, env.normalization, env.normalization_action, use_gpu
        )
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"]
                )
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = ContinuousActionModelParameters(
                rl=rl_parameters,
                training=training_parameters,
                knn=KnnParameters(model_type="DQN"),
            )
            trainer = ContinuousActionDQNTrainer(
                trainer_params, env.normalization, env.normalization_action
            )
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_settings = params["shared_training"]
        actor_settings = params["actor_training"]
        critic_settings = params["critic_training"]
        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=DDPGTrainingParameters(**training_settings),
            actor_training=DDPGNetworkParameters(**actor_settings),
            critic_training=DDPGNetworkParameters(**critic_settings),
        )
        action_range_low = env.action_space.low.astype(np.float32)
        action_range_high = env.action_space.high.astype(np.float32)
        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            torch.from_numpy(action_range_low).unsqueeze(dim=0),
            torch.from_numpy(action_range_high).unsqueeze(dim=0),
            use_gpu,
        )
    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return run(
        c2_device,
        env,
        model_type,
        trainer,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_episode=start_saving_from_episode,
        batch_rl_file_path=batch_rl_file_path,
    )
def get_modular_sarsa_trainer_reward_boost(
    self,
    environment,
    reward_shape,
    dueling,
    categorical,
    quantile,
    use_gpu=False,
    use_all_avail_gpus=False,
    clip_grad_norm=None,
):
    parameters = self.get_sarsa_parameters(
        environment, reward_shape, dueling, categorical, quantile, clip_grad_norm
    )
    if quantile:
        q_network = QuantileDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            num_atoms=50,
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        parameters.rainbow.num_atoms = 50
    elif categorical:
        q_network = CategoricalDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            num_atoms=51,
            qmin=-100,
            qmax=200,
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
    else:
        q_network = FullyConnectedDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )

    q_network_cpe = FullyConnectedDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=len(environment.ACTIONS),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )
    reward_network = FullyConnectedDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=len(environment.ACTIONS),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )

    if use_gpu:
        q_network = q_network.cuda()
        if not categorical and not quantile:
            reward_network = reward_network.cuda()
            q_network_cpe = q_network_cpe.cuda()
        if use_all_avail_gpus and not categorical:
            q_network = q_network.get_distributed_data_parallel_model()
            reward_network = reward_network.get_distributed_data_parallel_model()
            q_network_cpe = q_network_cpe.get_distributed_data_parallel_model()

    if quantile:
        trainer = QRDQNTrainer(q_network, q_network.get_target_network(), parameters, use_gpu)
    elif categorical:
        trainer = C51Trainer(q_network, q_network.get_target_network(), parameters, use_gpu)
    else:
        trainer = DQNTrainer(
            q_network,
            q_network.get_target_network(),
            reward_network,
            parameters,
            use_gpu,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe.get_target_network(),
        )
    return trainer
def create_dqn_trainer_from_params(
    model: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    metrics_to_score=None,
):
    metrics_to_score = metrics_to_score or []

    if model.rainbow.dueling_architecture:
        q_network = DuelingQNetwork(
            layers=[get_num_output_features(normalization_parameters)]
            + model.training.layers[1:-1]
            + [len(model.actions)],
            activations=model.training.activations,
        )
    else:
        q_network = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()

    q_network_target = q_network.get_target_network()

    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    if model.evaluation.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(metrics_to_score) + 1) * len(model.actions)
        reward_network = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
        q_network_cpe = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

        if use_gpu and torch.cuda.is_available():
            reward_network.cuda()
            q_network_cpe.cuda()

        q_network_cpe_target = q_network_cpe.get_target_network()

    return DQNTrainer(
        q_network,
        q_network_target,
        reward_network,
        model,
        use_gpu,
        q_network_cpe=q_network_cpe,
        q_network_cpe_target=q_network_cpe_target,
        metrics_to_score=metrics_to_score,
    )
def create_trainer(model_type, params, rl_parameters, use_gpu, env):
    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            if isinstance(training_parameters.cnn_parameters, dict):
                training_parameters.cnn_parameters = CNNParameters(
                    **training_parameters.cnn_parameters
                )
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels
            )
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=rainbow_parameters,
        )
        trainer = DQNTrainer(trainer_params, env.normalization, use_gpu)
    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters, training=training_parameters, rainbow=rainbow_parameters
        )
        trainer = ParametricDQNTrainer(
            trainer_params, env.normalization, env.normalization_action, use_gpu
        )
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_parameters = params["shared_training"]
        if isinstance(training_parameters, dict):
            training_parameters = DDPGTrainingParameters(**training_parameters)
        actor_parameters = params["actor_training"]
        if isinstance(actor_parameters, dict):
            actor_parameters = DDPGNetworkParameters(**actor_parameters)
        critic_parameters = params["critic_training"]
        if isinstance(critic_parameters, dict):
            critic_parameters = DDPGNetworkParameters(**critic_parameters)
        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=training_parameters,
            actor_training=actor_parameters,
            critic_training=critic_parameters,
        )
        action_range_low = env.action_space.low.astype(np.float32)
        action_range_high = env.action_space.high.astype(np.float32)
        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            torch.from_numpy(action_range_low).unsqueeze(dim=0),
            torch.from_numpy(action_range_high).unsqueeze(dim=0),
            use_gpu,
        )
    elif model_type == ModelType.SOFT_ACTOR_CRITIC.value:
        trainer_params = SACModelParameters(
            rl=rl_parameters,
            training=SACTrainingParameters(
                minibatch_size=params["sac_training"]["minibatch_size"],
                use_2_q_functions=params["sac_training"]["use_2_q_functions"],
                q_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["q_network_optimizer"]
                ),
                value_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["value_network_optimizer"]
                ),
                actor_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["actor_network_optimizer"]
                ),
                entropy_temperature=params["sac_training"]["entropy_temperature"],
            ),
            q_network=FeedForwardParameters(**params["sac_q_training"]),
            value_network=FeedForwardParameters(**params["sac_value_training"]),
            actor_network=FeedForwardParameters(**params["sac_actor_training"]),
        )
        trainer = get_sac_trainer(env, trainer_params, use_gpu)
    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return trainer