def get_trainer(
    self, environment, parameters=None, use_gpu=False, use_all_avail_gpus=False
):
    parameters = parameters or self.get_sarsa_parameters()
    q_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=get_num_output_features(environment.normalization_action),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )
    reward_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=get_num_output_features(environment.normalization_action),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )
    if use_gpu:
        q_network = q_network.cuda()
        reward_network = reward_network.cuda()
        if use_all_avail_gpus:
            q_network = q_network.get_distributed_data_parallel_model()
            reward_network = reward_network.get_distributed_data_parallel_model()
    q_network_target = q_network.get_target_network()
    trainer = ParametricDQNTrainer(
        q_network, q_network_target, reward_network, parameters
    )
    return trainer
def __init__(
    self,
    parameters: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
) -> None:
    self._additional_feature_types = additional_feature_types
    self.state_normalization_parameters = state_normalization_parameters
    self.action_normalization_parameters = action_normalization_parameters
    num_features = get_num_output_features(
        state_normalization_parameters
    ) + get_num_output_features(action_normalization_parameters)

    # ensure state and action IDs have no intersection
    overlapping_features = set(state_normalization_parameters.keys()) & set(
        action_normalization_parameters.keys()
    )
    assert len(overlapping_features) == 0, (
        "There are some overlapping state and action features: "
        + str(overlapping_features)
    )

    parameters.training.layers[0] = num_features
    parameters.training.layers[-1] = 1

    RLTrainer.__init__(self, parameters)

    self._create_internal_policy_net()
def get_trainer(
    self, environment, parameters=None, use_gpu=False, use_all_avail_gpus=False
):
    layers = [256, 128]
    activations = ["relu", "relu"]
    parameters = parameters or self.get_sarsa_parameters()
    q_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=get_num_output_features(environment.normalization_action),
        sizes=layers,
        activations=activations,
    )
    reward_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=get_num_output_features(environment.normalization_action),
        sizes=layers,
        activations=activations,
    )
    if use_gpu:
        q_network = q_network.cuda()
        reward_network = reward_network.cuda()
        if use_all_avail_gpus:
            q_network = q_network.get_distributed_data_parallel_model()
            reward_network = reward_network.get_distributed_data_parallel_model()
    q_network_target = q_network.get_target_network()
    param_dict = parameters.asdict()  # type: ignore
    trainer = ParametricDQNTrainer(
        q_network, q_network_target, reward_network, **param_dict
    )
    return trainer
def _get_sac_trainer_params(env, sac_model_params, use_gpu):
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        sac_model_params.q_network.layers,
        sac_model_params.q_network.activations,
    )
    q2_network = None
    if sac_model_params.training.use_2_q_functions:
        q2_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            sac_model_params.q_network.layers,
            sac_model_params.q_network.activations,
        )
    value_network = FullyConnectedNetwork(
        [state_dim] + sac_model_params.value_network.layers + [1],
        sac_model_params.value_network.activations + ["linear"],
    )
    actor_network = GaussianFullyConnectedActor(
        state_dim,
        action_dim,
        sac_model_params.actor_network.layers,
        sac_model_params.actor_network.activations,
    )
    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        value_network.cuda()
        actor_network.cuda()

    value_network_target = deepcopy(value_network)

    min_action_range_tensor_training = torch.full((1, action_dim), -1 + 1e-6)
    max_action_range_tensor_training = torch.full((1, action_dim), 1 - 1e-6)
    action_range_low = env.action_space.low.astype(np.float32)
    action_range_high = env.action_space.high.astype(np.float32)
    min_action_range_tensor_serving = torch.from_numpy(action_range_low).unsqueeze(
        dim=0
    )
    max_action_range_tensor_serving = torch.from_numpy(action_range_high).unsqueeze(
        dim=0
    )

    trainer_args = [
        q1_network,
        value_network,
        value_network_target,
        actor_network,
        sac_model_params,
    ]
    trainer_kwargs = {
        "q2_network": q2_network,
        "min_action_range_tensor_training": min_action_range_tensor_training,
        "max_action_range_tensor_training": max_action_range_tensor_training,
        "min_action_range_tensor_serving": min_action_range_tensor_serving,
        "max_action_range_tensor_serving": max_action_range_tensor_serving,
    }
    return trainer_args, trainer_kwargs
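The SAC helper above trains the actor in a near-[-1, 1] range (the tanh output range, shrunk by a small epsilon) while serving actions in the environment's native bounds. A minimal sketch of that affine rescaling using plain PyTorch; the function name and formula here are illustrative assumptions, not the trainer's actual implementation.

import torch

# Hypothetical illustration: linearly map actions from the training range
# ([-1, 1], as produced by a tanh-squashed actor) to the serving range
# (the environment's action_space bounds), and back if needed.
def rescale_actions(actions, from_low, from_high, to_low, to_high):
    """Linearly map `actions` from [from_low, from_high] to [to_low, to_high]."""
    scale = (to_high - to_low) / (from_high - from_low)
    return to_low + (actions - from_low) * scale

action_dim = 2
min_training = torch.full((1, action_dim), -1.0)
max_training = torch.full((1, action_dim), 1.0)
min_serving = torch.tensor([[0.0, -5.0]])   # e.g. env.action_space.low
max_serving = torch.tensor([[10.0, 5.0]])   # e.g. env.action_space.high

squashed = torch.tensor([[0.0, 1.0]])       # actor output in [-1, 1]
served = rescale_actions(squashed, min_training, max_training, min_serving, max_serving)
print(served)  # tensor([[5., 5.]])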
def __init__(
    self,
    state_normalization_parameters: Dict[str, NormalizationParameters],
    action_normalization_parameters: Dict[str, NormalizationParameters],
    parameters: ContinuousActionModelParameters,
    skip_normalization: Optional[bool] = False,
) -> None:
    self._action_features = list(action_normalization_parameters.keys())
    self.num_unprocessed_action_features = len(self._action_features)
    self.num_processed_action_features = get_num_output_features(
        action_normalization_parameters
    )
    self.num_processed_state_features = get_num_output_features(
        state_normalization_parameters
    )
    if parameters.training.layers[0] is None or parameters.training.layers[0] == -1:
        parameters.training.layers[0] = (
            self.num_state_features + self.num_action_features
        )

    assert parameters.training.layers[-1] == 1, "Set layers[-1] to 1"
    self._action_normalization_parameters = action_normalization_parameters
    RLTrainer.__init__(
        self, state_normalization_parameters, parameters, skip_normalization
    )
    print(action_normalization_parameters)
    self._prepare_action_normalization()
def create_parametric_dqn_trainer_from_params(
    model: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
):
    q_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    reward_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    q_network_target = q_network.get_target_network()

    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()
        q_network_target = q_network_target.cuda()
        reward_network = reward_network.cuda()

    if use_all_avail_gpus:
        q_network = q_network.get_distributed_data_parallel_model()
        q_network_target = q_network_target.get_distributed_data_parallel_model()
        reward_network = reward_network.get_distributed_data_parallel_model()

    return ParametricDQNTrainer(
        q_network, q_network_target, reward_network, model, use_gpu
    )
def get_modular_sarsa_trainer_reward_boost(
    self,
    environment,
    reward_shape,
    dueling,
    use_gpu=False,
    use_all_avail_gpus=False,
    clip_grad_norm=None,
):
    parameters = self.get_sarsa_parameters(
        environment, reward_shape, dueling, clip_grad_norm
    )
    q_network = FullyConnectedDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=len(environment.ACTIONS),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )
    reward_network = FullyConnectedDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=len(environment.ACTIONS),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )
    if use_gpu:
        q_network = q_network.cuda()
        reward_network = reward_network.cuda()
        if use_all_avail_gpus:
            q_network = q_network.get_data_parallel_model()
            reward_network = reward_network.get_data_parallel_model()
    q_network_target = q_network.get_target_network()
    trainer = _DQNTrainer(
        q_network, q_network_target, reward_network, parameters, use_gpu
    )
    return trainer
def get_sac_trainer(self, env, parameters, use_gpu):
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_continuous_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        parameters.q_network.layers,
        parameters.q_network.activations,
    )
    q2_network = None
    if parameters.training.use_2_q_functions:
        q2_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            parameters.q_network.layers,
            parameters.q_network.activations,
        )
    if parameters.constrain_action_sum:
        actor_network = DirichletFullyConnectedActor(
            state_dim,
            action_dim,
            parameters.actor_network.layers,
            parameters.actor_network.activations,
        )
    else:
        actor_network = GaussianFullyConnectedActor(
            state_dim,
            action_dim,
            parameters.actor_network.layers,
            parameters.actor_network.activations,
        )
    value_network = None
    if parameters.training.use_value_network:
        value_network = FullyConnectedNetwork(
            [state_dim] + parameters.value_network.layers + [1],
            parameters.value_network.activations + ["linear"],
        )
    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        if value_network:
            value_network.cuda()
        actor_network.cuda()
    return SACTrainer(
        q1_network,
        actor_network,
        parameters,
        use_gpu=use_gpu,
        value_network=value_network,
        q2_network=q2_network,
    )
def __init__(
    self,
    parameters: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu=False,
    additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
) -> None:
    self.warm_start_model_path = parameters.training.warm_start_model_path
    self.minibatch_size = parameters.training.minibatch_size
    self.state_normalization_parameters = state_normalization_parameters
    self.action_normalization_parameters = action_normalization_parameters
    self.num_state_features = get_num_output_features(
        state_normalization_parameters
    )
    self.num_action_features = get_num_output_features(
        action_normalization_parameters
    )
    self.num_features = self.num_state_features + self.num_action_features

    # ensure state and action IDs have no intersection
    overlapping_features = set(state_normalization_parameters.keys()) & set(
        action_normalization_parameters.keys()
    )
    assert len(overlapping_features) == 0, (
        "There are some overlapping state and action features: "
        + str(overlapping_features)
    )

    if parameters.training.factorization_parameters is None:
        parameters.training.layers[0] = self.num_features
        parameters.training.layers[-1] = 1
    else:
        parameters.training.factorization_parameters.state.layers[
            0
        ] = self.num_state_features
        parameters.training.factorization_parameters.action.layers[
            0
        ] = self.num_action_features

    RLTrainer.__init__(self, parameters, use_gpu, additional_feature_types, None)

    self.q_network = self._get_model(parameters.training)
    self.q_network_target = deepcopy(self.q_network)
    self._set_optimizer(parameters.training.optimizer)
    self.q_network_optimizer = self.optimizer_func(
        self.q_network.parameters(), lr=parameters.training.learning_rate
    )

    self.reward_network = self._get_model(parameters.training)
    self.reward_network_optimizer = self.optimizer_func(
        self.reward_network.parameters(), lr=parameters.training.learning_rate
    )

    if self.use_gpu:
        self.q_network.cuda()
        self.q_network_target.cuda()
        self.reward_network.cuda()
def get_td3_trainer(env, parameters, use_gpu):
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        parameters.q_network.layers,
        parameters.q_network.activations,
    )
    q2_network = None
    if parameters.training.use_2_q_functions:
        q2_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            parameters.q_network.layers,
            parameters.q_network.activations,
        )
    actor_network = FullyConnectedActor(
        state_dim,
        action_dim,
        parameters.actor_network.layers,
        parameters.actor_network.activations,
    )

    min_action_range_tensor_training = torch.full((1, action_dim), -1)
    max_action_range_tensor_training = torch.full((1, action_dim), 1)
    min_action_range_tensor_serving = torch.FloatTensor(
        env.action_space.low
    ).unsqueeze(dim=0)
    max_action_range_tensor_serving = torch.FloatTensor(
        env.action_space.high
    ).unsqueeze(dim=0)

    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        actor_network.cuda()

        min_action_range_tensor_training = min_action_range_tensor_training.cuda()
        max_action_range_tensor_training = max_action_range_tensor_training.cuda()
        min_action_range_tensor_serving = min_action_range_tensor_serving.cuda()
        max_action_range_tensor_serving = max_action_range_tensor_serving.cuda()

    trainer_args = [q1_network, actor_network, parameters]
    trainer_kwargs = {
        "q2_network": q2_network,
        "min_action_range_tensor_training": min_action_range_tensor_training,
        "max_action_range_tensor_training": max_action_range_tensor_training,
        "min_action_range_tensor_serving": min_action_range_tensor_serving,
        "max_action_range_tensor_serving": max_action_range_tensor_serving,
    }
    return TD3Trainer(*trainer_args, use_gpu=use_gpu, **trainer_kwargs)
def __init__(
    self,
    state_preprocessor: Preprocessor,
    action_preprocessor: Preprocessor,
    seq_len: int,
):
    super().__init__(state_preprocessor, action_preprocessor)
    self.state_dim = get_num_output_features(
        state_preprocessor.normalization_parameters
    )
    self.action_dim = get_num_output_features(
        action_preprocessor.normalization_parameters
    )
    self.seq_len = seq_len
def create_parametric_dqn_trainer_from_params(
    model: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
):
    q_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    reward_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    q_network_target = q_network.get_target_network()

    if use_gpu:
        q_network = q_network.cuda()
        q_network_target = q_network_target.cuda()
        reward_network = reward_network.cuda()

    if use_all_avail_gpus:
        q_network = q_network.get_distributed_data_parallel_model()
        q_network_target = q_network_target.get_distributed_data_parallel_model()
        reward_network = reward_network.get_distributed_data_parallel_model()

    trainer_parameters = ParametricDQNTrainerParameters(  # type: ignore
        rl=model.rl,
        double_q_learning=model.rainbow.double_q_learning,
        minibatch_size=model.training.minibatch_size,
        optimizer=OptimizerParameters(
            optimizer=model.training.optimizer,
            learning_rate=model.training.learning_rate,
            l2_decay=model.training.l2_decay,
        ),
    )

    return ParametricDQNTrainer(
        q_network,
        q_network_target,
        reward_network,
        use_gpu=use_gpu,
        **trainer_parameters.asdict()  # type: ignore
    )
def __init__(
    self,
    parameters: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
    additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
) -> None:
    self._additional_feature_types = additional_feature_types
    self._actions = parameters.actions if parameters.actions is not None else []

    self.reward_shape = {}  # type: Dict[int, float]
    if parameters.rl.reward_boost is not None and self._actions is not None:
        for k in parameters.rl.reward_boost.keys():
            i = self._actions.index(k)
            self.reward_shape[i] = parameters.rl.reward_boost[k]

    if parameters.training.cnn_parameters is None:
        self.state_normalization_parameters: Optional[
            Dict[int, NormalizationParameters]
        ] = normalization_parameters
        num_features = get_num_output_features(normalization_parameters)
        parameters.training.layers[0] = num_features
    else:
        self.state_normalization_parameters = None
    parameters.training.layers[-1] = self.num_actions

    RLTrainer.__init__(self, parameters)

    self._create_all_q_score_net()
    self._create_internal_policy_net()
def __init__(
    self,
    state_normalization_parameters: Dict[str, NormalizationParameters],
    parameters: DiscreteActionModelParameters,
    skip_normalization: Optional[bool] = False,
) -> None:
    self._actions = parameters.actions
    self.num_processed_state_features = get_num_output_features(
        state_normalization_parameters
    )
    if parameters.training.layers[0] in [None, -1, 1]:
        parameters.training.layers[0] = self.num_state_features

    # There is a logical 1-dimensional output for each state/action pair,
    # but the underlying network computes num_actions-dimensional outputs
    if parameters.training.layers[-1] in [None, -1, 1]:
        parameters.training.layers[-1] = self.num_actions

    assert (
        parameters.training.layers[-1] == self.num_actions
    ), "Set layers[-1] to the number of actions or a default placeholder value"

    RLTrainer.__init__(
        self, state_normalization_parameters, parameters, skip_normalization
    )
def __init__(
    self,
    parameters: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
) -> None:
    self.state_normalization_parameters = state_normalization_parameters
    self.action_normalization_parameters = action_normalization_parameters
    num_features = get_num_output_features(
        state_normalization_parameters
    ) + get_num_output_features(action_normalization_parameters)

    parameters.training.layers[0] = num_features
    parameters.training.layers[-1] = 1

    RLTrainer.__init__(self, parameters)
def build_actor(
    self,
    state_normalization_data: NormalizationData,
    action_normalization_data: NormalizationData,
) -> ModelBase:
    state_dim = get_num_output_features(
        state_normalization_data.dense_normalization_parameters
    )
    action_dim = get_num_output_features(
        action_normalization_data.dense_normalization_parameters
    )
    return DirichletFullyConnectedActor(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=self.config.sizes,
        activations=self.config.activations,
        use_batch_norm=self.config.use_batch_norm,
    )
def get_modular_sarsa_trainer_exporter(
    self, environment, parameters=None, use_gpu=False, use_all_avail_gpus=False
):
    parameters = parameters or self.get_sarsa_parameters()
    q_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=get_num_output_features(environment.normalization_action),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )
    reward_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=get_num_output_features(environment.normalization_action),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )
    if use_gpu:
        q_network = q_network.cuda()
        reward_network = reward_network.cuda()
        if use_all_avail_gpus:
            q_network = q_network.get_data_parallel_model()
            reward_network = reward_network.get_data_parallel_model()
    q_network_target = q_network.get_target_network()
    trainer = _ParametricDQNTrainer(
        q_network, q_network_target, reward_network, parameters
    )
    state_preprocessor = Preprocessor(environment.normalization, False, True)
    action_preprocessor = Preprocessor(environment.normalization_action, False, True)
    feature_extractor = PredictorFeatureExtractor(
        state_normalization_parameters=environment.normalization,
        action_normalization_parameters=environment.normalization_action,
    )
    output_transformer = ParametricActionOutputTransformer()
    exporter = ParametricDQNExporter(
        q_network,
        feature_extractor,
        output_transformer,
        state_preprocessor,
        action_preprocessor,
    )
    return (trainer, exporter)
def build_q_network(
    self,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    output_dim: int = 1,
) -> ModelBase:
    state_dim = get_num_output_features(state_normalization_parameters)
    action_dim = get_num_output_features(action_normalization_parameters)
    return FullyConnectedParametricDQN(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=self.config.sizes,
        activations=self.config.activations,
        use_batch_norm=self.config.use_batch_norm,
        use_layer_norm=self.config.use_layer_norm,
        output_dim=output_dim,
    )
def normalize_dense_matrix(
    inputs: np.ndarray,
    features: List[str],
    normalization_params: Dict[str, NormalizationParameters],
    norm_blob_map: Dict[int, str],
    norm_net: core.Net,
    blobname_template: str,
    num_output_features: Optional[int] = None,
) -> np.ndarray:
    """
    Normalizes inputs according to parameters. Expects a dense matrix whose ith
    column corresponds to feature i.

    Note that the Caffe2 BatchBoxCox operator isn't implemented on CUDA GPU so
    we need to use a CPU context.

    :param inputs: Numpy array with inputs to normalize. Should be of
        shape (any, num_features).
    :param features: Array of feature names.
    :param normalization_params: Mapping from feature names to
        NormalizationParameters.
    :param norm_blob_map: Dictionary that stores a mapping from feature index
        to input normalization blob name.
    :param norm_net: Caffe2 net for normalization.
    :param blobname_template: String template for input blobs to norm_net.
    :param num_output_features: The number of features in an output processed
        datapoint. If set to None, this function will compute it.
    """
    num_input_features = len(features)
    num_output_features = num_output_features or get_num_output_features(
        normalization_params
    )
    assert inputs.shape[1] == num_input_features
    outputs = np.zeros((inputs.shape[0], num_output_features), dtype=np.float32)

    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
        for idx in range(num_input_features):
            input_blob = blobname_template.format(idx)
            workspace.FeedBlob(input_blob, inputs[:, idx])
        workspace.RunNet(norm_net)

        output_col = 0
        for idx, feature in enumerate(features):
            normalized_input_blob = norm_blob_map[idx]
            normalized_inputs = workspace.FetchBlob(normalized_input_blob)
            normalization_param = normalization_params[feature]
            if normalization_param.feature_type == identify_types.ENUM:
                next_output_col = output_col + len(
                    normalization_param.possible_values
                )
                outputs[:, output_col:next_output_col] = normalized_inputs
            else:
                next_output_col = output_col + 1
                outputs[:, output_col] = normalized_inputs
            output_col = next_output_col

    return outputs
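The output of normalize_dense_matrix is wider than the input whenever ENUM features are present, since each ENUM feature is one-hot encoded into one column per possible value; this is also why get_num_output_features is used to size the output buffer. A minimal sketch of just that column bookkeeping, using a hypothetical stand-in type (FakeNormParam is not the library's class) and skipping the actual Caffe2 normalization.

from collections import namedtuple

# Stand-in for NormalizationParameters, holding only the fields used here.
FakeNormParam = namedtuple("FakeNormParam", ["feature_type", "possible_values"])

def output_width(features, normalization_params):
    """Number of output columns: ENUM features expand to one column per
    possible value; every other feature type keeps a single column."""
    width = 0
    for feature in features:
        param = normalization_params[feature]
        if param.feature_type == "ENUM":
            width += len(param.possible_values)
        else:
            width += 1
    return width

features = ["price", "category"]
params = {
    "price": FakeNormParam("CONTINUOUS", None),
    "category": FakeNormParam("ENUM", [0, 1, 2]),
}
print(output_width(features, params))  # 4: one continuous column + three one-hot columns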
def build_value_network(
    self, state_normalization_data: NormalizationData
) -> torch.nn.Module:
    state_dim = get_num_output_features(
        state_normalization_data.dense_normalization_parameters
    )
    return FullyConnectedNetwork(
        [state_dim] + self.config.sizes + [1],
        self.config.activations + ["linear"],
        use_layer_norm=self.config.use_layer_norm,
    )
def get_sac_trainer(self, env, parameters, use_gpu):
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        parameters.q_network.layers,
        parameters.q_network.activations,
    )
    q2_network = None
    if parameters.training.use_2_q_functions:
        q2_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            parameters.q_network.layers,
            parameters.q_network.activations,
        )
    value_network = FullyConnectedNetwork(
        [state_dim] + parameters.value_network.layers + [1],
        parameters.value_network.activations + ["linear"],
    )
    actor_network = GaussianFullyConnectedActor(
        state_dim,
        action_dim,
        parameters.actor_network.layers,
        parameters.actor_network.activations,
    )
    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        value_network.cuda()
        actor_network.cuda()

    value_network_target = deepcopy(value_network)
    return SACTrainer(
        q1_network,
        value_network,
        value_network_target,
        actor_network,
        parameters,
        q2_network=q2_network,
    )
def __init__(
    self,
    parameters: DiscreteActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu=False,
    additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
    gradient_handler=None,
) -> None:
    self.double_q_learning = parameters.rainbow.double_q_learning
    self.warm_start_model_path = parameters.training.warm_start_model_path
    self.minibatch_size = parameters.training.minibatch_size
    self._actions = parameters.actions if parameters.actions is not None else []

    self.reward_shape = {}  # type: Dict[int, float]
    if parameters.rl.reward_boost is not None and self._actions is not None:
        for k in parameters.rl.reward_boost.keys():
            i = self._actions.index(k)
            self.reward_shape[i] = parameters.rl.reward_boost[k]

    if parameters.training.cnn_parameters is None:
        self.state_normalization_parameters: Optional[
            Dict[int, NormalizationParameters]
        ] = state_normalization_parameters
        self.num_features = get_num_output_features(state_normalization_parameters)
        parameters.training.layers[0] = self.num_features
    else:
        self.state_normalization_parameters = None
    parameters.training.layers[-1] = self.num_actions

    RLTrainer.__init__(
        self, parameters, use_gpu, additional_feature_types, gradient_handler
    )

    if parameters.rainbow.dueling_architecture:
        self.q_network = DuelingArchitectureQNetwork(
            parameters.training.layers, parameters.training.activations
        )
    else:
        self.q_network = GenericFeedForwardNetwork(
            parameters.training.layers, parameters.training.activations
        )
    self.q_network_target = deepcopy(self.q_network)
    self._set_optimizer(parameters.training.optimizer)
    self.q_network_optimizer = self.optimizer_func(
        self.q_network.parameters(), lr=parameters.training.learning_rate
    )

    self.reward_network = GenericFeedForwardNetwork(
        parameters.training.layers, parameters.training.activations
    )
    self.reward_network_optimizer = self.optimizer_func(
        self.reward_network.parameters(), lr=parameters.training.learning_rate
    )

    if self.use_gpu:
        self.q_network.cuda()
        self.q_network_target.cuda()
        self.reward_network.cuda()
def __init__(
    self,
    parameters: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
) -> None:
    self._actions = parameters.actions if parameters.actions is not None else []

    self.state_normalization_parameters = normalization_parameters
    num_features = get_num_output_features(normalization_parameters)
    parameters.training.layers[0] = num_features
    parameters.training.layers[-1] = self.num_actions

    RLTrainer.__init__(self, parameters)
def get_td3_trainer(self, env, parameters, use_gpu):
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        parameters.q_network.layers,
        parameters.q_network.activations,
    )
    q2_network = None
    if parameters.training.use_2_q_functions:
        q2_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            parameters.q_network.layers,
            parameters.q_network.activations,
        )
    actor_network = FullyConnectedActor(
        state_dim,
        action_dim,
        parameters.actor_network.layers,
        parameters.actor_network.activations,
    )
    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        actor_network.cuda()
    return TD3Trainer(
        q1_network,
        actor_network,
        parameters,
        q2_network=q2_network,
        use_gpu=use_gpu,
    )
def __init__(
    self,
    parameters: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
) -> None:
    self.state_normalization_parameters = state_normalization_parameters
    self.action_normalization_parameters = action_normalization_parameters
    num_features = get_num_output_features(
        state_normalization_parameters
    ) + get_num_output_features(action_normalization_parameters)

    # ensure state and action IDs have no intersection
    overlapping_features = set(state_normalization_parameters.keys()) & set(
        action_normalization_parameters.keys()
    )
    assert (
        len(overlapping_features) == 0
    ), "There are some overlapping state and action features: " + str(
        overlapping_features
    )

    parameters.training.layers[0] = num_features
    parameters.training.layers[-1] = 1

    RLTrainer.__init__(self, parameters)
def get_modular_sarsa_trainer_exporter(
    self, environment, parameters=None, use_gpu=False, use_all_avail_gpus=False
):
    parameters = parameters or self.get_sarsa_parameters()
    q_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=get_num_output_features(environment.normalization_action),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )
    reward_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=get_num_output_features(environment.normalization_action),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )
    if use_gpu:
        q_network = q_network.cuda()
        reward_network = reward_network.cuda()
        if use_all_avail_gpus:
            q_network = q_network.get_data_parallel_model()
            reward_network = reward_network.get_data_parallel_model()
    q_network_target = q_network.get_target_network()
    trainer = _ParametricDQNTrainer(
        q_network, q_network_target, reward_network, parameters
    )
    feature_extractor = PredictorFeatureExtractor(
        state_normalization_parameters=environment.normalization,
        action_normalization_parameters=environment.normalization_action,
    )
    output_transformer = ParametricActionOutputTransformer()
    exporter = ParametricDQNExporter(
        q_network, feature_extractor, output_transformer
    )
    return (trainer, exporter)
def __init__(
    self,
    parameters: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
) -> None:
    self._actions = parameters.actions if parameters.actions is not None else []

    self.reward_shape = {}  # type: Dict[int, float]
    if parameters.rl.reward_boost is not None and self._actions is not None:
        for k in parameters.rl.reward_boost.keys():
            i = self._actions.index(k)
            self.reward_shape[i] = parameters.rl.reward_boost[k]

    self.state_normalization_parameters = normalization_parameters
    num_features = get_num_output_features(normalization_parameters)
    parameters.training.layers[0] = num_features
    parameters.training.layers[-1] = self.num_actions

    RLTrainer.__init__(self, parameters)

    self._create_all_q_score_net()
def __init__(
    self,
    normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool,
) -> None:
    super().__init__()
    self.num_output_features = get_num_output_features(normalization_parameters)

    feature_types = {
        norm_param.feature_type for norm_param in normalization_parameters.values()
    }
    assert (
        len(feature_types) == 1
    ), "All dimensions of actions should have the same preprocessing"
    self.feature_type = list(feature_types)[0]
    assert self.feature_type in {
        CONTINUOUS_ACTION,
        DO_NOT_PREPROCESS,
    }, "Only support CONTINUOUS_ACTION & DO_NOT_PREPROCESS"

    self.device = torch.device("cuda" if use_gpu else "cpu")  # type: ignore

    if self.feature_type == CONTINUOUS_ACTION:
        sorted_features = sorted(normalization_parameters.keys())
        self.min_serving_value = torch.tensor(
            [normalization_parameters[f].min_value for f in sorted_features],
            device=self.device,
        )
        self.scaling_factor = torch.tensor(
            [
                (
                    normalization_parameters[f].max_value  # type: ignore
                    - normalization_parameters[f].min_value  # type: ignore
                )
                / (2 * (1 - EPS))
                for f in sorted_features
            ],
            device=self.device,
        )
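The constants stored above (min_serving_value and a scaling_factor of (max - min) / (2 * (1 - EPS))) suggest an affine map from the serving range [min_value, max_value] into roughly [-1, 1], shrunk by EPS at the endpoints. The forward pass itself is not shown, so the following sketch is an inference under assumed names, not the preprocessor's actual code.

import torch

EPS = 1e-6  # assumed value; the real module imports its own EPS constant

# Hypothetical transform implied by the stored constants: an action in
# [min_value, max_value] lands in [-(1 - EPS), 1 - EPS].
def to_unit_range(raw, min_serving_value, scaling_factor):
    return (raw - min_serving_value) / scaling_factor - (1.0 - EPS)

min_value = torch.tensor([0.0, -4.0])
max_value = torch.tensor([10.0, 4.0])
scaling_factor = (max_value - min_value) / (2 * (1 - EPS))

raw = torch.tensor([[0.0, 4.0], [10.0, 0.0]])
print(to_unit_range(raw, min_value, scaling_factor))
# approximately [[-1., 1.], [1., 0.]] (endpoints shrunk by EPS)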
def create_dqn_trainer_from_params(
    model: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
    metrics_to_score=None,
):
    metrics_to_score = metrics_to_score or []

    if model.rainbow.quantile:
        q_network = QuantileDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
    elif model.rainbow.categorical:
        q_network = CategoricalDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            qmin=model.rainbow.qmin,
            qmax=model.rainbow.qmax,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
            use_gpu=use_gpu,
        )
    elif model.rainbow.dueling_architecture:
        q_network = DuelingQNetwork(  # type: ignore
            layers=[get_num_output_features(normalization_parameters)]
            + model.training.layers[1:-1]
            + [len(model.actions)],
            activations=model.training.activations,
        )
    else:
        q_network = FullyConnectedDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()

    q_network_target = q_network.get_target_network()

    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    if model.evaluation.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(metrics_to_score) + 1) * len(model.actions)
        reward_network = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
        q_network_cpe = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

        if use_gpu and torch.cuda.is_available():
            reward_network.cuda()
            q_network_cpe.cuda()

        q_network_cpe_target = q_network_cpe.get_target_network()

    if (
        use_all_avail_gpus
        and not model.rainbow.categorical
        and not model.rainbow.quantile
    ):
        q_network = q_network.get_distributed_data_parallel_model()
        reward_network = (
            reward_network.get_distributed_data_parallel_model()
            if reward_network
            else None
        )
        q_network_cpe = (
            q_network_cpe.get_distributed_data_parallel_model()
            if q_network_cpe
            else None
        )

    if model.rainbow.quantile:
        assert (
            not use_all_avail_gpus
        ), "use_all_avail_gpus not implemented for distributional RL"
        return QRDQNTrainer(
            q_network,
            q_network_target,
            model,
            use_gpu,
            metrics_to_score=metrics_to_score,
        )
    elif model.rainbow.categorical:
        assert (
            not use_all_avail_gpus
        ), "use_all_avail_gpus not implemented for distributional RL"
        return C51Trainer(
            q_network,
            q_network_target,
            model,
            use_gpu,
            metrics_to_score=metrics_to_score,
        )
    else:
        return DQNTrainer(
            q_network,
            q_network_target,
            reward_network,
            model,
            use_gpu,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
            metrics_to_score=metrics_to_score,
        )
def __init__(
    self,
    parameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    min_action_range_tensor_serving: torch.Tensor,
    max_action_range_tensor_serving: torch.Tensor,
    use_gpu: bool = False,
    additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
    use_all_avail_gpus: bool = False,
) -> None:
    self.state_normalization_parameters = state_normalization_parameters
    self.action_normalization_parameters = action_normalization_parameters
    for param in self.action_normalization_parameters.values():
        assert param.feature_type == CONTINUOUS, (
            "DDPG Actor features must be set to continuous (set to "
            + param.feature_type
            + ")"
        )
    self.state_dim = get_num_output_features(state_normalization_parameters)
    self.action_dim = min_action_range_tensor_serving.shape[1]
    self.num_features = self.state_dim + self.action_dim

    # Actor generates actions between -1 and 1 due to tanh output layer so
    # convert actions to range [-1, 1] before training.
    self.min_action_range_tensor_training = torch.ones(1, self.action_dim) * -1
    self.max_action_range_tensor_training = torch.ones(1, self.action_dim)
    self.min_action_range_tensor_serving = min_action_range_tensor_serving
    self.max_action_range_tensor_serving = max_action_range_tensor_serving

    # Shared params
    self.warm_start_model_path = parameters.shared_training.warm_start_model_path
    self.minibatch_size = parameters.shared_training.minibatch_size
    self.final_layer_init = parameters.shared_training.final_layer_init
    self._set_optimizer(parameters.shared_training.optimizer)

    # Actor params
    self.actor_params = parameters.actor_training
    assert (
        self.actor_params.activations[-1] == "tanh"
    ), "Actor final layer activation must be tanh"
    self.actor_params.layers[0] = self.state_dim
    self.actor_params.layers[-1] = self.action_dim
    self.noise_generator = OrnsteinUhlenbeckProcessNoise(self.action_dim)
    self.actor = ActorNet(
        self.actor_params.layers,
        self.actor_params.activations,
        self.final_layer_init,
    )
    self.actor_target = deepcopy(self.actor)
    self.actor_optimizer = self.optimizer_func(
        self.actor.parameters(),
        lr=self.actor_params.learning_rate,
        weight_decay=self.actor_params.l2_decay,
    )
    self.noise = self.noise_generator

    # Critic params
    self.critic_params = parameters.critic_training
    self.critic_params.layers[0] = self.state_dim
    self.critic_params.layers[-1] = 1
    self.critic = self.q_network = CriticNet(
        self.critic_params.layers,
        self.critic_params.activations,
        self.final_layer_init,
        self.action_dim,
    )
    self.critic_target = deepcopy(self.critic)
    self.critic_optimizer = self.optimizer_func(
        self.critic.parameters(),
        lr=self.critic_params.learning_rate,
        weight_decay=self.critic_params.l2_decay,
    )

    # ensure state and action IDs have no intersection
    overlapping_features = set(state_normalization_parameters.keys()) & set(
        action_normalization_parameters.keys()
    )
    assert len(overlapping_features) == 0, (
        "There are some overlapping state and action features: "
        + str(overlapping_features)
    )

    RLTrainer.__init__(self, parameters, use_gpu, additional_feature_types, None)

    self.min_action_range_tensor_training = self.min_action_range_tensor_training.type(
        self.dtype
    )
    self.max_action_range_tensor_training = self.max_action_range_tensor_training.type(
        self.dtype
    )
    self.min_action_range_tensor_serving = self.min_action_range_tensor_serving.type(
        self.dtype
    )
    self.max_action_range_tensor_serving = self.max_action_range_tensor_serving.type(
        self.dtype
    )

    if self.use_gpu:
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

        if use_all_avail_gpus:
            self.actor = nn.DataParallel(self.actor)
            self.actor_target = nn.DataParallel(self.actor_target)
            self.critic = nn.DataParallel(self.critic)
            self.critic_target = nn.DataParallel(self.critic_target)
def get_modular_sarsa_trainer_reward_boost(
    self,
    environment,
    reward_shape,
    dueling,
    categorical,
    quantile,
    use_gpu=False,
    use_all_avail_gpus=False,
    clip_grad_norm=None,
):
    assert not quantile or not categorical
    parameters = self.get_sarsa_parameters(
        environment, reward_shape, dueling, categorical, quantile, clip_grad_norm
    )

    if quantile:
        if dueling:
            q_network = DuelingQuantileDQN(
                layers=[get_num_output_features(environment.normalization)]
                + parameters.training.layers[1:-1]
                + [len(environment.ACTIONS)],
                activations=parameters.training.activations,
                num_atoms=parameters.rainbow.num_atoms,
            )
        else:
            q_network = QuantileDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                num_atoms=parameters.rainbow.num_atoms,
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )
    elif categorical:
        assert not dueling
        q_network = CategoricalDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            num_atoms=parameters.rainbow.num_atoms,
            qmin=-100,
            qmax=200,
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
    else:
        if dueling:
            q_network = DuelingQNetwork(
                layers=[get_num_output_features(environment.normalization)]
                + parameters.training.layers[1:-1]
                + [len(environment.ACTIONS)],
                activations=parameters.training.activations,
            )
        else:
            q_network = FullyConnectedDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )

    q_network_cpe, q_network_cpe_target, reward_network = None, None, None
    if parameters.evaluation and parameters.evaluation.calc_cpe_in_training:
        q_network_cpe = FullyConnectedDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        q_network_cpe_target = q_network_cpe.get_target_network()
        reward_network = FullyConnectedDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )

    if use_gpu:
        q_network = q_network.cuda()
        if parameters.evaluation.calc_cpe_in_training:
            reward_network = reward_network.cuda()
            q_network_cpe = q_network_cpe.cuda()
            q_network_cpe_target = q_network_cpe_target.cuda()
        if use_all_avail_gpus and not categorical:
            q_network = q_network.get_distributed_data_parallel_model()
            reward_network = reward_network.get_distributed_data_parallel_model()
            q_network_cpe = q_network_cpe.get_distributed_data_parallel_model()
            q_network_cpe_target = (
                q_network_cpe_target.get_distributed_data_parallel_model()
            )

    if quantile:
        trainer = QRDQNTrainer(
            q_network,
            q_network.get_target_network(),
            parameters,
            use_gpu,
            reward_network=reward_network,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
        )
    elif categorical:
        trainer = C51Trainer(
            q_network, q_network.get_target_network(), parameters, use_gpu
        )
    else:
        parameters = DQNTrainerParameters.from_discrete_action_model_parameters(
            parameters
        )
        trainer = DQNTrainer(
            q_network,
            q_network.get_target_network(),
            reward_network,
            parameters,
            use_gpu,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
        )
    return trainer
def __init__(
    self,
    parameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    min_action_range_tensor_serving: torch.Tensor,
    max_action_range_tensor_serving: torch.Tensor,
    use_gpu: bool = False,
    additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
    use_all_avail_gpus: bool = False,
) -> None:
    self.state_normalization_parameters = state_normalization_parameters
    self.action_normalization_parameters = action_normalization_parameters
    self.state_dim = get_num_output_features(state_normalization_parameters)
    self.action_dim = min_action_range_tensor_serving.shape[1]

    # Actor generates actions between -1 and 1 due to tanh output layer so
    # convert actions to range [-1, 1] before training.
    self.min_action_range_tensor_training = torch.ones(1, self.action_dim) * -1
    self.max_action_range_tensor_training = torch.ones(1, self.action_dim)
    self.min_action_range_tensor_serving = min_action_range_tensor_serving
    self.max_action_range_tensor_serving = max_action_range_tensor_serving

    # Shared params
    self.warm_start_model_path = parameters.shared_training.warm_start_model_path
    self.minibatch_size = parameters.shared_training.minibatch_size
    self.final_layer_init = parameters.shared_training.final_layer_init
    self._set_optimizer(parameters.shared_training.optimizer)

    # Actor params
    self.actor_params = parameters.actor_training
    assert (
        self.actor_params.activations[-1] == "tanh"
    ), "Actor final layer activation must be tanh"
    self.actor_params.layers[0] = self.state_dim
    self.actor_params.layers[-1] = self.action_dim
    self.noise_generator = OrnsteinUhlenbeckProcessNoise(self.action_dim)
    self.actor = ActorNet(
        self.actor_params.layers,
        self.actor_params.activations,
        self.final_layer_init,
    )
    self.actor_target = deepcopy(self.actor)
    self.actor_optimizer = self.optimizer_func(
        self.actor.parameters(),
        lr=self.actor_params.learning_rate,
        weight_decay=self.actor_params.l2_decay,
    )
    self.noise = self.noise_generator

    # Critic params
    self.critic_params = parameters.critic_training
    self.critic_params.layers[0] = self.state_dim
    self.critic_params.layers[-1] = 1
    self.critic = CriticNet(
        self.critic_params.layers,
        self.critic_params.activations,
        self.final_layer_init,
        self.action_dim,
    )
    self.critic_target = deepcopy(self.critic)
    self.critic_optimizer = self.optimizer_func(
        self.critic.parameters(),
        lr=self.critic_params.learning_rate,
        weight_decay=self.critic_params.l2_decay,
    )

    RLTrainer.__init__(self, parameters, use_gpu, additional_feature_types, None)

    self.min_action_range_tensor_training = self.min_action_range_tensor_training.type(
        self.dtype
    )
    self.max_action_range_tensor_training = self.max_action_range_tensor_training.type(
        self.dtype
    )
    self.min_action_range_tensor_serving = self.min_action_range_tensor_serving.type(
        self.dtype
    )
    self.max_action_range_tensor_serving = self.max_action_range_tensor_serving.type(
        self.dtype
    )

    if self.use_gpu:
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

        if use_all_avail_gpus:
            self.actor = nn.DataParallel(self.actor)
            self.actor_target = nn.DataParallel(self.actor_target)
            self.critic = nn.DataParallel(self.critic)
            self.critic_target = nn.DataParallel(self.critic_target)
def __init__(
    self,
    parameters: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
    metrics_to_score=None,
    gradient_handler=None,
    use_all_avail_gpus: bool = False,
) -> None:
    self.double_q_learning = parameters.rainbow.double_q_learning
    self.warm_start_model_path = parameters.training.warm_start_model_path
    self.minibatch_size = parameters.training.minibatch_size
    self.state_normalization_parameters = state_normalization_parameters
    self.action_normalization_parameters = action_normalization_parameters
    self.num_state_features = get_num_output_features(
        state_normalization_parameters
    )
    self.num_action_features = get_num_output_features(
        action_normalization_parameters
    )
    self.num_features = self.num_state_features + self.num_action_features

    # ensure state and action IDs have no intersection
    overlapping_features = set(state_normalization_parameters.keys()) & set(
        action_normalization_parameters.keys()
    )
    assert len(overlapping_features) == 0, (
        "There are some overlapping state and action features: "
        + str(overlapping_features)
    )

    reward_network_layers = deepcopy(parameters.training.layers)
    reward_network_layers[0] = self.num_features
    reward_network_layers[-1] = 1

    if parameters.rainbow.dueling_architecture:
        parameters.training.layers[0] = self.num_state_features
        parameters.training.layers[-1] = 1
    elif parameters.training.factorization_parameters is None:
        parameters.training.layers[0] = self.num_features
        parameters.training.layers[-1] = 1
    else:
        parameters.training.factorization_parameters.state.layers[
            0
        ] = self.num_state_features
        parameters.training.factorization_parameters.action.layers[
            0
        ] = self.num_action_features

    RLTrainer.__init__(
        self,
        parameters,
        use_gpu,
        additional_feature_types,
        metrics_to_score,
        gradient_handler,
    )

    self.q_network = self._get_model(
        parameters.training, parameters.rainbow.dueling_architecture
    )
    self.q_network_target = deepcopy(self.q_network)
    self._set_optimizer(parameters.training.optimizer)
    self.q_network_optimizer = self.optimizer_func(
        self.q_network.parameters(),
        lr=parameters.training.learning_rate,
        weight_decay=parameters.training.l2_decay,
    )

    self.reward_network = FullyConnectedNetwork(
        reward_network_layers, parameters.training.activations
    )
    self.reward_network_optimizer = self.optimizer_func(
        self.reward_network.parameters(), lr=parameters.training.learning_rate
    )

    if self.use_gpu:
        self.q_network.cuda()
        self.q_network_target.cuda()
        self.reward_network.cuda()

        if use_all_avail_gpus:
            self.q_network = torch.nn.DataParallel(self.q_network)
            self.q_network_target = torch.nn.DataParallel(self.q_network_target)
            self.reward_network = torch.nn.DataParallel(self.reward_network)