def __init__(self, kwargs):
    super().__init__()

    self.env_name = str(kwargs["env_name"])
    self.device = str(kwargs["device"])
    self.state_dim = int(kwargs["state_dim"])
    self.action_dim = int(kwargs["action_dim"])
    self.solved_reward = float(kwargs["solved_reward"])
    # for gym compatibility
    self._max_episode_steps = int(kwargs["max_steps"])
    self.action_space = kwargs["action_space"]
    self.observation_space = kwargs["observation_space"]
    self.reset_env = kwargs["reset_env"]

    # learned dynamics: predict next state, reward and done flag from (state, action)
    self.state_net = build_nn_from_config(input_dim=self.state_dim + self.action_dim,
                                          output_dim=self.state_dim,
                                          nn_config=kwargs).to(self.device)
    self.reward_net = build_nn_from_config(input_dim=self.state_dim + self.action_dim,
                                           output_dim=1,
                                           nn_config=kwargs).to(self.device)
    self.done_net = build_nn_from_config(input_dim=self.state_dim + self.action_dim,
                                         output_dim=1,
                                         nn_config=kwargs).to(self.device)

    self.state = self.reset()
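# Hedged usage sketch (assumption, not from the original file): a step() of this
# learned environment would plausibly feed the concatenated (state, action)
# vector through state_net, reward_net and done_net. The method name and the
# thresholding of the done logit are illustrative assumptions.
def _sketch_step(self, action):
    import torch
    x = torch.cat((self.state, action), dim=-1).to(self.device)
    next_state = self.state_net(x)                    # predicted next state
    reward = self.reward_net(x)                       # predicted reward
    done = torch.sigmoid(self.done_net(x)) > 0.5      # predicted episode end
    self.state = next_state
    return next_state, reward, done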
def __init__(self, state_dim, action_dim, max_action, agent_name, config):
    super().__init__()

    self.net = build_nn_from_config(input_dim=state_dim,
                                    output_dim=action_dim,
                                    nn_config=config["agents"][agent_name])
    self.max_action = max_action
def build_reward_net(self, kwargs):
    # 0: original reward
    # 1: potential function (exclusive)
    # 2: potential function (additive)
    # 3: potential function with additional info vector (exclusive)
    # 4: potential function with additional info vector (additive)
    # 5: non-potential function (exclusive)
    # 6: non-potential function (additive)
    # 7: non-potential function with additional info vector (exclusive)
    # 8: non-potential function with additional info vector (additive)
    # 101: weighted info vector as baseline (exclusive)
    # 102: weighted info vector as baseline (additive)
    if self.reward_env_type < 100:
        if self.reward_env_type == 0:
            input_dim = 1  # dummy dimension
        elif self.reward_env_type in (1, 2, 5, 6):
            input_dim = self.state_dim
        elif self.reward_env_type in (3, 4, 7, 8):
            input_dim = self.state_dim + self.info_dim
        else:
            raise NotImplementedError('Unknown reward_env_type: ' + str(self.reward_env_type))

        return build_nn_from_config(input_dim=input_dim,
                                    output_dim=1,
                                    nn_config=kwargs).to(self.device)
    else:
        if self.reward_env_type in (101, 102):
            # weighted info vector as baseline: a single bias-free linear layer
            return nn.Linear(self.info_dim, 1, bias=False).to(self.device)
        else:
            raise NotImplementedError('Unknown reward_env_type: ' + str(self.reward_env_type))
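# Hedged sketch (assumption): for reward_env_type 101/102 the returned module is a
# bias-free nn.Linear, so the shaped reward is just a learned weighted sum of the
# info vector; the "additive" variant (102) would add it to the original reward,
# the "exclusive" variant (101) would replace it. The function and variable names
# below are illustrative, not taken from the original code.
def _sketch_weighted_info_reward(reward_net, original_reward, info_vector, additive):
    shaped = reward_net(info_vector)   # learned weighted sum of info features
    return original_reward + shaped if additive else shaped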
def __init__(self, state_dim, action_dim, agent_name, config): super().__init__() self.net = build_nn_from_config(input_dim=state_dim, output_dim=action_dim, nn_config=config["agents"][agent_name]) self.action_std = torch.nn.Parameter( torch.ones(action_dim, device=config["device"]) * config["agents"][agent_name]["action_std"])
def __init__(self, state_dim, action_dim, max_action, agent_name, config): super().__init__() self.net = build_nn_from_config(input_dim=state_dim, output_dim=action_dim, nn_config=config["agents"][agent_name]) self.max_action = max_action # gumbel_softmax_temp = config["agents"][agent_name]["gumbel_softmax_temp"] # self.gumbel_softmax_temp = torch.nn.Parameter(torch.tensor(gumbel_softmax_temp), requires_grad=True) self.gumbel_softmax_temp = config["agents"][agent_name][ "gumbel_softmax_temp"] self.gumbel_softmax_hard = config["agents"][agent_name][ "gumbel_softmax_hard"]
def __init__(self, state_dim, action_dim, max_action, agent_name, config):
    super().__init__()

    # trunk network; it must output action_dim features so that the mu/log_std
    # heads below (action_dim -> action_dim) can consume them
    # (the original output_dim=1 would not match the head input size)
    self.net = build_nn_from_config(input_dim=state_dim,
                                    output_dim=action_dim,
                                    nn_config=config["agents"][agent_name])
    self.output_limit = max_action
    self.log_std_min = config["agents"][agent_name]['log_std_min']
    self.log_std_max = config["agents"][agent_name]['log_std_max']

    # Set output layers
    self.mu_layer = nn.Linear(action_dim, action_dim)
    self.log_std_layer = nn.Linear(action_dim, action_dim)
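# Hedged sketch (assumption): a typical forward pass for this squashed-Gaussian
# actor. log_std is clamped to [log_std_min, log_std_max], an action is sampled
# with the reparameterization trick, squashed with tanh, then scaled by
# output_limit. The method name is illustrative; the original forward() is not shown.
def _sketch_gaussian_forward(self, state):
    import torch
    from torch.distributions import Normal
    h = self.net(state)
    mu = self.mu_layer(h)
    log_std = torch.clamp(self.log_std_layer(h), self.log_std_min, self.log_std_max)
    dist = Normal(mu, log_std.exp())
    raw_action = dist.rsample()                        # reparameterized sample
    return torch.tanh(raw_action) * self.output_limit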
def __init__(self, state_dim, action_dim, agent_name, config):
    super().__init__()

    # shared feature stream
    self.feature_stream = build_nn_from_config(
        input_dim=state_dim,
        output_dim=config["agents"][agent_name]["feature_dim"],
        nn_config=config["agents"][agent_name])

    # value and advantage heads use a single hidden layer of size feature_dim
    heads_config = copy.copy(config["agents"][agent_name])
    heads_config["hidden_layer"] = 1
    heads_config["hidden_size"] = config["agents"][agent_name]["feature_dim"]

    self.value_stream = build_nn_from_config(
        input_dim=config["agents"][agent_name]["feature_dim"],
        output_dim=1,
        nn_config=heads_config)
    self.advantage_stream = build_nn_from_config(
        input_dim=config["agents"][agent_name]["feature_dim"],
        output_dim=action_dim,
        nn_config=heads_config)
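# Hedged sketch (assumption): the dueling aggregation that usually goes with this
# value/advantage split, Q(s, a) = V(s) + A(s, a) - mean_a A(s, a). The method
# name is illustrative; the original forward() is not shown here.
def _sketch_dueling_forward(self, state):
    features = self.feature_stream(state)
    value = self.value_stream(features)
    advantage = self.advantage_stream(features)
    return value + advantage - advantage.mean(dim=-1, keepdim=True)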
def __init__(
    self,
    state_dim,
    action_dim,
    has_discrete_actions,
    feature_dim=64,
    hidden_size=128,
):
    super(ICMModel, self).__init__()

    self.state_dim = state_dim
    self.action_dim = action_dim
    self.feature_dim = feature_dim
    self.has_discrete_actions = has_discrete_actions

    nn_features_config = {
        'hidden_size': hidden_size,
        'hidden_layer': 2,
        'activation_fn': "leakyrelu"
    }
    nn_inverse_config = {
        'hidden_size': hidden_size,
        'hidden_layer': 2,
        'activation_fn': "relu"
    }
    nn_forward_pre_config = {
        'hidden_size': hidden_size,
        'hidden_layer': 2,
        'activation_fn': "leakyrelu"
    }
    nn_forward_post_config = {
        'hidden_size': hidden_size,
        'hidden_layer': 1,
        'activation_fn': "leakyrelu"
    }

    # a binary discrete action can be encoded in a single dimension
    if self.has_discrete_actions and self.action_dim == 2:
        action_dim = 1

    # phi(s): embeds states into feature space
    self.features_model = build_nn_from_config(input_dim=state_dim,
                                               output_dim=feature_dim,
                                               nn_config=nn_features_config)
    # inverse model: predicts the action from phi(s) and phi(s')
    self.inverse_model = build_nn_from_config(input_dim=feature_dim * 2,
                                              output_dim=action_dim,
                                              nn_config=nn_inverse_config)
    # forward model: predicts phi(s') from phi(s) and the action
    self.forward_pre_model = build_nn_from_config(
        input_dim=action_dim + feature_dim,
        output_dim=feature_dim,
        nn_config=nn_forward_pre_config)

    class ResidualBlock(nn.Module):
        def __init__(self, input_dim, output_dim):
            super().__init__()
            self.fc1 = nn.Sequential(
                nn.Linear(input_dim, output_dim),
                nn.LeakyReLU(inplace=True),
            )
            self.fc2 = nn.Sequential(nn.Linear(input_dim, output_dim))

        def forward(self, feature, action):
            x = feature
            x = self.fc1(torch.cat([x, action], dim=1))
            x = self.fc2(torch.cat([x, action], dim=1))
            return feature + x

    # original implementation uses residual blocks:
    # https://github.com/openai/large-scale-curiosity/blob/master/dynamics.py#L55-L61
    self.residual_block1 = ResidualBlock(input_dim=action_dim + feature_dim, output_dim=feature_dim)
    self.residual_block2 = ResidualBlock(input_dim=action_dim + feature_dim, output_dim=feature_dim)
    self.residual_block3 = ResidualBlock(input_dim=action_dim + feature_dim, output_dim=feature_dim)
    self.residual_block4 = ResidualBlock(input_dim=action_dim + feature_dim, output_dim=feature_dim)

    self.forward_post_model = build_nn_from_config(
        input_dim=feature_dim,
        output_dim=feature_dim,
        nn_config=nn_forward_post_config)
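# Hedged sketch (assumption): how the ICM sub-networks are typically combined.
# phi(s) and phi(s') come from features_model, the inverse model predicts the
# action from both embeddings, and the forward path (pre-model, four residual
# blocks, post-model) predicts phi(s') from phi(s) and the action; the forward
# prediction error then serves as the intrinsic (curiosity) reward. The method
# name is illustrative; the original forward() is not shown here.
def _sketch_icm_forward(self, state, next_state, action):
    import torch
    phi = self.features_model(state)
    phi_next = self.features_model(next_state)

    # inverse model: predict the action from the two embeddings
    action_pred = self.inverse_model(torch.cat([phi, phi_next], dim=1))

    # forward model: predict the next embedding from (phi, action)
    x = self.forward_pre_model(torch.cat([phi, action], dim=1))
    x = self.residual_block1(x, action)
    x = self.residual_block2(x, action)
    x = self.residual_block3(x, action)
    x = self.residual_block4(x, action)
    phi_next_pred = self.forward_post_model(x)

    return action_pred, phi_next_pred, phi_next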