Example #1
    def __init__(self, config, ob_space, ac_space, tanh_policy, deterministic=False, activation='relu', rl_hid_size=None, bias=None):
        super().__init__(config, ob_space, ac_space, tanh_policy)

        self._ac_space = ac_space
        self._bias = bias
        self._deterministic = deterministic
        if rl_hid_size is None:
            rl_hid_size = config.rl_hid_size

        # observation
        input_dim = observation_size(ob_space)

        self.fc = MLP(config, input_dim, rl_hid_size, [rl_hid_size]*config.actor_num_hid_layers, activation=activation)
        self.fc_means = nn.ModuleDict()
        self.fc_log_stds = nn.ModuleDict()

        for k, space in ac_space.spaces.items():
            if isinstance(space, spaces.Box):
                self.fc_means.update({k: MLP(config, rl_hid_size, action_size(space), activation=activation)})
                if not self._deterministic:
                    if config.algo == 'ppo':
                        self.fc_log_stds.update({k: AddBias(torch.zeros(action_size(space)))})
                    else:
                        self.fc_log_stds.update({k: MLP(config, rl_hid_size, action_size(space), activation=activation, bias=self._bias)})
            elif isinstance(space, spaces.Discrete):
                self.fc_means.update({k: MLP(config, rl_hid_size, space.n, activation=activation)})
            else:
                self.fc_means.update({k: MLP(config, rl_hid_size, space, activation=activation)})
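Every snippet in this listing leans on the `action_size` and `observation_size` helpers. For reference, here is a minimal sketch of what such helpers typically compute for `gym` spaces; the names match the examples, but the bodies below are assumptions, not the repository's actual implementation.

import numpy as np
from gym import spaces

def observation_size(ob_space):
    # Assumed behavior: a Dict space contributes the sum of its sub-spaces,
    # any other space contributes its flattened dimensionality.
    if isinstance(ob_space, spaces.Dict):
        return sum(observation_size(s) for s in ob_space.spaces.values())
    return int(np.prod(ob_space.shape))

def action_size(ac_space):
    # Assumed behavior: Discrete spaces contribute their number of choices,
    # Box spaces their flattened dimensionality, Dict spaces the sum.
    if isinstance(ac_space, spaces.Dict):
        return sum(action_size(s) for s in ac_space.spaces.values())
    if isinstance(ac_space, spaces.Discrete):
        return ac_space.n
    return int(np.prod(ac_space.shape))

Under this reading, a Dict action space holding a 3-dimensional Box and a Discrete(2) would report an action_size of 5, which is consistent with how the snippets size their output layers.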
Example #2
    def clear(self):
        self._idx = 0
        self._current_size = 0
        buffer_size = self._size
        num_processes = self._num_processes
        self._obs = {
            k: np.empty((buffer_size, num_processes,
                         observation_size(self._ob_space[k])))
            for k in self._ob_space.spaces.keys()
        }
        self._obs_next = {
            k: np.empty((buffer_size, num_processes,
                         observation_size(self._ob_space[k])))
            for k in self._ob_space.spaces.keys()
        }
        self._actions = {
            k: np.empty(
                (buffer_size, num_processes, action_size(self._ac_space[k])))
            for k in self._ac_space.spaces.keys()
        }
        self._ac_before_activation = {
            k: np.empty(
                (buffer_size, num_processes, action_size(self._ac_space[k])))
            for k in self._ac_space.spaces.keys()
        }
        self._rewards = np.empty((buffer_size, num_processes, 1))
        self._terminals = np.empty((buffer_size, num_processes, 1))
        self._vpreds = np.empty((buffer_size, num_processes, 1))
        self._adv = np.empty((buffer_size, num_processes, 1))
        self._ret = np.empty((buffer_size, num_processes, 1))
        self._log_prob = np.empty((buffer_size, num_processes, 1))
Example #3
    def __init__(self, config, num_processes, ob_space, ac_space):
        self._idx = 0
        self._current_size = 0
        self._config = config
        self._size = config.rollout_length
        buffer_size = self._size  # local alias used by the allocations below
        self._ob_space = ob_space
        self._ac_space = ac_space
        self._num_processes = num_processes

        self._obs = {
            k: np.empty(
                (buffer_size, num_processes, observation_size(ob_space[k])))
            for k in ob_space.spaces.keys()
        }
        self._obs_next = {
            k: np.empty(
                (buffer_size, num_processes, observation_size(ob_space[k])))
            for k in ob_space.spaces.keys()
        }
        self._actions = {
            k: np.empty((buffer_size, num_processes, action_size(ac_space[k])))
            for k in ac_space.spaces.keys()
        }
        self._ac_before_activation = {
            k: np.empty((buffer_size, num_processes, action_size(ac_space[k])))
            for k in ac_space.spaces.keys()
        }
        self._rewards = np.empty((buffer_size, num_processes, 1))
        self._terminals = np.empty((buffer_size, num_processes, 1))
        self._vpreds = np.empty((buffer_size, num_processes, 1))
        self._adv = np.empty((buffer_size, num_processes, 1))
        self._ret = np.empty((buffer_size, num_processes, 1))
        self._log_prob = np.empty((buffer_size, num_processes, 1))
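The rollout buffer above only preallocates storage. For context, a hedged sketch of the matching insert step follows, assuming a ring-buffer write keyed by `self._idx` that wraps at `self._size`; the array names mirror the allocations above, but the method and the rollout key names are assumptions.

    def add(self, rollout):
        # Sketch (assumption): write one step of per-process data into the
        # preallocated arrays, then advance the ring-buffer index.
        idx = self._idx
        for k in self._ob_space.spaces.keys():
            self._obs[k][idx] = rollout["ob"][k]
            self._obs_next[k][idx] = rollout["ob_next"][k]
        for k in self._ac_space.spaces.keys():
            self._actions[k][idx] = rollout["ac"][k]
            self._ac_before_activation[k][idx] = rollout["ac_before_activation"][k]
        self._rewards[idx] = rollout["rew"]
        self._terminals[idx] = rollout["done"]
        self._idx = (idx + 1) % self._size
        self._current_size = min(self._current_size + 1, self._size)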
Example #4
    def __init__(self, config, ob_space, ac_space,
                 actor, critic):
        super().__init__(config, ob_space)

        self._ob_space = ob_space
        self._ac_space = ac_space

        self._log_alpha = [torch.zeros(1, requires_grad=True, device=config.device)]
        self._alpha_optim = [optim.Adam([self._log_alpha[0]], lr=config.lr_actor)]

        self._actor = actor(self._config, self._ob_space,
                              self._ac_space, self._config.tanh_policy, deterministic=True)
        self._actor_target = actor(self._config, self._ob_space,
                              self._ac_space, self._config.tanh_policy, deterministic=True)
        self._actor_target.load_state_dict(self._actor.state_dict())
        self._critic = critic(config, ob_space, ac_space)
        self._critic_target = critic(config, ob_space, ac_space)
        self._critic_target.load_state_dict(self._critic.state_dict())

        self._network_cuda(config.device)

        self._actor_optim = optim.Adam(self._actor.parameters(), lr=config.lr_actor)
        self._critic_optim = optim.Adam(self._critic.parameters(), lr=config.lr_critic)

        sampler = RandomSampler()  # undefined in the original snippet; mirrors the construction in Example #13
        self._buffer = ReplayBuffer(config,
                                    sampler.sample_func,
                                    ob_space,
                                    ac_space)

        self._ounoise = OUNoise(action_size(ac_space))

        self._log_creation()
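The hard copies via `load_state_dict` above only initialize the target networks; during training they are usually tracked with a Polyak (soft) update. A minimal sketch, assuming a `tau` coefficient that is not taken from this repository's config:

    def _soft_update_target_network(self, target, source, tau=0.005):
        # Sketch (assumption): blend target parameters toward the online
        # network; tau=0.005 is a common default, not the repo's setting.
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)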
Example #5
    def __init__(self, config, ob_space, ac_space=None):
        super().__init__(config)

        self._ob_space = ob_space
        self._ac_space = ac_space
        self._activation_fn = nn.ReLU()


        input_shape = ob_space['default'].shape
        input_dim = input_shape[0]

        self.base = CNN(config, input_dim)

        self.aux_fc = nn.ModuleDict()
        out_size = config.encoder_feature_dim

        if ac_space is not None:
            out_size += action_size(ac_space)
        # Basically for subgoal inputs
        self._aux_keys = []
        for k, space in self._ob_space.spaces.items():
            if len(space.shape) == 1:
                self.aux_fc.update({k: MLP(config, observation_size(space), config.rl_hid_size, [config.rl_hid_size])})
                out_size += config.rl_hid_size
                self._aux_keys.append(k)

        self.fc = MLP(config, out_size, 1, [config.rl_hid_size]*2)
Example #6
    def __init__(
        self,
        config,
        ac_space,
        non_limited_idx=None,
        passive_joint_idx=[],
        ignored_contacts=[],
        planner_type=None,
        goal_bias=0.05,
        is_simplified=False,
        simplified_duration=0.1,
        range_=None,
    ):

        self._config = config
        self.planner = SamplingBasedPlanner(
            config,
            config._xml_path,
            action_size(ac_space),
            non_limited_idx,
            planner_type=planner_type,
            passive_joint_idx=passive_joint_idx,
            ignored_contacts=ignored_contacts,
            contact_threshold=config.contact_threshold,
            goal_bias=goal_bias,
            is_simplified=is_simplified,
            simplified_duration=simplified_duration,
            range_=range_,
        )

        self._is_simplified = is_simplified
        self._simplified_duration = simplified_duration
Example #7
    def __init__(self, config, ob_space, ac_space=None, activation='relu', rl_hid_size=None):
        super().__init__(config)

        input_dim = observation_size(ob_space)
        if ac_space is not None:
            input_dim += action_size(ac_space)

        if rl_hid_size is None:
            rl_hid_size = config.rl_hid_size

        self.fc = MLP(config, input_dim, 1, [rl_hid_size] * 2, activation=activation)
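Since `input_dim` above is the sum of the observation and action sizes, this critic's forward pass presumably concatenates both before the MLP. A hedged sketch of such a forward method, assuming dict-valued observations and actions as in the other examples:

    def forward(self, ob, ac=None):
        # Sketch (assumption): flatten and concatenate all observation (and
        # action) entries so the input width matches `input_dim` above.
        parts = list(ob.values())
        if ac is not None:
            parts.extend(ac.values())
        inp = torch.cat([x.flatten(start_dim=1) for x in parts], dim=-1)
        return self.fc(inp)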
Example #8
    def __init__(self, config, ob_space, ac_space, tanh_policy, deterministic=False):
        super().__init__(config, ob_space, ac_space, tanh_policy, deterministic)

        self._ac_space = ac_space
        self._ob_space = ob_space
        self._deterministic = deterministic

        # observation
        # Change this later
        input_shape = ob_space['default'].shape
        input_dim = input_shape[0]

        self.base = CNN(config, input_dim)

        self.aux_fc = nn.ModuleDict()
        out_size = self.base.output_size

        # Basically for subgoal inputs
        self._aux_keys = []
        for k, space in self._ob_space.spaces.items():
            if len(space.shape) == 1:
                self.aux_fc.update({k: MLP(config, observation_size(space), int(config.rl_hid_size/4))})
                out_size += int(config.rl_hid_size / 4)
                self._aux_keys.append(k)

        self.fc = MLP(config, config.encoder_feature_dim, config.rl_hid_size, [config.rl_hid_size], last_activation=True)
        self.fc_means = nn.ModuleDict()
        self.fc_log_stds = nn.ModuleDict()

        for k, space in self._ac_space.spaces.items():
            if isinstance(space, spaces.Box):
                self.fc_means.update({k: MLP(config, config.rl_hid_size, action_size(space))})
                if not self._deterministic:
                    self.fc_log_stds.update({k: MLP(config, config.rl_hid_size, action_size(space))})
            elif isinstance(space, spaces.Discrete):
                self.fc_means.update({k: MLP(config, config.rl_hid_size, space.n)})
            else:
                self.fc_means.update({k: MLP(config, config.rl_hid_size, space)})
Example #9
    def __init__(self, config, ob_space, ac_space, actor, critic):
        super().__init__(config, ob_space)

        self._ob_space = ob_space
        self._ac_space = ac_space

        self._log_alpha = torch.tensor(np.log(config.alpha),
                                       requires_grad=True,
                                       device=config.device)
        self._alpha_optim = optim.Adam([self._log_alpha], lr=config.lr_actor)

        # build up networks
        self._actor = actor(config, ob_space, ac_space, config.tanh_policy)
        self._critic1 = critic(config, ob_space, ac_space)
        self._critic2 = critic(config, ob_space, ac_space)

        self._target_entropy = -action_size(self._actor._ac_space)

        # build up target networks
        self._critic1_target = critic(config, ob_space, ac_space)
        self._critic2_target = critic(config, ob_space, ac_space)
        self._critic1_target.load_state_dict(self._critic1.state_dict())
        self._critic2_target.load_state_dict(self._critic2.state_dict())

        if config.policy == 'cnn':
            self._critic2.base.copy_conv_weights_from(self._critic1.base)
            self._actor.base.copy_conv_weights_from(self._critic1.base)

            if config.unsup_algo == 'curl':
                self._curl = CURL(config, ob_space, ac_space, self._critic1,
                                  self._critic1_target)
                self._encoder_optim = optim.Adam(
                    self._critic1.base.parameters(), lr=config.lr_encoder)
                self._cpc_optim = optim.Adam(self._curl.parameters(),
                                             lr=config.lr_encoder)

        self._network_cuda(config.device)

        self._actor_optim = optim.Adam(self._actor.parameters(),
                                       lr=config.lr_actor)
        self._critic1_optim = optim.Adam(self._critic1.parameters(),
                                         lr=config.lr_critic)
        self._critic2_optim = optim.Adam(self._critic2.parameters(),
                                         lr=config.lr_critic)

        self._buffer = ReplayBuffer(config, ob_space, ac_space)
Example #10
    def __init__(self, config, ob_space, ac_space):
        self._config = config
        self._size = config.buffer_size

        # memory management
        self._idx = 0
        self._current_size = 0

        # create the buffer to store info
        self._buffers = defaultdict(list)
        self._obs = {
            k: np.empty((self._size, *ob_space[k].shape))
            for k in ob_space.spaces.keys()
        }
        self._obs_next = {
            k: np.empty((self._size, *ob_space[k].shape))
            for k in ob_space.spaces.keys()
        }
        self._actions = {
            k: np.empty((self._size, action_size(ac_space[k])))
            for k in ac_space.spaces.keys()
        }
        self._rewards = np.empty((self._size, 1))
        self._terminals = np.empty((self._size, 1))
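To show how the arrays above are typically consumed, here is a hedged sketch of a uniform sampling step over the first `self._current_size` valid entries; the method name and the returned key names are assumptions.

    def sample(self, batch_size):
        # Sketch (assumption): draw indices uniformly from the filled portion
        # of the buffer and gather each field.
        idxs = np.random.randint(0, self._current_size, size=batch_size)
        return {
            "ob": {k: v[idxs] for k, v in self._obs.items()},
            "ob_next": {k: v[idxs] for k, v in self._obs_next.items()},
            "ac": {k: v[idxs] for k, v in self._actions.items()},
            "rew": self._rewards[idxs],
            "done": self._terminals[idxs],
        }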
Example #11
    def __init__(self, config):
        self._config = config
        self._is_chef = config.is_chef

        # create a new environment
        self._env = gym.make(config.env, **config.__dict__)
        self._env_eval = (gym.make(config.env,
                                   **copy.copy(config).__dict__)
                          if self._is_chef else None)
        self._config._xml_path = self._env.xml_path
        config.nq = self._env.sim.model.nq

        ob_space = self._env.observation_space
        ac_space = self._env.action_space
        joint_space = self._env.joint_space

        allowed_collision_pairs = []
        for manipulation_geom_id in self._env.manipulation_geom_ids:
            for geom_id in self._env.static_geom_ids:
                allowed_collision_pairs.append(
                    make_ordered_pair(manipulation_geom_id, geom_id))

        ignored_contact_geom_ids = []
        ignored_contact_geom_ids.extend(allowed_collision_pairs)
        config.ignored_contact_geom_ids = ignored_contact_geom_ids

        passive_joint_idx = list(range(len(self._env.sim.data.qpos)))
        for idx in self._env.ref_joint_pos_indexes:
            passive_joint_idx.remove(idx)
        config.passive_joint_idx = passive_joint_idx

        # get actor and critic networks
        actor, critic = get_actor_critic_by_name(config.policy)

        # build up networks
        non_limited_idx = np.where(
            self._env.sim.model.jnt_limited[:action_size(self._env.action_space
                                                         )] == 0)[0]
        meta_ac_space = joint_space

        sampler = None

        ll_ob_space = ob_space
        if config.mopa:
            if config.discrete_action:
                ac_space.spaces["ac_type"] = spaces.Discrete(2)

        if config.use_ik_target:
            if action_size(ac_space) == len(self._env.ref_joint_pos_indexes):
                ac_space = spaces.Dict([(
                    "default",
                    spaces.Box(
                        low=np.ones(len(self._env.min_world_size)) * -1,
                        high=np.ones(len(self._env.max_world_size)),
                        dtype=np.float32,
                    ),
                )])
                if len(self._env.min_world_size) == 3:
                    ac_space.spaces["quat"] = spaces.Box(low=np.ones(4) * -1,
                                                         high=np.ones(4),
                                                         dtype=np.float32)
            else:
                ac_space = spaces.Dict([
                    (
                        "default",
                        spaces.Box(low=np.ones(3) * -1,
                                   high=np.ones(3),
                                   dtype=np.float32),
                    ),
                    (
                        "quat",
                        spaces.Box(low=np.ones(4) * -1,
                                   high=np.ones(4),
                                   dtype=np.float32),
                    ),
                    (
                        "gripper",
                        spaces.Box(
                            low=np.array([-1.0]),
                            high=np.array([1.0]),
                            dtype=np.float32,
                        ),
                    ),
                ])

        ac_space.seed(config.seed)
        self._agent = get_agent_by_name(config.algo)(
            config,
            ob_space,
            ac_space,
            actor,
            critic,
            non_limited_idx,
            self._env.ref_joint_pos_indexes,
            self._env.joint_space,
            self._env._is_jnt_limited,
            self._env.jnt_indices,
        )

        self._agent._ac_space.seed(config.seed)

        self._runner = None
        if config.mopa:
            self._runner = MoPARolloutRunner(config, self._env, self._env_eval,
                                             self._agent)
        else:
            self._runner = RolloutRunner(config, self._env, self._env_eval,
                                         self._agent)

        # setup wandb
        if self._is_chef and self._config.is_train and self._config.wandb:
            exclude = ["device"]
            if config.debug:
                os.environ["WANDB_MODE"] = "dryrun"

            tags = [config.env, config.algo, config.reward_type]
            assert (config.entity is not None and config.project is not None
                    ), "Entity and Project name must be specified"

            wandb.init(
                resume=config.run_name,
                project=config.project,
                config={
                    k: v
                    for k, v in config.__dict__.items() if k not in exclude
                },
                dir=config.log_dir,
                entity=config.entity,
                notes=config.notes,
                tags=tags,
                group=config.group,
            )
Example #12
    def _update_network(self, transitions, step=0):
        info = {}

        # pre-process observations
        _to_tensor = lambda x: to_tensor(x, self._config.device)
        o, o_next = transitions["ob"], transitions["ob_next"]
        bs = len(transitions["done"])
        o = _to_tensor(o)
        o_next = _to_tensor(o_next)
        ac = _to_tensor(transitions["ac"])

        if "intra_steps" in transitions.keys(
        ) and self._config.use_smdp_update:
            intra_steps = _to_tensor(transitions["intra_steps"])

        done = _to_tensor(transitions["done"]).reshape(bs, 1)
        rew = _to_tensor(transitions["rew"]).reshape(bs, 1)

        actions_real, log_pi = self.act_log(o)
        alpha_loss = -(self._log_alpha.exp() *
                       (log_pi + self._target_entropy).detach()).mean()

        self._alpha_optim.zero_grad()
        alpha_loss.backward()
        self._alpha_optim.step()
        alpha = self._log_alpha.exp()
        info["alpha_loss"] = alpha_loss.cpu().item()
        info["entropy_alpha"] = alpha.cpu().item()

        # the actor loss
        entropy_loss = (alpha * log_pi).mean()
        actor_loss = -torch.min(self._critic1(o, actions_real),
                                self._critic2(o, actions_real)).mean()
        info["log_pi"] = log_pi.mean().cpu().item()
        info["entropy_loss"] = entropy_loss.cpu().item()
        info["actor_loss"] = actor_loss.cpu().item()
        actor_loss += entropy_loss

        # calculate the target Q value function
        with torch.no_grad():
            actions_next, log_pi_next = self.act_log(o_next)
            q_next_value1 = self._critic1_target(o_next, actions_next)
            q_next_value2 = self._critic2_target(o_next, actions_next)
            q_next_value = torch.min(q_next_value1,
                                     q_next_value2) - alpha * log_pi_next
            if self._config.use_smdp_update:
                target_q_value = (self._config.reward_scale * rew +
                                  (1 - done) *
                                  (self._config.discount_factor**
                                   (intra_steps + 1)) * q_next_value)
            else:
                target_q_value = (
                    self._config.reward_scale * rew +
                    (1 - done) * self._config.discount_factor * q_next_value)
            target_q_value = target_q_value.detach()

        # the q loss
        for k, space in self._ac_space.spaces.items():
            if isinstance(space, spaces.Discrete):
                ac[k] = (F.one_hot(ac[k].long(), action_size(
                    self._ac_space[k])).float().squeeze(1))
        real_q_value1 = self._critic1(o, ac)
        real_q_value2 = self._critic2(o, ac)
        critic1_loss = 0.5 * (target_q_value - real_q_value1).pow(2).mean()
        critic2_loss = 0.5 * (target_q_value - real_q_value2).pow(2).mean()

        info["min_target_q"] = target_q_value.min().cpu().item()
        info["target_q"] = target_q_value.mean().cpu().item()
        info["min_real1_q"] = real_q_value1.min().cpu().item()
        info["min_real2_q"] = real_q_value2.min().cpu().item()
        info["real1_q"] = real_q_value1.mean().cpu().item()
        info["real2_q"] = real_q_value2.mean().cpu().item()
        info["critic1_loss"] = critic1_loss.cpu().item()
        info["critic2_loss"] = critic2_loss.cpu().item()

        # update the actor
        self._actor_optim.zero_grad()
        actor_loss.backward()
        if self._config.is_mpi:
            sync_grads(self._actor)
        self._actor_optim.step()

        # update the critic
        self._critic1_optim.zero_grad()
        critic1_loss.backward()
        if self._config.is_mpi:
            sync_grads(self._critic1)
        self._critic1_optim.step()

        self._critic2_optim.zero_grad()
        critic2_loss.backward()
        if self._config.is_mpi:
            sync_grads(self._critic2)
        self._critic2_optim.step()

        if self._config.is_mpi:
            return mpi_average(info)
        else:
            return info
Example #13
    def __init__(
        self,
        config,
        ob_space,
        ac_space,
        actor,
        critic,
        non_limited_idx=None,
        ref_joint_pos_indexes=None,
        joint_space=None,
        is_jnt_limited=None,
        jnt_indices=None,
    ):
        super().__init__(config, ob_space)

        self._ob_space = ob_space
        self._ac_space = ac_space
        self._jnt_indices = jnt_indices
        self._ref_joint_pos_indexes = ref_joint_pos_indexes
        self._log_alpha = torch.tensor(np.log(config.alpha),
                                       requires_grad=True,
                                       device=config.device)
        self._alpha_optim = optim.Adam([self._log_alpha], lr=config.lr_actor)
        self._joint_space = joint_space
        self._is_jnt_limited = is_jnt_limited
        if joint_space is not None:
            self._jnt_minimum = joint_space["default"].low
            self._jnt_maximum = joint_space["default"].high

        # build up networks
        self._build_actor(actor)
        self._build_critic(critic)
        self._network_cuda(config.device)

        self._target_entropy = -action_size(self._actor._ac_space)

        self._actor_optim = optim.Adam(self._actor.parameters(),
                                       lr=config.lr_actor)
        self._critic1_optim = optim.Adam(self._critic1.parameters(),
                                         lr=config.lr_critic)
        self._critic2_optim = optim.Adam(self._critic2.parameters(),
                                         lr=config.lr_critic)

        sampler = RandomSampler()
        buffer_keys = ["ob", "ac", "meta_ac", "done", "rew"]
        if config.mopa or config.expand_ac_space:
            buffer_keys.append("intra_steps")
        self._buffer = ReplayBuffer(buffer_keys, config.buffer_size,
                                    sampler.sample_func)

        self._log_creation()

        self._planner = None
        self._is_planner_initialized = False
        if config.mopa:
            self._planner = PlannerAgent(
                config,
                ac_space,
                non_limited_idx,
                planner_type=config.planner_type,
                passive_joint_idx=config.passive_joint_idx,
                ignored_contacts=config.ignored_contact_geom_ids,
                is_simplified=config.is_simplified,
                simplified_duration=config.simplified_duration,
                range_=config.range,
            )
            self._simple_planner = PlannerAgent(
                config,
                ac_space,
                non_limited_idx,
                planner_type=config.simple_planner_type,
                passive_joint_idx=config.passive_joint_idx,
                ignored_contacts=config.ignored_contact_geom_ids,
                goal_bias=1.0,
                is_simplified=config.simple_planner_simplified,
                simplified_duration=config.simple_planner_simplified_duration,
                range_=config.simple_planner_range,
            )
            self._omega = config.omega