def __init__(self, config, ob_space, ac_space, tanh_policy, deterministic=False,
             activation='relu', rl_hid_size=None, bias=None):
    super().__init__(config, ob_space, ac_space, tanh_policy)

    self._ac_space = ac_space
    self._bias = bias
    self._deterministic = deterministic
    if rl_hid_size is None:
        rl_hid_size = config.rl_hid_size

    # observation
    input_dim = observation_size(ob_space)

    self.fc = MLP(config, input_dim, rl_hid_size,
                  [rl_hid_size] * config.actor_num_hid_layers,
                  activation=activation)
    self.fc_means = nn.ModuleDict()
    self.fc_log_stds = nn.ModuleDict()

    for k, space in ac_space.spaces.items():
        if isinstance(space, spaces.Box):
            self.fc_means.update({k: MLP(config, rl_hid_size, action_size(space),
                                         activation=activation)})
            if not self._deterministic:
                if config.algo == 'ppo':
                    self.fc_log_stds.update({k: AddBias(torch.zeros(action_size(space)))})
                else:
                    self.fc_log_stds.update({k: MLP(config, rl_hid_size, action_size(space),
                                                    activation=activation, bias=self._bias)})
        elif isinstance(space, spaces.Discrete):
            self.fc_means.update({k: MLP(config, rl_hid_size, space.n,
                                         activation=activation)})
        else:
            self.fc_means.update({k: MLP(config, rl_hid_size, space,
                                         activation=activation)})
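# For each Box action key above, the fc_means / fc_log_stds heads parameterize a
# Gaussian that is typically tanh-squashed when tanh_policy is set. The following is
# a minimal, standalone sketch of that sampling step, assuming plain
# torch.distributions and hypothetical tensor shapes (it is not this repo's act_log
# implementation).
import torch
from torch.distributions import Normal

def sample_tanh_gaussian(mean, log_std):
    """Illustrative only: draw a tanh-squashed Gaussian action and its log-prob."""
    std = log_std.exp()
    dist = Normal(mean, std)
    z = dist.rsample()                 # reparameterized sample, keeps gradients
    action = torch.tanh(z)             # squash into [-1, 1]
    # change-of-variables correction for the tanh squashing
    log_prob = dist.log_prob(z) - torch.log(1 - action.pow(2) + 1e-6)
    return action, log_prob.sum(-1, keepdim=True)

# usage: a batch of 1 with a 4-dimensional action space
action, log_prob = sample_tanh_gaussian(torch.zeros(1, 4), torch.full((1, 4), -1.0))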
def clear(self):
    self._idx = 0
    self._current_size = 0
    buffer_size = self._size
    num_processes = self._num_processes

    self._obs = {
        k: np.empty((buffer_size, num_processes, observation_size(self._ob_space[k])))
        for k in self._ob_space.spaces.keys()
    }
    self._obs_next = {
        k: np.empty((buffer_size, num_processes, observation_size(self._ob_space[k])))
        for k in self._ob_space.spaces.keys()
    }
    self._actions = {
        k: np.empty((buffer_size, num_processes, action_size(self._ac_space[k])))
        for k in self._ac_space.spaces.keys()
    }
    self._ac_before_activation = {
        k: np.empty((buffer_size, num_processes, action_size(self._ac_space[k])))
        for k in self._ac_space.spaces.keys()
    }
    self._rewards = np.empty((buffer_size, num_processes, 1))
    self._terminals = np.empty((buffer_size, num_processes, 1))
    self._vpreds = np.empty((buffer_size, num_processes, 1))
    self._adv = np.empty((buffer_size, num_processes, 1))
    self._ret = np.empty((buffer_size, num_processes, 1))
    self._log_prob = np.empty((buffer_size, num_processes, 1))
def __init__(self, config, num_processes, ob_space, ac_space):
    self._idx = 0
    self._current_size = 0
    self._config = config
    self._size = config.rollout_length
    self._ob_space = ob_space
    self._ac_space = ac_space
    self._num_processes = num_processes
    buffer_size = self._size

    self._obs = {
        k: np.empty((buffer_size, num_processes, observation_size(ob_space[k])))
        for k in ob_space.spaces.keys()
    }
    self._obs_next = {
        k: np.empty((buffer_size, num_processes, observation_size(ob_space[k])))
        for k in ob_space.spaces.keys()
    }
    self._actions = {
        k: np.empty((buffer_size, num_processes, action_size(ac_space[k])))
        for k in ac_space.spaces.keys()
    }
    self._ac_before_activation = {
        k: np.empty((buffer_size, num_processes, action_size(ac_space[k])))
        for k in ac_space.spaces.keys()
    }
    self._rewards = np.empty((buffer_size, num_processes, 1))
    self._terminals = np.empty((buffer_size, num_processes, 1))
    self._vpreds = np.empty((buffer_size, num_processes, 1))
    self._adv = np.empty((buffer_size, num_processes, 1))
    self._ret = np.empty((buffer_size, num_processes, 1))
    self._log_prob = np.empty((buffer_size, num_processes, 1))
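# Once a rollout of length rollout_length is collected, the _adv and _ret buffers
# allocated above are usually filled with generalized advantage estimates. The sketch
# below is a standalone illustration of GAE-lambda over (T, num_processes, 1) arrays,
# assuming terminals[t] marks the end of an episode at step t and hypothetical
# gamma/lam arguments; it is not this repo's advantage routine.
import numpy as np

def compute_gae(rewards, values, last_value, terminals, gamma=0.99, lam=0.95):
    """Illustrative GAE-lambda; inputs shaped (T, N, 1) except last_value (N, 1)."""
    T = rewards.shape[0]
    adv = np.zeros_like(rewards)
    last_gae = 0.0
    for t in reversed(range(T)):
        next_value = last_value if t == T - 1 else values[t + 1]
        nonterminal = 1.0 - terminals[t]       # no bootstrapping past a terminal step
        delta = rewards[t] + gamma * next_value * nonterminal - values[t]
        last_gae = delta + gamma * lam * nonterminal * last_gae
        adv[t] = last_gae
    ret = adv + values                          # returns used as the value target
    return adv, ret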
def __init__(self, config, ob_space, ac_space, actor, critic):
    super().__init__(config, ob_space)

    self._ob_space = ob_space
    self._ac_space = ac_space

    self._log_alpha = [torch.zeros(1, requires_grad=True, device=config.device)]
    self._alpha_optim = [optim.Adam([self._log_alpha[0]], lr=config.lr_actor)]

    # build up networks and their targets
    self._actor = actor(self._config, self._ob_space, self._ac_space,
                        self._config.tanh_policy, deterministic=True)
    self._actor_target = actor(self._config, self._ob_space, self._ac_space,
                               self._config.tanh_policy, deterministic=True)
    self._actor_target.load_state_dict(self._actor.state_dict())
    self._critic = critic(config, ob_space, ac_space)
    self._critic_target = critic(config, ob_space, ac_space)
    self._critic_target.load_state_dict(self._critic.state_dict())
    self._network_cuda(config.device)

    self._actor_optim = optim.Adam(self._actor.parameters(), lr=config.lr_actor)
    self._critic_optim = optim.Adam(self._critic.parameters(), lr=config.lr_critic)

    # transition sampler for the replay buffer
    sampler = RandomSampler()
    self._buffer = ReplayBuffer(config, sampler.sample_func, ob_space, ac_space)

    self._ounoise = OUNoise(action_size(ac_space))

    self._log_creation()
def __init__(self, config, ob_space, ac_space=None):
    super().__init__(config)

    self._ob_space = ob_space
    self._ac_space = ac_space
    self._activation_fn = nn.ReLU()

    input_shape = ob_space['default'].shape
    input_dim = input_shape[0]

    self.base = CNN(config, input_dim)
    self.aux_fc = nn.ModuleDict()
    out_size = config.encoder_feature_dim
    if ac_space is not None:
        out_size += action_size(ac_space)

    # auxiliary low-dimensional inputs (basically the subgoal)
    self._aux_keys = []
    for k, space in self._ob_space.spaces.items():
        if len(space.shape) == 1:
            self.aux_fc.update({k: MLP(config, observation_size(space),
                                       config.rl_hid_size, [config.rl_hid_size])})
            out_size += config.rl_hid_size
            self._aux_keys.append(k)

    self.fc = MLP(config, out_size, 1, [config.rl_hid_size] * 2)
def __init__(
    self,
    config,
    ac_space,
    non_limited_idx=None,
    passive_joint_idx=[],
    ignored_contacts=[],
    planner_type=None,
    goal_bias=0.05,
    is_simplified=False,
    simplified_duration=0.1,
    range_=None,
):
    self._config = config
    self.planner = SamplingBasedPlanner(
        config,
        config._xml_path,
        action_size(ac_space),
        non_limited_idx,
        planner_type=planner_type,
        passive_joint_idx=passive_joint_idx,
        ignored_contacts=ignored_contacts,
        contact_threshold=config.contact_threshold,
        goal_bias=goal_bias,
        is_simplified=is_simplified,
        simplified_duration=simplified_duration,
        range_=range_,
    )
    self._is_simplified = is_simplified
    self._simplified_duration = simplified_duration
def __init__(self, config, ob_space, ac_space=None, activation='relu', rl_hid_size=None):
    super().__init__(config)

    input_dim = observation_size(ob_space)
    if ac_space is not None:
        input_dim += action_size(ac_space)
    if rl_hid_size is None:
        rl_hid_size = config.rl_hid_size

    self.fc = MLP(config, input_dim, 1, [rl_hid_size] * 2, activation=activation)
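# The critic above maps a (state, action) pair to a single scalar Q-value by
# concatenating the flattened observation and action into one input vector. A
# standalone sketch of the same pattern with plain torch modules and hypothetical
# sizes (not this repo's MLP class):
import torch
import torch.nn as nn

ob_dim, ac_dim, hid = 32, 7, 256
q_net = nn.Sequential(
    nn.Linear(ob_dim + ac_dim, hid), nn.ReLU(),
    nn.Linear(hid, hid), nn.ReLU(),
    nn.Linear(hid, 1),                          # single Q-value output
)

ob = torch.randn(8, ob_dim)                     # batch of 8 observations
ac = torch.randn(8, ac_dim)                     # batch of 8 actions
q = q_net(torch.cat([ob, ac], dim=-1))          # shape (8, 1)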
def __init__(self, config, ob_space, ac_space, tanh_policy, deterministic=False):
    super().__init__(config, ob_space, ac_space, tanh_policy, deterministic)

    self._ac_space = ac_space
    self._ob_space = ob_space
    self._deterministic = deterministic

    # observation
    # Change this later
    input_shape = ob_space['default'].shape
    input_dim = input_shape[0]

    self.base = CNN(config, input_dim)
    self.aux_fc = nn.ModuleDict()
    out_size = self.base.output_size

    # auxiliary low-dimensional inputs (basically the subgoal)
    self._aux_keys = []
    for k, space in self._ob_space.spaces.items():
        if len(space.shape) == 1:
            self.aux_fc.update({k: MLP(config, observation_size(space),
                                       config.rl_hid_size // 4)})
            out_size += config.rl_hid_size // 4
            self._aux_keys.append(k)

    self.fc = MLP(config, config.encoder_feature_dim, config.rl_hid_size,
                  [config.rl_hid_size], last_activation=True)
    self.fc_means = nn.ModuleDict()
    self.fc_log_stds = nn.ModuleDict()

    for k, space in self._ac_space.spaces.items():
        if isinstance(space, spaces.Box):
            self.fc_means.update({k: MLP(config, config.rl_hid_size, action_size(space))})
            if not self._deterministic:
                self.fc_log_stds.update({k: MLP(config, config.rl_hid_size,
                                                action_size(space))})
        elif isinstance(space, spaces.Discrete):
            self.fc_means.update({k: MLP(config, config.rl_hid_size, space.n)})
        else:
            self.fc_means.update({k: MLP(config, config.rl_hid_size, space)})
def __init__(self, config, ob_space, ac_space, actor, critic):
    super().__init__(config, ob_space)

    self._ob_space = ob_space
    self._ac_space = ac_space

    self._log_alpha = torch.tensor(np.log(config.alpha), requires_grad=True,
                                   device=config.device)
    self._alpha_optim = optim.Adam([self._log_alpha], lr=config.lr_actor)

    # build up networks
    self._actor = actor(config, ob_space, ac_space, config.tanh_policy)
    self._critic1 = critic(config, ob_space, ac_space)
    self._critic2 = critic(config, ob_space, ac_space)
    self._target_entropy = -action_size(self._actor._ac_space)

    # build up target networks
    self._critic1_target = critic(config, ob_space, ac_space)
    self._critic2_target = critic(config, ob_space, ac_space)
    self._critic1_target.load_state_dict(self._critic1.state_dict())
    self._critic2_target.load_state_dict(self._critic2.state_dict())

    if config.policy == 'cnn':
        # share the conv encoder between the critics and the actor
        self._critic2.base.copy_conv_weights_from(self._critic1.base)
        self._actor.base.copy_conv_weights_from(self._critic1.base)
        if config.unsup_algo == 'curl':
            self._curl = CURL(config, ob_space, ac_space, self._critic1,
                              self._critic1_target)
            self._encoder_optim = optim.Adam(self._critic1.base.parameters(),
                                             lr=config.lr_encoder)
            self._cpc_optim = optim.Adam(self._curl.parameters(), lr=config.lr_encoder)

    self._network_cuda(config.device)

    self._actor_optim = optim.Adam(self._actor.parameters(), lr=config.lr_actor)
    self._critic1_optim = optim.Adam(self._critic1.parameters(), lr=config.lr_critic)
    self._critic2_optim = optim.Adam(self._critic2.parameters(), lr=config.lr_critic)

    self._buffer = ReplayBuffer(config, ob_space, ac_space)
def __init__(self, config, ob_space, ac_space):
    self._config = config
    self._size = config.buffer_size

    # memory management
    self._idx = 0
    self._current_size = 0

    # create the buffer to store info
    self._buffers = defaultdict(list)
    self._obs = {
        k: np.empty((self._size, *ob_space[k].shape))
        for k in ob_space.spaces.keys()
    }
    self._obs_next = {
        k: np.empty((self._size, *ob_space[k].shape))
        for k in ob_space.spaces.keys()
    }
    self._actions = {
        k: np.empty((self._size, action_size(ac_space[k])))
        for k in ac_space.spaces.keys()
    }
    self._rewards = np.empty((self._size, 1))
    self._terminals = np.empty((self._size, 1))
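# The arrays above form a fixed-size ring buffer: _idx wraps around _size while
# _current_size saturates at _size, so old transitions are overwritten once the
# buffer is full. A minimal standalone sketch of that index arithmetic for a single
# array, with hypothetical store/sample helpers (not this repo's store_episode or
# sample methods):
import numpy as np

class MiniRingBuffer:
    def __init__(self, size, dim):
        self._size = size
        self._idx = 0
        self._current_size = 0
        self._data = np.empty((size, dim))

    def store(self, x):
        self._data[self._idx] = x
        self._idx = (self._idx + 1) % self._size                  # wrap and overwrite oldest
        self._current_size = min(self._current_size + 1, self._size)

    def sample(self, batch_size):
        idxs = np.random.randint(0, self._current_size, batch_size)
        return self._data[idxs]

buf = MiniRingBuffer(size=1000, dim=4)
for _ in range(10):
    buf.store(np.random.randn(4))
batch = buf.sample(5)                                             # shape (5, 4)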
def __init__(self, config):
    self._config = config
    self._is_chef = config.is_chef

    # create a new environment
    self._env = gym.make(config.env, **config.__dict__)
    self._env_eval = (gym.make(config.env, **copy.copy(config).__dict__)
                      if self._is_chef else None)
    self._config._xml_path = self._env.xml_path
    config.nq = self._env.sim.model.nq

    ob_space = self._env.observation_space
    ac_space = self._env.action_space
    joint_space = self._env.joint_space

    allowed_collision_pairs = []
    for manipulation_geom_id in self._env.manipulation_geom_ids:
        for geom_id in self._env.static_geom_ids:
            allowed_collision_pairs.append(
                make_ordered_pair(manipulation_geom_id, geom_id))

    ignored_contact_geom_ids = []
    ignored_contact_geom_ids.extend(allowed_collision_pairs)
    config.ignored_contact_geom_ids = ignored_contact_geom_ids

    passive_joint_idx = list(range(len(self._env.sim.data.qpos)))
    for idx in self._env.ref_joint_pos_indexes:
        passive_joint_idx.remove(idx)
    config.passive_joint_idx = passive_joint_idx

    # get actor and critic networks
    actor, critic = get_actor_critic_by_name(config.policy)

    # build up networks
    non_limited_idx = np.where(
        self._env.sim.model.jnt_limited[:action_size(self._env.action_space)] == 0)[0]
    meta_ac_space = joint_space

    sampler = None
    ll_ob_space = ob_space
    if config.mopa:
        if config.discrete_action:
            ac_space.spaces["ac_type"] = spaces.Discrete(2)

    if config.use_ik_target:
        if action_size(ac_space) == len(self._env.ref_joint_pos_indexes):
            ac_space = spaces.Dict([(
                "default",
                spaces.Box(
                    low=np.ones(len(self._env.min_world_size)) * -1,
                    high=np.ones(len(self._env.max_world_size)),
                    dtype=np.float32,
                ),
            )])
            if len(self._env.min_world_size) == 3:
                ac_space.spaces["quat"] = spaces.Box(low=np.ones(4) * -1,
                                                     high=np.ones(4),
                                                     dtype=np.float32)
        else:
            ac_space = spaces.Dict([
                (
                    "default",
                    spaces.Box(low=np.ones(3) * -1, high=np.ones(3),
                               dtype=np.float32),
                ),
                (
                    "quat",
                    spaces.Box(low=np.ones(4) * -1, high=np.ones(4),
                               dtype=np.float32),
                ),
                (
                    "gripper",
                    spaces.Box(low=np.array([-1.0]), high=np.array([1.0]),
                               dtype=np.float32),
                ),
            ])

    ac_space.seed(config.seed)
    self._agent = get_agent_by_name(config.algo)(
        config,
        ob_space,
        ac_space,
        actor,
        critic,
        non_limited_idx,
        self._env.ref_joint_pos_indexes,
        self._env.joint_space,
        self._env._is_jnt_limited,
        self._env.jnt_indices,
    )
    self._agent._ac_space.seed(config.seed)

    self._runner = None
    if config.mopa:
        self._runner = MoPARolloutRunner(config, self._env, self._env_eval,
                                         self._agent)
    else:
        self._runner = RolloutRunner(config, self._env, self._env_eval,
                                     self._agent)

    # setup wandb
    if self._is_chef and self._config.is_train and self._config.wandb:
        exclude = ["device"]
        if config.debug:
            os.environ["WANDB_MODE"] = "dryrun"

        tags = [config.env, config.algo, config.reward_type]
        assert (config.entity is not None and
                config.project is not None), "Entity and Project name must be specified"

        wandb.init(
            resume=config.run_name,
            project=config.project,
            config={k: v for k, v in config.__dict__.items() if k not in exclude},
            dir=config.log_dir,
            entity=config.entity,
            notes=config.notes,
            tags=tags,
            group=config.group,
        )
def _update_network(self, transitions, step=0):
    info = {}

    # pre-process observations
    _to_tensor = lambda x: to_tensor(x, self._config.device)
    o, o_next = transitions["ob"], transitions["ob_next"]
    bs = len(transitions["done"])
    o = _to_tensor(o)
    o_next = _to_tensor(o_next)
    ac = _to_tensor(transitions["ac"])
    if "intra_steps" in transitions.keys() and self._config.use_smdp_update:
        intra_steps = _to_tensor(transitions["intra_steps"])

    done = _to_tensor(transitions["done"]).reshape(bs, 1)
    rew = _to_tensor(transitions["rew"]).reshape(bs, 1)

    # update the entropy temperature alpha
    actions_real, log_pi = self.act_log(o)
    alpha_loss = -(self._log_alpha.exp() *
                   (log_pi + self._target_entropy).detach()).mean()
    self._alpha_optim.zero_grad()
    alpha_loss.backward()
    self._alpha_optim.step()
    alpha = self._log_alpha.exp()
    info["alpha_loss"] = alpha_loss.cpu().item()
    info["entropy_alpha"] = alpha.cpu().item()

    # the actor loss
    entropy_loss = (alpha * log_pi).mean()
    actor_loss = -torch.min(self._critic1(o, actions_real),
                            self._critic2(o, actions_real)).mean()
    info["log_pi"] = log_pi.mean().cpu().item()
    info["entropy_loss"] = entropy_loss.cpu().item()
    info["actor_loss"] = actor_loss.cpu().item()
    actor_loss += entropy_loss

    # calculate the target Q value function
    with torch.no_grad():
        actions_next, log_pi_next = self.act_log(o_next)
        q_next_value1 = self._critic1_target(o_next, actions_next)
        q_next_value2 = self._critic2_target(o_next, actions_next)
        q_next_value = torch.min(q_next_value1, q_next_value2) - alpha * log_pi_next
        if self._config.use_smdp_update:
            target_q_value = (self._config.reward_scale * rew + (1 - done) *
                              (self._config.discount_factor**(intra_steps + 1)) *
                              q_next_value)
        else:
            target_q_value = (self._config.reward_scale * rew +
                              (1 - done) * self._config.discount_factor * q_next_value)
        target_q_value = target_q_value.detach()

    # the q loss
    for k, space in self._ac_space.spaces.items():
        if isinstance(space, spaces.Discrete):
            ac[k] = (F.one_hot(ac[k].long(),
                               action_size(self._ac_space[k])).float().squeeze(1))
    real_q_value1 = self._critic1(o, ac)
    real_q_value2 = self._critic2(o, ac)
    critic1_loss = 0.5 * (target_q_value - real_q_value1).pow(2).mean()
    critic2_loss = 0.5 * (target_q_value - real_q_value2).pow(2).mean()

    info["min_target_q"] = target_q_value.min().cpu().item()
    info["target_q"] = target_q_value.mean().cpu().item()
    info["min_real1_q"] = real_q_value1.min().cpu().item()
    info["min_real2_q"] = real_q_value2.min().cpu().item()
    info["real1_q"] = real_q_value1.mean().cpu().item()
    info["real2_q"] = real_q_value2.mean().cpu().item()
    info["critic1_loss"] = critic1_loss.cpu().item()
    info["critic2_loss"] = critic2_loss.cpu().item()

    # update the actor
    self._actor_optim.zero_grad()
    actor_loss.backward()
    if self._config.is_mpi:
        sync_grads(self._actor)
    self._actor_optim.step()

    # update the critic
    self._critic1_optim.zero_grad()
    critic1_loss.backward()
    if self._config.is_mpi:
        sync_grads(self._critic1)
    self._critic1_optim.step()

    self._critic2_optim.zero_grad()
    critic2_loss.backward()
    if self._config.is_mpi:
        sync_grads(self._critic2)
    self._critic2_optim.step()

    if self._config.is_mpi:
        return mpi_average(info)
    else:
        return info
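# The target critics queried for target_q_value above are typically kept close to the
# online critics with a soft (Polyak) update after each gradient step. A standalone
# sketch of that update with a hypothetical tau coefficient (this codebase's own
# target-sync routine may differ):
import torch
import torch.nn as nn

def soft_update(target: nn.Module, source: nn.Module, tau: float = 0.005):
    """Polyak-average the source parameters into the target network in place."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.mul_(1.0 - tau)
            t_param.data.add_(tau * s_param.data)

# e.g., after the critic optimizer steps:
#   soft_update(self._critic1_target, self._critic1, tau=0.005)
#   soft_update(self._critic2_target, self._critic2, tau=0.005)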
def __init__(
    self,
    config,
    ob_space,
    ac_space,
    actor,
    critic,
    non_limited_idx=None,
    ref_joint_pos_indexes=None,
    joint_space=None,
    is_jnt_limited=None,
    jnt_indices=None,
):
    super().__init__(config, ob_space)

    self._ob_space = ob_space
    self._ac_space = ac_space
    self._jnt_indices = jnt_indices
    self._ref_joint_pos_indexes = ref_joint_pos_indexes
    self._log_alpha = torch.tensor(np.log(config.alpha), requires_grad=True,
                                   device=config.device)
    self._alpha_optim = optim.Adam([self._log_alpha], lr=config.lr_actor)
    self._joint_space = joint_space
    self._is_jnt_limited = is_jnt_limited
    if joint_space is not None:
        self._jnt_minimum = joint_space["default"].low
        self._jnt_maximum = joint_space["default"].high

    # build up networks
    self._build_actor(actor)
    self._build_critic(critic)
    self._network_cuda(config.device)

    self._target_entropy = -action_size(self._actor._ac_space)

    self._actor_optim = optim.Adam(self._actor.parameters(), lr=config.lr_actor)
    self._critic1_optim = optim.Adam(self._critic1.parameters(), lr=config.lr_critic)
    self._critic2_optim = optim.Adam(self._critic2.parameters(), lr=config.lr_critic)

    sampler = RandomSampler()
    buffer_keys = ["ob", "ac", "meta_ac", "done", "rew"]
    if config.mopa or config.expand_ac_space:
        buffer_keys.append("intra_steps")
    self._buffer = ReplayBuffer(buffer_keys, config.buffer_size, sampler.sample_func)

    self._log_creation()

    self._planner = None
    self._is_planner_initialized = False
    if config.mopa:
        self._planner = PlannerAgent(
            config,
            ac_space,
            non_limited_idx,
            planner_type=config.planner_type,
            passive_joint_idx=config.passive_joint_idx,
            ignored_contacts=config.ignored_contact_geom_ids,
            is_simplified=config.is_simplified,
            simplified_duration=config.simplified_duration,
            range_=config.range,
        )
        self._simple_planner = PlannerAgent(
            config,
            ac_space,
            non_limited_idx,
            planner_type=config.simple_planner_type,
            passive_joint_idx=config.passive_joint_idx,
            ignored_contacts=config.ignored_contact_geom_ids,
            goal_bias=1.0,
            is_simplified=config.simple_planner_simplified,
            simplified_duration=config.simple_planner_simplified_duration,
            range_=config.simple_planner_range,
        )
        self._omega = config.omega