def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             ployak=0.995,
             actor_lr=5.0e-4,
             critic_lr=1.0e-3,
             discrete_tau=1.0,
             hidden_units={
                 'actor_continuous': [32, 32],
                 'actor_discrete': [32, 32],
                 'q': [32, 32]
             },
             **kwargs):
    super().__init__(
        s_dim=s_dim,
        visual_sources=visual_sources,
        visual_resolution=visual_resolution,
        a_dim=a_dim,
        is_continuous=is_continuous,
        **kwargs)
    self.ployak = ployak
    self.discrete_tau = discrete_tau
    if self.is_continuous:
        _actor_net = lambda: rls.actor_dpg(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
        # self.action_noise = rls.NormalActionNoise(mu=np.zeros(self.a_dim), sigma=1 * np.ones(self.a_dim))
        self.action_noise = rls.OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.a_dim),
            sigma=0.2 * np.exp(-self.episode / 10) * np.ones(self.a_dim))
    else:
        _actor_net = lambda: rls.actor_discrete(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
        self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
    self.actor_net = _actor_net()
    self.actor_target_net = _actor_net()
    self.actor_tv = self.actor_net.trainable_variables
    _q_net = lambda: rls.critic_q_one(self.feat_dim, self.a_dim, hidden_units['q'])
    self.q_net = _q_net()
    self.q_target_net = _q_net()
    self.critic_tv = self.q_net.trainable_variables + self.other_tv
    self.update_target_net_weights(
        self.actor_target_net.weights + self.q_target_net.weights,
        self.actor_net.weights + self.q_net.weights)
    self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr])
    self.optimizer_actor, self.optimizer_critic = map(self.init_optimizer, [self.actor_lr, self.critic_lr])
    self.model_recorder(dict(
        actor=self.actor_net,
        critic=self.q_net,
        optimizer_actor=self.optimizer_actor,
        optimizer_critic=self.optimizer_critic))
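# ---------------------------------------------------------------------------
# Illustration (not part of the original class): a minimal sketch of the soft
# (Polyak-averaged) target update that the `ployak` coefficient above is
# typically used for, i.e. target <- ployak * target + (1 - ployak) * online.
# The helper name `soft_update` and its signature are assumptions for this
# sketch; the real `update_target_net_weights` in the base class may differ.
# ---------------------------------------------------------------------------
def soft_update(target_weights, online_weights, ployak=0.995):
    """Blend each online weight into its target counterpart, in place."""
    for t, o in zip(target_weights, online_weights):
        t.assign(ployak * t + (1.0 - ployak) * o)  # tf.Variable.assign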
def __init__(self,
             s_dim,
             a_dim,
             is_continuous,
             ployak=0.995,
             actor_lr=5.0e-4,
             critic_lr=1.0e-3,
             n=1,
             i=0,
             hidden_units={
                 'actor': [32, 32],
                 'q': [32, 32]
             },
             **kwargs):
    assert is_continuous, 'MATD3 only supports continuous action spaces'
    raise Exception('The MA-series algorithms are broken and have not been fixed yet.')
    super().__init__(s_dim=s_dim,
                     visual_sources=0,
                     visual_resolution=0,
                     a_dim=a_dim,
                     is_continuous=is_continuous,
                     **kwargs)
    self.n = n
    self.i = i
    self.ployak = ployak
    # self.action_noise = rls.NormalActionNoise(mu=np.zeros(self.a_dim), sigma=1 * np.ones(self.a_dim))
    self.action_noise = rls.OrnsteinUhlenbeckActionNoise(
        mu=np.zeros(self.a_dim),
        sigma=0.2 * np.exp(-self.episode / 10) * np.ones(self.a_dim))
    _actor_net = lambda: rls.actor_dpg(self.s_dim, 0, self.a_dim, hidden_units['actor'])
    self.actor_net = _actor_net()
    self.actor_target_net = _actor_net()
    _q_net = lambda: rls.critic_q_one(
        (self.s_dim) * self.n, 0, (self.a_dim) * self.n, hidden_units['q'])
    self.critic_net = DoubleQ(_q_net)
    self.critic_target_net = DoubleQ(_q_net)
    self.update_target_net_weights(
        self.actor_target_net.weights + self.critic_target_net.weights,
        self.actor_net.weights + self.critic_net.weights)
    self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr])
    self.optimizer_actor, self.optimizer_critic = map(
        self.init_optimizer, [self.actor_lr, self.critic_lr])
    self.model_recorder(
        dict(actor=self.actor_net,
             critic_net=self.critic_net,
             optimizer_critic=self.optimizer_critic,
             optimizer_actor=self.optimizer_actor))
    self.recorder.logger.info(self.action_noise)
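# ---------------------------------------------------------------------------
# Illustration (not part of the original class): a minimal sketch of what a
# twin-critic wrapper like `DoubleQ` usually provides in TD3-style algorithms:
# it builds two independently initialized Q networks from the same constructor
# and takes the element-wise minimum of their outputs when forming targets, to
# curb overestimation. The class and method names below are assumptions for
# this sketch, not the repository's actual `DoubleQ` interface.
# ---------------------------------------------------------------------------
import tensorflow as tf

class TwinQSketch:
    def __init__(self, net_builder):
        self.q1 = net_builder()  # first critic
        self.q2 = net_builder()  # second, independently initialized critic

    def min_q(self, *inputs):
        """Pessimistic value estimate used for the TD target."""
        return tf.minimum(self.q1(*inputs), self.q2(*inputs))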
def _actor_net(): return rls.actor_dpg(self.s_dim, 0, self.a_dim, hidden_units['actor'])
def _actor_net(): return rls.actor_dpg(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
# self.action_noise = rls.NormalActionNoise(mu=np.zeros(self.a_dim), sigma=1 * np.ones(self.a_dim))
self.action_noise = rls.OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.a_dim), sigma=0.2 * np.ones(self.a_dim))
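# ---------------------------------------------------------------------------
# Illustration (not part of the original code): a minimal sketch of the
# Ornstein-Uhlenbeck process that classes such as `OrnsteinUhlenbeckActionNoise`
# typically implement for temporally correlated exploration noise in
# deterministic-policy methods: dx = theta * (mu - x) * dt + sigma * dW.
# Parameter names and default values here are assumptions for this sketch.
# ---------------------------------------------------------------------------
import numpy as np

class OUNoiseSketch:
    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.x = np.zeros_like(mu)  # current noise state

    def __call__(self):
        # Mean-reverting drift toward mu plus Gaussian diffusion.
        self.x = (self.x
                  + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        return self.x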
def _actor_net(): return rls.actor_dpg(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             ployak=0.995,
             high_scale=1.0,
             reward_scale=1.0,
             sample_g_nums=100,
             sub_goal_steps=10,
             fn_goal_dim=0,
             intrinsic_reward_mode='os',
             high_batch_size=256,
             high_buffer_size=100000,
             low_batch_size=8,
             low_buffer_size=10000,
             high_actor_lr=1.0e-4,
             high_critic_lr=1.0e-3,
             low_actor_lr=1.0e-4,
             low_critic_lr=1.0e-3,
             hidden_units={
                 'high_actor': [64, 64],
                 'high_critic': [64, 64],
                 'low_actor': [64, 64],
                 'low_critic': [64, 64]
             },
             **kwargs):
    assert visual_sources == 0, 'HIRO doesn\'t support visual inputs.'
    super().__init__(s_dim=s_dim,
                     visual_sources=visual_sources,
                     visual_resolution=visual_resolution,
                     a_dim=a_dim,
                     is_continuous=is_continuous,
                     **kwargs)
    self.data_high = ExperienceReplay(high_batch_size, high_buffer_size)
    self.data_low = ExperienceReplay(low_batch_size, low_buffer_size)
    self.ployak = ployak
    self.high_scale = np.array(
        high_scale if isinstance(high_scale, list) else [high_scale] * self.s_dim,
        dtype=np.float32)
    self.reward_scale = reward_scale
    self.fn_goal_dim = fn_goal_dim
    self.sample_g_nums = sample_g_nums
    self.sub_goal_steps = sub_goal_steps
    self.sub_goal_dim = self.s_dim - self.fn_goal_dim
    self.high_noise = rls.ClippedNormalActionNoise(
        mu=np.zeros(self.sub_goal_dim),
        sigma=self.high_scale * np.ones(self.sub_goal_dim),
        bound=self.high_scale / 2)
    self.low_noise = rls.ClippedNormalActionNoise(
        mu=np.zeros(self.a_dim),
        sigma=1.0 * np.ones(self.a_dim),
        bound=0.5)
    _high_actor_net = lambda: rls.actor_dpg(self.s_dim, self.sub_goal_dim, hidden_units['high_actor'])
    if self.is_continuous:
        _low_actor_net = lambda: rls.actor_dpg(
            self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units['low_actor'])
    else:
        _low_actor_net = lambda: rls.actor_discrete(
            self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units['low_actor'])
        self.gumbel_dist = tfd.Gumbel(0, 1)
    self.high_actor = _high_actor_net()
    self.high_actor_target = _high_actor_net()
    self.low_actor = _low_actor_net()
    self.low_actor_target = _low_actor_net()
    _high_critic_net = lambda: rls.critic_q_one(
        self.s_dim, self.sub_goal_dim, hidden_units['high_critic'])
    _low_critic_net = lambda: rls.critic_q_one(
        self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units['low_critic'])
    self.high_critic = DoubleQ(_high_critic_net)
    self.high_critic_target = DoubleQ(_high_critic_net)
    self.low_critic = DoubleQ(_low_critic_net)
    self.low_critic_target = DoubleQ(_low_critic_net)
    self.update_target_net_weights(
        self.low_actor_target.weights + self.low_critic_target.weights +
        self.high_actor_target.weights + self.high_critic_target.weights,
        self.low_actor.weights + self.low_critic.weights +
        self.high_actor.weights + self.high_critic.weights)
    self.low_actor_lr, self.low_critic_lr = map(
        self.init_lr, [low_actor_lr, low_critic_lr])
    self.high_actor_lr, self.high_critic_lr = map(
        self.init_lr, [high_actor_lr, high_critic_lr])
    self.low_actor_optimizer, self.low_critic_optimizer = map(
        self.init_optimizer, [self.low_actor_lr, self.low_critic_lr])
    self.high_actor_optimizer, self.high_critic_optimizer = map(
        self.init_optimizer, [self.high_actor_lr, self.high_critic_lr])
    self.model_recorder(
        dict(high_actor=self.high_actor,
             high_critic=self.high_critic,
             low_actor=self.low_actor,
             low_critic=self.low_critic,
             low_actor_optimizer=self.low_actor_optimizer,
             low_critic_optimizer=self.low_critic_optimizer,
             high_actor_optimizer=self.high_actor_optimizer,
             high_critic_optimizer=self.high_critic_optimizer))
    self.counts = 0
    self._high_s = [[] for _ in range(self.n_agents)]
    self._noop_subgoal = np.random.uniform(
        -self.high_scale, self.high_scale,
        size=(self.n_agents, self.sub_goal_dim))
    self.get_ir = self.generate_ir_func(mode=intrinsic_reward_mode)
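# ---------------------------------------------------------------------------
# Illustration (not part of the original class): a minimal sketch of the kind
# of low-level intrinsic reward `generate_ir_func` is expected to produce in
# HIRO -- the negative L2 distance between the subgoal and the achieved change
# in the goal-relevant part of the state, r_low = -|| s + g - s' ||_2.
# The function below is an assumed stand-in, not the repository's
# implementation of the 'os' intrinsic-reward mode.
# ---------------------------------------------------------------------------
import numpy as np

def low_level_intrinsic_reward(s, subgoal, s_next):
    """HIRO-style intrinsic reward on the goal-relevant state dimensions."""
    return -np.linalg.norm(s + subgoal - s_next, axis=-1)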
def _low_actor_net(): return rls.actor_dpg(self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units['low_actor'])
def _high_actor_net(): return rls.actor_dpg(self.s_dim, self.sub_goal_dim, hidden_units['high_actor'])
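# ---------------------------------------------------------------------------
# Illustration (not part of the original code): a minimal sketch of the
# Gumbel-Softmax reparameterization that the `gumbel_dist`/`discrete_tau`
# pieces above are typically used for -- drawing a differentiable,
# near-one-hot action from actor logits so a deterministic-policy-gradient
# update can flow through discrete actions. This is a generic sketch, not the
# repository's exact sampling code.
# ---------------------------------------------------------------------------
import tensorflow as tf
import tensorflow_probability as tfp

def gumbel_softmax_sample(logits, tau=1.0):
    """Sample a relaxed one-hot vector; lower tau => closer to one-hot."""
    gumbel = tfp.distributions.Gumbel(0., 1.).sample(tf.shape(logits))
    return tf.nn.softmax((logits + gumbel) / tau, axis=-1)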