def __init__(self, s_dim, a_dim, is_continuous, ployak=0.995, actor_lr=5.0e-4, critic_lr=1.0e-3, n=1, i=0, hidden_units={ 'actor': [32, 32], 'q': [32, 32] }, **kwargs): assert is_continuous, 'maddpg only support continuous action space' raise Exception('MA系列存在问题,还未修复') super().__init__(s_dim=s_dim, visual_sources=0, visual_resolution=0, a_dim=a_dim, is_continuous=is_continuous, **kwargs) self.n = n self.i = i self.ployak = ployak self.rnn_net = self._rnn_net(self.visual_net.hdim) # self.action_noise = rls.NormalActionNoise(mu=np.zeros(self.a_dim), sigma=1 * np.ones(self.a_dim)) self.action_noise = rls.OrnsteinUhlenbeckActionNoise( mu=np.zeros(self.a_dim), sigma=0.2 * np.ones(self.a_dim)) def _actor_net(): return rls.actor_dpg(self.s_dim, 0, self.a_dim, hidden_units['actor']) self.actor_net = _actor_net() self.actor_target_net = _actor_net() def _q_net(): return rls.critic_q_one((self.s_dim) * self.n, 0, (self.a_dim) * self.n, hidden_units['q']) self.q_net = _q_net() self.q_target_net = _q_net() self.update_target_net_weights( self.actor_target_net.weights + self.q_target_net.weights, self.actor_net.weights + self.q_net.weights) self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr]) self.optimizer_actor, self.optimizer_critic = map( self.init_optimizer, [self.actor_lr, self.critic_lr]) self.model_recorder( dict(actor=self.actor_net, q=self.q_net, optimizer_critic=self.optimizer_critic, optimizer_actor=self.optimizer_actor)) self.recorder.logger.info(self.action_noise)
def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             ployak=0.995,
             actor_lr=5.0e-4,
             critic_lr=1.0e-3,
             discrete_tau=1.0,
             hidden_units=None,
             **kwargs):
    """Initialize a DDPG agent.

    Builds the actor/critic networks and their target copies, the
    exploration mechanism (OU noise for continuous actions, gumbel-softmax
    sampling for discrete ones), the optimizers, and the model recorder.

    Args:
        s_dim: vector-observation dimensionality (forwarded to base class).
        visual_sources: number of visual observation sources (forwarded).
        visual_resolution: visual observation resolution (forwarded).
        a_dim: action dimensionality.
        is_continuous: True -> deterministic-policy-gradient actor with
            Ornstein-Uhlenbeck exploration noise; False -> discrete actor
            trained via the gumbel-softmax trick.
        ployak: soft-update (polyak) coefficient for target networks.
            NOTE(review): the parameter name is a misspelling of "polyak"
            but is kept for caller compatibility.
        actor_lr: actor learning rate.
        critic_lr: critic learning rate.
        discrete_tau: gumbel-softmax temperature (discrete case only).
        hidden_units: optional dict with keys 'actor_continuous',
            'actor_discrete' and 'q' mapping to hidden-layer size lists;
            defaults to [32, 32] for each.
        **kwargs: forwarded to the base-class initializer.
    """
    # Build the default per call rather than using a mutable default
    # argument, which would be one dict shared by every instance.
    if hidden_units is None:
        hidden_units = {
            'actor_continuous': [32, 32],
            'actor_discrete': [32, 32],
            'q': [32, 32]
        }
    super().__init__(
        s_dim=s_dim,
        visual_sources=visual_sources,
        visual_resolution=visual_resolution,
        a_dim=a_dim,
        is_continuous=is_continuous,
        **kwargs)
    self.ployak = ployak
    self.discrete_tau = discrete_tau
    if self.is_continuous:
        def _actor_net():
            return rls.actor_dpg(self.feat_dim, self.a_dim,
                                 hidden_units['actor_continuous'])
        # Temporally correlated exploration noise for continuous actions.
        self.action_noise = rls.OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.a_dim), sigma=0.2 * np.ones(self.a_dim))
    else:
        def _actor_net():
            return rls.actor_discrete(self.feat_dim, self.a_dim,
                                      hidden_units['actor_discrete'])
        # Gumbel(0, 1) samples drive the gumbel-softmax reparameterization.
        self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
    self.actor_net = _actor_net()
    self.actor_target_net = _actor_net()
    self.actor_tv = self.actor_net.trainable_variables

    def _q_net():
        return rls.critic_q_one(self.feat_dim, self.a_dim, hidden_units['q'])
    self.q_net = _q_net()
    self.q_target_net = _q_net()
    self.critic_tv = self.q_net.trainable_variables + self.other_tv
    # Hard-copy the online weights into the freshly built target networks.
    self.update_target_net_weights(
        self.actor_target_net.weights + self.q_target_net.weights,
        self.actor_net.weights + self.q_net.weights
    )
    self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr])
    self.optimizer_actor, self.optimizer_critic = map(
        self.init_optimizer, [self.actor_lr, self.critic_lr])
    self.model_recorder(dict(
        actor=self.actor_net,
        critic=self.q_net,
        optimizer_actor=self.optimizer_actor,
        optimizer_critic=self.optimizer_critic
    ))
def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             ployak=0.995,
             delay_num=2,
             noise_type='gaussian',
             gaussian_noise_sigma=0.2,
             gaussian_noise_bound=0.2,
             actor_lr=5.0e-4,
             critic_lr=1.0e-3,
             discrete_tau=1.0,
             hidden_units=None,
             **kwargs):
    """Initialize a TD3 agent (twin Q critics + delayed policy updates).

    Args:
        s_dim: vector-observation dimensionality (forwarded to base class).
        visual_sources: number of visual observation sources (forwarded).
        visual_resolution: visual observation resolution (forwarded).
        a_dim: action dimensionality.
        is_continuous: True -> DPG actor with additive exploration noise;
            False -> discrete actor trained via the gumbel-softmax trick.
        ployak: soft-update (polyak) coefficient for target networks.
            NOTE(review): misspelling of "polyak", kept for compatibility.
        delay_num: actor/target update delay relative to critic updates.
        noise_type: 'gaussian' (clipped normal) or 'ou'
            (Ornstein-Uhlenbeck); continuous actions only.
        gaussian_noise_sigma: stddev of the gaussian exploration noise.
        gaussian_noise_bound: clipping bound of the gaussian noise.
        actor_lr: actor learning rate.
        critic_lr: critic learning rate.
        discrete_tau: gumbel-softmax temperature (discrete case only).
        hidden_units: optional dict with keys 'actor_continuous',
            'actor_discrete' and 'q'; defaults to [32, 32] for each.
        **kwargs: forwarded to the base-class initializer.

    Raises:
        ValueError: if ``noise_type`` is not 'gaussian' or 'ou' for a
            continuous action space (previously this surfaced later as an
            AttributeError on the unset ``self.action_noise``).
    """
    # Build the default per call rather than using a mutable default
    # argument, which would be one dict shared by every instance.
    if hidden_units is None:
        hidden_units = {
            'actor_continuous': [32, 32],
            'actor_discrete': [32, 32],
            'q': [32, 32]
        }
    super().__init__(
        s_dim=s_dim,
        visual_sources=visual_sources,
        visual_resolution=visual_resolution,
        a_dim=a_dim,
        is_continuous=is_continuous,
        **kwargs)
    self.ployak = ployak
    self.delay_num = delay_num
    self.discrete_tau = discrete_tau
    self.gaussian_noise_sigma = gaussian_noise_sigma
    self.gaussian_noise_bound = gaussian_noise_bound
    if self.is_continuous:
        _actor_net = lambda: rls.actor_dpg(
            self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
        if noise_type == 'gaussian':
            self.action_noise = rls.ClippedNormalActionNoise(
                mu=np.zeros(self.a_dim),
                sigma=self.gaussian_noise_sigma * np.ones(self.a_dim),
                bound=self.gaussian_noise_bound)
        elif noise_type == 'ou':
            # NOTE(review): sigma is computed once from self.episode at
            # construction time — presumably intended as a per-episode
            # decay; confirm self.episode is set by the base class here.
            self.action_noise = rls.OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(self.a_dim),
                sigma=0.2 * np.exp(-self.episode / 10) * np.ones(self.a_dim))
        else:
            # Fail fast instead of silently leaving self.action_noise unset.
            raise ValueError(
                f"unknown noise_type {noise_type!r}; expected 'gaussian' or 'ou'")
    else:
        _actor_net = lambda: rls.actor_discrete(
            self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
        # Gumbel(0, 1) samples drive the gumbel-softmax reparameterization.
        self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
    self.actor_net = _actor_net()
    self.actor_target_net = _actor_net()
    self.actor_tv = self.actor_net.trainable_variables
    # Twin critics (clipped double-Q) and their target copies.
    _q_net = lambda: rls.critic_q_one(self.feat_dim, self.a_dim,
                                      hidden_units['q'])
    self.critic_net = DoubleQ(_q_net)
    self.critic_target_net = DoubleQ(_q_net)
    self.critic_tv = self.critic_net.trainable_variables + self.other_tv
    # Hard-copy the online weights into the freshly built target networks.
    self.update_target_net_weights(
        self.actor_target_net.weights + self.critic_target_net.weights,
        self.actor_net.weights + self.critic_net.weights)
    self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr])
    self.optimizer_actor, self.optimizer_critic = map(
        self.init_optimizer, [self.actor_lr, self.critic_lr])
    self.model_recorder(
        dict(actor=self.actor_net,
             critic_net=self.critic_net,
             optimizer_actor=self.optimizer_actor,
             optimizer_critic=self.optimizer_critic))
def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             ployak=0.995,
             actor_lr=5.0e-4,
             reward_critic_lr=1.0e-3,
             cost_critic_lr=1.0e-3,
             lambda_lr=5.0e-4,
             discrete_tau=1.0,
             cost_constraint=1.0,
             hidden_units=None,
             **kwargs):
    """Initialize a constrained (Lagrangian) DDPG agent.

    In addition to the usual actor/critic pair, this agent maintains a
    separate cost critic and a learnable Lagrange multiplier used to keep
    the long-term cost below ``cost_constraint``.

    Args:
        s_dim: vector-observation dimensionality (forwarded to base class).
        visual_sources: number of visual observation sources (forwarded).
        visual_resolution: visual observation resolution (forwarded).
        a_dim: action dimensionality.
        is_continuous: True -> DPG actor with OU exploration noise;
            False -> discrete actor trained via the gumbel-softmax trick.
        ployak: soft-update (polyak) coefficient for target networks.
            NOTE(review): misspelling of "polyak", kept for compatibility.
        actor_lr: actor learning rate.
        reward_critic_lr: learning rate of the reward critic.
        cost_critic_lr: learning rate of the cost critic.
        lambda_lr: learning rate for the Lagrange multiplier.
        discrete_tau: gumbel-softmax temperature (discrete case only).
        cost_constraint: constraint threshold d — long-term cost <= d.
        hidden_units: optional dict with keys 'actor_continuous',
            'actor_discrete', 'reward' and 'cost'; defaults to [32, 32]
            for each.
        **kwargs: forwarded to the base-class initializer.
    """
    # Build the default per call rather than using a mutable default
    # argument, which would be one dict shared by every instance.
    if hidden_units is None:
        hidden_units = {
            'actor_continuous': [32, 32],
            'actor_discrete': [32, 32],
            'reward': [32, 32],
            'cost': [32, 32]
        }
    super().__init__(
        s_dim=s_dim,
        visual_sources=visual_sources,
        visual_resolution=visual_resolution,
        a_dim=a_dim,
        is_continuous=is_continuous,
        **kwargs)
    self.ployak = ployak
    self.discrete_tau = discrete_tau
    # Learnable Lagrange multiplier for the cost constraint.
    self._lambda = tf.Variable(0.0, dtype=tf.float32)
    self.cost_constraint = cost_constraint  # long-term cost <= d
    if self.is_continuous:
        _actor_net = lambda: rls.actor_dpg(
            self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
        # NOTE(review): sigma is computed once from self.episode at
        # construction time — presumably intended as a per-episode decay;
        # confirm self.episode is set by the base class here.
        self.action_noise = rls.OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.a_dim),
            sigma=0.2 * np.exp(-self.episode / 10) * np.ones(self.a_dim))
    else:
        _actor_net = lambda: rls.actor_discrete(
            self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
        # Gumbel(0, 1) samples drive the gumbel-softmax reparameterization.
        self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
    self.actor_net = _actor_net()
    self.actor_target_net = _actor_net()
    self.actor_tv = self.actor_net.trainable_variables
    # Separate critics for reward and cost, each with a target copy.
    _critic_net = lambda hiddens: rls.critic_q_one(self.feat_dim,
                                                   self.a_dim, hiddens)
    self.reward_critic_net = _critic_net(hidden_units['reward'])
    self.reward_critic_target_net = _critic_net(hidden_units['reward'])
    self.cost_critic_net = _critic_net(hidden_units['cost'])
    self.cost_critic_target_net = _critic_net(hidden_units['cost'])
    self.reward_critic_tv = self.reward_critic_net.trainable_variables + self.other_tv
    # Hard-copy the online weights into the freshly built target networks.
    self.update_target_net_weights(
        self.actor_target_net.weights + self.reward_critic_target_net.weights
        + self.cost_critic_target_net.weights,
        self.actor_net.weights + self.reward_critic_net.weights
        + self.cost_critic_net.weights)
    self.lambda_lr = lambda_lr
    self.actor_lr, self.reward_critic_lr, self.cost_critic_lr = map(
        self.init_lr, [actor_lr, reward_critic_lr, cost_critic_lr])
    self.optimizer_actor, self.optimizer_reward_critic, self.optimizer_cost_critic = map(
        self.init_optimizer,
        [self.actor_lr, self.reward_critic_lr, self.cost_critic_lr])
    self.model_recorder(
        dict(actor=self.actor_net,
             reward_critic=self.reward_critic_net,
             cost_critic=self.cost_critic_net,
             optimizer_actor=self.optimizer_actor,
             optimizer_reward_critic=self.optimizer_reward_critic,
             optimizer_cost_critic=self.optimizer_cost_critic))