def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             epoch=5,
             beta=1.0e-3,
             actor_lr=5.0e-4,
             critic_lr=1.0e-3,
             hidden_units=None,
             **kwargs):
    """Build the actor/critic networks, their optimizers and the on-policy buffer.

    Args:
        s_dim: dimension of the vector observation.
        visual_sources: number of visual observation sources.
        visual_resolution: resolution of each visual observation.
        a_dim: action dimension.
        is_continuous: True -> Gaussian policy; False -> categorical policy.
        epoch: gradient epochs per training call.
        beta: entropy regularization coefficient.
        actor_lr: actor learning rate.
        critic_lr: critic learning rate.
        hidden_units: dict of hidden-layer sizes for the sub-networks;
            ``None`` selects the built-in defaults.
    """
    # Fix: `hidden_units` was a shared mutable default argument; use a
    # None sentinel and build the default dict per call instead.
    if hidden_units is None:
        hidden_units = {
            'actor_continuous': [32, 32],
            'actor_discrete': [32, 32],
            'critic': [32, 32]
        }
    super().__init__(
        s_dim=s_dim,
        visual_sources=visual_sources,
        visual_resolution=visual_resolution,
        a_dim=a_dim,
        is_continuous=is_continuous,
        **kwargs)
    self.beta = beta
    self.epoch = epoch
    if self.is_continuous:
        self.actor_net = ActorCts(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
        # State-independent, learnable log-std of the Gaussian policy.
        self.log_std = tf.Variable(
            initial_value=-0.5 * np.ones(self.a_dim, dtype=np.float32),
            trainable=True)
        self.actor_tv = self.actor_net.trainable_variables + [self.log_std]
    else:
        self.actor_net = ActorDcs(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
        self.actor_tv = self.actor_net.trainable_variables
    self.critic_net = Critic(self.feat_dim, hidden_units['critic'])
    # `other_tv` (from the base class) is optimized together with the critic.
    self.critic_tv = self.critic_net.trainable_variables + self.other_tv
    self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr])
    self.optimizer_actor, self.optimizer_critic = map(
        self.init_optimizer, [self.actor_lr, self.critic_lr])
    self.model_recorder(dict(
        actor=self.actor_net,
        critic=self.critic_net,
        optimizer_actor=self.optimizer_actor,
        optimizer_critic=self.optimizer_critic))
    self.initialize_data_buffer()
def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             actor_lr=5.0e-4,
             critic_lr=1.0e-3,
             discrete_tau=1.0,
             hidden_units=None,
             **kwargs):
    """Build deterministic-policy actor/Q networks and their optimizers.

    Args:
        actor_lr: actor learning rate.
        critic_lr: Q-network learning rate.
        discrete_tau: Gumbel-Softmax temperature used by the discrete branch.
        hidden_units: dict of hidden-layer sizes; ``None`` selects the
            built-in defaults.
    """
    # Fix: `hidden_units` was a shared mutable default argument.
    if hidden_units is None:
        hidden_units = {
            'actor_continuous': [32, 32],
            'actor_discrete': [32, 32],
            'q': [32, 32]
        }
    super().__init__(
        s_dim=s_dim,
        visual_sources=visual_sources,
        visual_resolution=visual_resolution,
        a_dim=a_dim,
        is_continuous=is_continuous,
        **kwargs)
    self.discrete_tau = discrete_tau
    if self.is_continuous:
        # Temporally correlated exploration noise for continuous actions.
        self.action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.a_dim), sigma=0.2 * np.ones(self.a_dim))
        self.actor_net = ActorCts(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
    else:
        self.actor_net = ActorDcs(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
        # Gumbel(0, 1) samples enable a differentiable discrete action
        # (Gumbel-Softmax reparameterization).
        self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
    self.actor_tv = self.actor_net.trainable_variables
    self.q_net = Critic(self.feat_dim, self.a_dim, hidden_units['q'])
    self.critic_tv = self.q_net.trainable_variables + self.other_tv
    self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr])
    self.optimizer_actor, self.optimizer_critic = map(
        self.init_optimizer, [self.actor_lr, self.critic_lr])
    self.model_recorder(
        dict(actor=self.actor_net,
             critic=self.q_net,
             optimizer_actor=self.optimizer_actor,
             optimizer_critic=self.optimizer_critic))
    # NOTE(review): no data-buffer init here — presumably the off-policy
    # base class owns the replay buffer; confirm against the on-policy
    # siblings that call `initialize_data_buffer()`.
def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             lr=5.0e-4,
             epoch=5,
             hidden_units=None,
             **kwargs):
    """Build the policy network, its optimizer and the on-policy buffer.

    Args:
        lr: learning rate for the single policy optimizer.
        epoch: gradient epochs per training call.
        hidden_units: dict of hidden-layer sizes; ``None`` selects the
            built-in defaults.
    """
    # Fix: `hidden_units` was a shared mutable default argument.
    if hidden_units is None:
        hidden_units = {
            'actor_continuous': [32, 32],
            'actor_discrete': [32, 32]
        }
    super().__init__(
        s_dim=s_dim,
        visual_sources=visual_sources,
        visual_resolution=visual_resolution,
        a_dim=a_dim,
        is_continuous=is_continuous,
        **kwargs)
    self.epoch = epoch
    if self.is_continuous:
        self.net = ActorCts(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
        # State-independent, learnable log-std of the Gaussian policy.
        self.log_std = tf.Variable(
            initial_value=-0.5 * np.ones(self.a_dim, dtype=np.float32),
            trainable=True)
        self.net_tv = self.net.trainable_variables + [self.log_std] + self.other_tv
    else:
        self.net = ActorDcs(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
        self.net_tv = self.net.trainable_variables + self.other_tv
    self.lr = self.init_lr(lr)
    self.optimizer = self.init_optimizer(self.lr)
    self.model_recorder(dict(
        model=self.net,
        optimizer=self.optimizer))
    self.initialize_data_buffer()
def __init__(self,
             s_dim: Union[int, np.ndarray],
             visual_sources: Union[int, np.ndarray],
             visual_resolution: Union[List, np.ndarray],
             a_dim: Union[int, np.ndarray],
             is_continuous: Union[bool, np.ndarray],
             policy_epoch: int = 4,
             value_epoch: int = 4,
             beta: float = 1.0e-3,
             lr: float = 5.0e-4,
             lambda_: float = 0.95,
             epsilon: float = 0.2,
             value_epsilon: float = 0.2,
             share_net: bool = True,
             actor_lr: float = 3e-4,
             critic_lr: float = 1e-3,
             kl_reverse: bool = False,
             kl_target: float = 0.02,
             kl_target_cutoff: float = 2,
             kl_target_earlystop: float = 4,
             kl_beta: Union[List[float], None] = None,
             kl_alpha: float = 1.5,
             kl_coef: float = 1.0,
             hidden_units: Union[Dict, None] = None,
             **kwargs):
    """Build PPO networks (shared or split actor/critic) and optimizers.

    Args:
        policy_epoch / value_epoch: gradient epochs for policy and value.
        beta: entropy coefficient; lambda_: GAE lambda.
        epsilon / value_epsilon: clip ranges for policy and value losses.
        share_net: True -> one shared trunk net; False -> separate nets.
        kl_reverse: direction of the KL penalty term.
        kl_target, kl_target_cutoff, kl_target_earlystop, kl_beta,
        kl_alpha, kl_coef: adaptive-KL schedule parameters.
        hidden_units: dict of hidden-layer sizes; ``None`` selects defaults.
    """
    # Fix: `kl_beta` and `hidden_units` were shared mutable default
    # arguments; use None sentinels and build the defaults per call.
    if kl_beta is None:
        kl_beta = [0.7, 1.3]
    if hidden_units is None:
        hidden_units = {
            'share': {
                'continuous': {
                    'share': [32, 32],
                    'mu': [32, 32],
                    'v': [32, 32]
                },
                'discrete': {
                    'share': [32, 32],
                    'logits': [32, 32],
                    'v': [32, 32]
                }
            },
            'actor_continuous': [32, 32],
            'actor_discrete': [32, 32],
            'critic': [32, 32]
        }
    super().__init__(
        s_dim=s_dim,
        visual_sources=visual_sources,
        visual_resolution=visual_resolution,
        a_dim=a_dim,
        is_continuous=is_continuous,
        **kwargs)
    self.beta = beta
    self.policy_epoch = policy_epoch
    self.value_epoch = value_epoch
    self.lambda_ = lambda_
    self.epsilon = epsilon
    self.value_epsilon = value_epsilon
    self.share_net = share_net
    self.kl_reverse = kl_reverse
    self.kl_target = kl_target
    self.kl_alpha = kl_alpha
    self.kl_coef = tf.constant(kl_coef, dtype=tf.float32)
    # Derived KL thresholds for the adaptive penalty / early stopping.
    self.kl_cutoff = kl_target * kl_target_cutoff
    self.kl_stop = kl_target * kl_target_earlystop
    self.kl_low = kl_target * kl_beta[0]
    self.kl_high = kl_target * kl_beta[-1]
    if self.is_continuous:
        # State-independent, learnable log-std of the Gaussian policy.
        self.log_std = tf.Variable(
            initial_value=-0.5 * np.ones(self.a_dim, dtype=np.float32),
            trainable=True)
    if self.share_net:
        if self.is_continuous:
            self.net = ACCtsShare(self.feat_dim, self.a_dim, hidden_units['share']['continuous'])
            self.net_tv = self.net.trainable_variables + [self.log_std] + self.other_tv
        else:
            self.net = ACDcsShare(self.feat_dim, self.a_dim, hidden_units['share']['discrete'])
            self.net_tv = self.net.trainable_variables + self.other_tv
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)
        self.model_recorder(dict(model=self.net, optimizer=self.optimizer))
    else:
        if self.is_continuous:
            self.actor_net = ActorCts(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
            self.actor_net_tv = self.actor_net.trainable_variables + [self.log_std]
        else:
            self.actor_net = ActorDcs(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
            self.actor_net_tv = self.actor_net.trainable_variables
        self.critic_net = Critic(self.feat_dim, hidden_units['critic'])
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv
        self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])
        self.model_recorder(
            dict(actor=self.actor_net,
                 critic=self.critic_net,
                 optimizer_actor=self.optimizer_actor,
                 optimizer_critic=self.optimizer_critic))
    # PPO stores per-step value estimates and log-probs alongside transitions.
    self.initialize_data_buffer(data_name_list=[
        's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'value',
        'log_prob'
    ])
def _actor_net(i):
    """Create the continuous-action actor for agent ``i`` from its own dims."""
    return ActorCts(self.s_dim[i], self.a_dim[i], hidden_units['actor'])

# One independent actor per separately-controlled agent.
self.actor_nets = {idx: _actor_net(idx) for idx in range(self.agent_sep_ctls)}
def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             beta=1.0e-3,
             lr=5.0e-4,
             delta=0.01,
             lambda_=0.95,
             cg_iters=10,
             train_v_iters=10,
             damping_coeff=0.1,
             backtrack_iters=10,
             backtrack_coeff=0.8,
             epsilon=0.2,
             critic_lr=1e-3,
             hidden_units=None,
             **kwargs):
    """Build TRPO actor/critic networks, the critic optimizer and the buffer.

    Args:
        beta: entropy coefficient; delta: trust-region KL constraint.
        lambda_: GAE lambda; epsilon: clip range.
        cg_iters: conjugate-gradient iterations; damping_coeff: CG damping.
        backtrack_iters / backtrack_coeff: line-search schedule.
        train_v_iters: value-function updates per training call.
        lr: accepted for interface compatibility; the actor is updated by
            the trust-region step, not a gradient optimizer, so it is unused.
        hidden_units: dict of hidden-layer sizes; ``None`` selects defaults.
    """
    # Fix: `hidden_units` was a shared mutable default argument.
    if hidden_units is None:
        hidden_units = {
            'actor_continuous': [32, 32],
            'actor_discrete': [32, 32],
            'critic': [32, 32]
        }
    super().__init__(
        s_dim=s_dim,
        visual_sources=visual_sources,
        visual_resolution=visual_resolution,
        a_dim=a_dim,
        is_continuous=is_continuous,
        **kwargs)
    self.beta = beta
    self.delta = delta
    self.lambda_ = lambda_
    self.epsilon = epsilon
    self.cg_iters = cg_iters
    self.damping_coeff = damping_coeff
    self.backtrack_iters = backtrack_iters
    self.backtrack_coeff = backtrack_coeff
    self.train_v_iters = train_v_iters
    if self.is_continuous:
        self.actor_net = ActorCts(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
        # State-independent, learnable log-std of the Gaussian policy.
        self.log_std = tf.Variable(
            initial_value=-0.5 * np.ones(self.a_dim, dtype=np.float32),
            trainable=True)
        self.actor_tv = self.actor_net.trainable_variables + [self.log_std]
    else:
        self.actor_net = ActorDcs(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
        self.actor_tv = self.actor_net.trainable_variables
    self.critic_net = Critic(self.feat_dim, hidden_units['critic'])
    self.critic_tv = self.critic_net.trainable_variables + self.other_tv
    # Only the critic uses a gradient optimizer; the actor uses TRPO's
    # natural-gradient step with backtracking line search.
    self.critic_lr = self.init_lr(critic_lr)
    self.optimizer_critic = self.init_optimizer(self.critic_lr)
    self.model_recorder(dict(
        actor=self.actor_net,
        critic=self.critic_net,
        optimizer_critic=self.optimizer_critic))
    # The buffer must keep the old policy's distribution parameters so the
    # KL constraint can be evaluated against it.
    if self.is_continuous:
        data_name_list = ['s', 'visual_s', 'a', 'r', 's_', 'visual_s_',
                          'done', 'value', 'log_prob', 'old_mu', 'old_log_std']
    else:
        data_name_list = ['s', 'visual_s', 'a', 'r', 's_', 'visual_s_',
                          'done', 'value', 'log_prob', 'old_logp_all']
    self.initialize_data_buffer(data_name_list=data_name_list)
def _actor_net(i):
    """Create the continuous-action actor for agent ``i`` from its own dims."""
    net = ActorCts(self.s_dim[i], self.a_dim[i], network_settings['actor'])
    return net
def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             alpha=0.2,
             annealing=True,
             last_alpha=0.01,
             ployak=0.995,
             discrete_tau=1.0,
             log_std_bound=None,
             hidden_units=None,
             auto_adaption=True,
             actor_lr=5.0e-4,
             critic_lr=1.0e-3,
             alpha_lr=5.0e-4,
             curl_lr=5.0e-4,
             img_size=64,
             **kwargs):
    """Build SAC-with-CURL networks: actor, double-Q, visual encoders.

    Args:
        alpha: initial entropy temperature; last_alpha: annealing target.
        annealing: linearly anneal alpha when temperature is not learned.
        ployak: polyak (soft) target update coefficient.
        discrete_tau: Gumbel-Softmax temperature for discrete actions.
        log_std_bound: [min, max] clamp for the policy log-std;
            ``None`` selects [-20, 2].
        hidden_units: dict of hidden-layer sizes; ``None`` selects defaults.
        auto_adaption: learn log-alpha by gradient if True.
        img_size: square side length the visual input is resized to.
    """
    # Fix: `log_std_bound` and `hidden_units` were shared mutable default
    # arguments; use None sentinels instead.
    if log_std_bound is None:
        log_std_bound = [-20, 2]
    if hidden_units is None:
        hidden_units = {
            'actor_continuous': {
                'share': [128, 128],
                'mu': [64],
                'log_std': [64]
            },
            'actor_discrete': [64, 32],
            'q': [128, 128],
            'encoder': 128
        }
    super().__init__(
        s_dim=s_dim,
        visual_sources=visual_sources,
        visual_resolution=visual_resolution,
        a_dim=a_dim,
        is_continuous=is_continuous,
        **kwargs)
    # CURL's contrastive encoder supports exactly one camera.
    assert self.visual_sources == 1
    self.ployak = ployak
    self.discrete_tau = discrete_tau
    self.log_std_min, self.log_std_max = log_std_bound[:]
    self.auto_adaption = auto_adaption
    self.annealing = annealing
    self.img_size = img_size
    self.img_dim = [img_size, img_size, self.visual_dim[-1]]
    self.vis_feat_size = hidden_units['encoder']
    if self.auto_adaption:
        self.log_alpha = tf.Variable(initial_value=0.0, name='log_alpha', dtype=tf.float32, trainable=True)
    else:
        # Fixed temperature; stored as log(alpha) for a uniform interface.
        self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha), name='log_alpha', dtype=tf.float32, trainable=False)
        if self.annealing:
            self.alpha_annealing = LinearAnnealing(alpha, last_alpha, 1.0e6)
    # The policy consumes the vector state concatenated with encoder features.
    if self.is_continuous:
        self.actor_net = ActorCts(self.s_dim + self.vis_feat_size, self.a_dim, hidden_units['actor_continuous'])
    else:
        self.actor_net = ActorDcs(self.s_dim + self.vis_feat_size, self.a_dim, hidden_units['actor_discrete'])
        self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
    self.actor_tv = self.actor_net.trainable_variables
    # entropy = -log(1/|A|) = log |A|
    self.target_entropy = 0.98 * (-self.a_dim if self.is_continuous else np.log(self.a_dim))

    def _q_net():
        # Q-head factory so DoubleQ can instantiate twin critics.
        return Critic(self.s_dim + self.vis_feat_size, self.a_dim, hidden_units['q'])

    self.critic_net = DoubleQ(_q_net)
    self.critic_target_net = DoubleQ(_q_net)
    self.encoder = VisualEncoder(self.img_dim, hidden_units['encoder'])
    self.encoder_target = VisualEncoder(self.img_dim, hidden_units['encoder'])
    # Bilinear weight of the CURL contrastive similarity score.
    self.curl_w = tf.Variable(
        initial_value=tf.random.normal(shape=(self.vis_feat_size, self.vis_feat_size)),
        name='curl_w', dtype=tf.float32, trainable=True)
    # The encoder is trained with the critic (not the actor).
    self.critic_tv = self.critic_net.trainable_variables + self.encoder.trainable_variables
    update_target_net_weights(
        self.critic_target_net.weights + self.encoder_target.trainable_variables,
        self.critic_net.weights + self.encoder.trainable_variables)
    self.actor_lr, self.critic_lr, self.alpha_lr, self.curl_lr = map(
        self.init_lr, [actor_lr, critic_lr, alpha_lr, curl_lr])
    self.optimizer_actor, self.optimizer_critic, self.optimizer_alpha, self.optimizer_curl = map(
        self.init_optimizer, [self.actor_lr, self.critic_lr, self.alpha_lr, self.curl_lr])
    self.model_recorder(
        dict(
            actor=self.actor_net,
            critic_net=self.critic_net,
            curl_w=self.curl_w,
            optimizer_actor=self.optimizer_actor,
            optimizer_critic=self.optimizer_critic,
            optimizer_alpha=self.optimizer_alpha,
            optimizer_curl=self.optimizer_curl,
        ))
def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             alpha=0.2,
             annealing=True,
             last_alpha=0.01,
             ployak=0.995,
             entropic_index=1.5,
             discrete_tau=1.0,
             log_std_bound=None,
             hidden_units=None,
             auto_adaption=True,
             actor_lr=5.0e-4,
             critic_lr=1.0e-3,
             alpha_lr=5.0e-4,
             **kwargs):
    """Build Tsallis-entropy SAC networks, optimizers and target critics.

    Args:
        alpha: initial entropy temperature; last_alpha: annealing target.
        annealing: linearly anneal alpha when temperature is not learned.
        ployak: polyak (soft) target update coefficient.
        entropic_index: Tsallis entropic index q; stored as ``2 - q``.
        discrete_tau: Gumbel-Softmax temperature for discrete actions.
        log_std_bound: [min, max] clamp for the policy log-std;
            ``None`` selects [-20, 2].
        hidden_units: dict of hidden-layer sizes; ``None`` selects defaults.
        auto_adaption: learn log-alpha by gradient if True.
    """
    # Fix: `log_std_bound` and `hidden_units` were shared mutable default
    # arguments; use None sentinels instead.
    if log_std_bound is None:
        log_std_bound = [-20, 2]
    if hidden_units is None:
        hidden_units = {
            'actor_continuous': {
                'share': [128, 128],
                'mu': [64],
                'log_std': [64]
            },
            'actor_discrete': [64, 32],
            'q': [128, 128]
        }
    super().__init__(
        s_dim=s_dim,
        visual_sources=visual_sources,
        visual_resolution=visual_resolution,
        a_dim=a_dim,
        is_continuous=is_continuous,
        **kwargs)
    self.ployak = ployak
    self.discrete_tau = discrete_tau
    self.entropic_index = 2 - entropic_index
    self.log_std_min, self.log_std_max = log_std_bound[:]
    self.auto_adaption = auto_adaption
    self.annealing = annealing
    if self.auto_adaption:
        self.log_alpha = tf.Variable(initial_value=0.0, name='log_alpha', dtype=tf.float32, trainable=True)
    else:
        # Fixed temperature; stored as log(alpha) for a uniform interface.
        self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha), name='log_alpha', dtype=tf.float32, trainable=False)
        if self.annealing:
            self.alpha_annealing = LinearAnnealing(alpha, last_alpha, 1e6)
    if self.is_continuous:
        self.actor_net = ActorCts(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
    else:
        self.actor_net = ActorDcs(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
        self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
    self.actor_tv = self.actor_net.trainable_variables
    # entropy = -log(1/|A|) = log |A|
    self.target_entropy = 0.98 * (-self.a_dim if self.is_continuous else np.log(self.a_dim))

    def _q_net():
        # Q-head factory so DoubleQ can instantiate twin critics.
        return CriticQ1(self.feat_dim, self.a_dim, hidden_units['q'])

    self.critic_net = DoubleQ(_q_net)
    self.critic_target_net = DoubleQ(_q_net)
    self.critic_tv = self.critic_net.trainable_variables + self.other_tv
    # Hard-copy online weights into the target critic at start.
    update_target_net_weights(self.critic_target_net.weights, self.critic_net.weights)
    self.actor_lr, self.critic_lr, self.alpha_lr = map(
        self.init_lr, [actor_lr, critic_lr, alpha_lr])
    self.optimizer_actor, self.optimizer_critic, self.optimizer_alpha = map(
        self.init_optimizer, [self.actor_lr, self.critic_lr, self.alpha_lr])
    self.model_recorder(
        dict(
            actor=self.actor_net,
            critic_net=self.critic_net,
            log_alpha=self.log_alpha,
            optimizer_actor=self.optimizer_actor,
            optimizer_critic=self.optimizer_critic,
            optimizer_alpha=self.optimizer_alpha,
        ))
def _actor_net():
    """Create the continuous-action actor over the shared feature dim."""
    net = ActorCts(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
    return net
def _actor_net():
    """Create the continuous-action actor over the shared feature dim."""
    return ActorCts(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])

# Temporally correlated OU exploration noise (sigma = 0.2).
self.action_noise = OrnsteinUhlenbeckActionNoise(
    mu=np.zeros(self.a_dim),
    sigma=0.2 * np.ones(self.a_dim))
def _low_actor_net():
    """Low-level actor: maps (state, sub-goal) to primitive actions."""
    net = ActorCts(self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units['low_actor'])
    return net
def _high_actor_net():
    """High-level actor: maps state to a sub-goal for the low-level policy."""
    net = ActorCts(self.s_dim, self.sub_goal_dim, hidden_units['high_actor'])
    return net