def _init_distribution(conditions, **kwargs):
    loc, scale = conditions["loc"], conditions["scale"]
    return tfd.Gumbel(loc=loc, scale=scale, **kwargs)
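For reference, a minimal usage sketch of a conditions-dict helper like the one above, assuming `tensorflow_probability` is aliased in the usual way; the parameter values and batch shape are illustrative only:

import tensorflow_probability as tfp

tfd = tfp.distributions

# Hypothetical conditions dict; loc/scale values are for illustration.
conditions = {"loc": [0.0, 1.0], "scale": [1.0, 2.0]}
dist = _init_distribution(conditions)   # tfd.Gumbel with batch_shape [2]
samples = dist.sample(5)                # shape [5, 2]
log_probs = dist.log_prob(samples)      # element-wise log-density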
def __init__(self, temp):
    super().__init__()
    self.temp = temp
    self.gumbel = distributions.Gumbel(0, 1)
def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             ployak=0.995,
             high_scale=1.0,
             reward_scale=1.0,
             sample_g_nums=100,
             sub_goal_steps=10,
             fn_goal_dim=0,
             intrinsic_reward_mode='os',
             high_batch_size=256,
             high_buffer_size=100000,
             low_batch_size=8,
             low_buffer_size=10000,
             high_actor_lr=1.0e-4,
             high_critic_lr=1.0e-3,
             low_actor_lr=1.0e-4,
             low_critic_lr=1.0e-3,
             hidden_units={
                 'high_actor': [64, 64],
                 'high_critic': [64, 64],
                 'low_actor': [64, 64],
                 'low_critic': [64, 64]
             },
             **kwargs):
    assert visual_sources == 0, 'HIRO doesn\'t support visual inputs.'
    super().__init__(s_dim=s_dim,
                     visual_sources=visual_sources,
                     visual_resolution=visual_resolution,
                     a_dim=a_dim,
                     is_continuous=is_continuous,
                     **kwargs)
    self.data_high = ExperienceReplay(high_batch_size, high_buffer_size)
    self.data_low = ExperienceReplay(low_batch_size, low_buffer_size)
    self.ployak = ployak
    self.high_scale = np.array(
        high_scale if isinstance(high_scale, list) else [high_scale] * self.s_dim,
        dtype=np.float32)
    self.reward_scale = reward_scale
    self.fn_goal_dim = fn_goal_dim
    self.sample_g_nums = sample_g_nums
    self.sub_goal_steps = sub_goal_steps
    self.sub_goal_dim = self.s_dim - self.fn_goal_dim
    self.high_noise = rls.ClippedNormalActionNoise(
        mu=np.zeros(self.sub_goal_dim),
        sigma=self.high_scale * np.ones(self.sub_goal_dim),
        bound=self.high_scale / 2)
    self.low_noise = rls.ClippedNormalActionNoise(
        mu=np.zeros(self.a_dim),
        sigma=1.0 * np.ones(self.a_dim),
        bound=0.5)
    _high_actor_net = lambda: rls.actor_dpg(
        self.s_dim, self.sub_goal_dim, hidden_units['high_actor'])
    if self.is_continuous:
        _low_actor_net = lambda: rls.actor_dpg(
            self.s_dim + self.sub_goal_dim, self.a_dim,
            hidden_units['low_actor'])
    else:
        _low_actor_net = lambda: rls.actor_discrete(
            self.s_dim + self.sub_goal_dim, self.a_dim,
            hidden_units['low_actor'])
        self.gumbel_dist = tfd.Gumbel(0, 1)
    self.high_actor = _high_actor_net()
    self.high_actor_target = _high_actor_net()
    self.low_actor = _low_actor_net()
    self.low_actor_target = _low_actor_net()
    _high_critic_net = lambda: rls.critic_q_one(
        self.s_dim, self.sub_goal_dim, hidden_units['high_critic'])
    _low_critic_net = lambda: rls.critic_q_one(
        self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units['low_critic'])
    self.high_critic = DoubleQ(_high_critic_net)
    self.high_critic_target = DoubleQ(_high_critic_net)
    self.low_critic = DoubleQ(_low_critic_net)
    self.low_critic_target = DoubleQ(_low_critic_net)
    self.update_target_net_weights(
        self.low_actor_target.weights + self.low_critic_target.weights +
        self.high_actor_target.weights + self.high_critic_target.weights,
        self.low_actor.weights + self.low_critic.weights +
        self.high_actor.weights + self.high_critic.weights)
    self.low_actor_lr, self.low_critic_lr = map(
        self.init_lr, [low_actor_lr, low_critic_lr])
    self.high_actor_lr, self.high_critic_lr = map(
        self.init_lr, [high_actor_lr, high_critic_lr])
    self.low_actor_optimizer, self.low_critic_optimizer = map(
        self.init_optimizer, [self.low_actor_lr, self.low_critic_lr])
    self.high_actor_optimizer, self.high_critic_optimizer = map(
        self.init_optimizer, [self.high_actor_lr, self.high_critic_lr])
    self.model_recorder(
        dict(high_actor=self.high_actor,
             high_critic=self.high_critic,
             low_actor=self.low_actor,
             low_critic=self.low_critic,
             low_actor_optimizer=self.low_actor_optimizer,
             low_critic_optimizer=self.low_critic_optimizer,
             high_actor_optimizer=self.high_actor_optimizer,
             high_critic_optimizer=self.high_critic_optimizer))
    self.counts = 0
    self._high_s = [[] for _ in range(self.n_agents)]
    self._noop_subgoal = np.random.uniform(
        -self.high_scale, self.high_scale,
        size=(self.n_agents, self.sub_goal_dim))
    self.get_ir = self.generate_ir_func(mode=intrinsic_reward_mode)
def _base_dist(self, mu: TensorLike, beta: TensorLike, *args, **kwargs):
    return tfd.Gumbel(loc=mu, scale=beta, *args, **kwargs)
def __init__(self,
             envspec,
             ployak=0.995,
             high_scale=1.0,
             reward_scale=1.0,
             sample_g_nums=100,
             sub_goal_steps=10,
             fn_goal_dim=0,
             intrinsic_reward_mode='os',
             high_batch_size=256,
             high_buffer_size=100000,
             low_batch_size=8,
             low_buffer_size=10000,
             high_actor_lr=1.0e-4,
             high_critic_lr=1.0e-3,
             low_actor_lr=1.0e-4,
             low_critic_lr=1.0e-3,
             network_settings={
                 'high_actor': [64, 64],
                 'high_critic': [64, 64],
                 'low_actor': [64, 64],
                 'low_critic': [64, 64]
             },
             **kwargs):
    assert not envspec.obs_spec.has_visual_observation, 'HIRO doesn\'t support visual inputs.'
    super().__init__(envspec=envspec, **kwargs)
    self.concat_vector_dim = self.obs_spec.total_vector_dim
    self.data_high = ExperienceReplay(high_batch_size, high_buffer_size)
    self.data_low = ExperienceReplay(low_batch_size, low_buffer_size)
    self.ployak = ployak
    self.reward_scale = reward_scale
    self.fn_goal_dim = fn_goal_dim
    self.sample_g_nums = sample_g_nums
    self.sub_goal_steps = sub_goal_steps
    self.sub_goal_dim = self.concat_vector_dim - self.fn_goal_dim
    self.high_scale = np.array(
        high_scale if isinstance(high_scale, list) else [high_scale] * self.sub_goal_dim,
        dtype=np.float32)
    self.high_noised_action = ClippedNormalNoisedAction(
        mu=np.zeros(self.sub_goal_dim),
        sigma=self.high_scale * np.ones(self.sub_goal_dim),
        action_bound=self.high_scale,
        noise_bound=self.high_scale / 2)
    self.low_noised_action = ClippedNormalNoisedAction(
        mu=np.zeros(self.a_dim),
        sigma=1.0 * np.ones(self.a_dim),
        noise_bound=0.5)

    def _create_high_ac_net(name):
        return ADoubleCNetwork(
            name=name,
            policy_net_type=OutputNetworkType.ACTOR_DPG,
            policy_net_kwargs=dict(
                vector_dim=self.concat_vector_dim,
                output_shape=self.sub_goal_dim,
                network_settings=network_settings['high_actor']),
            value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
            value_net_kwargs=dict(
                vector_dim=self.concat_vector_dim,
                action_dim=self.sub_goal_dim,
                network_settings=network_settings['high_critic']))

    self.high_ac_net = _create_high_ac_net('high_ac_net')
    self.high_ac_target_net = _create_high_ac_net('high_ac_target_net')

    if self.is_continuous:
        def _create_low_ac_net(name):
            return ADoubleCNetwork(
                name=name,
                policy_net_type=OutputNetworkType.ACTOR_DPG,
                policy_net_kwargs=dict(
                    vector_dim=self.concat_vector_dim + self.sub_goal_dim,
                    output_shape=self.a_dim,
                    network_settings=network_settings['low_actor']),
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net_kwargs=dict(
                    vector_dim=self.concat_vector_dim + self.sub_goal_dim,
                    action_dim=self.a_dim,
                    network_settings=network_settings['low_critic']))
    else:
        def _create_low_ac_net(name):
            return ADoubleCNetwork(
                name=name,
                policy_net_type=OutputNetworkType.ACTOR_DCT,
                policy_net_kwargs=dict(
                    vector_dim=self.concat_vector_dim + self.sub_goal_dim,
                    output_shape=self.a_dim,
                    network_settings=network_settings['low_actor']),
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net_kwargs=dict(
                    vector_dim=self.concat_vector_dim + self.sub_goal_dim,
                    action_dim=self.a_dim,
                    network_settings=network_settings['low_critic']))
        self.gumbel_dist = tfd.Gumbel(0, 1)

    self.low_ac_net = _create_low_ac_net('low_ac_net')
    self.low_ac_target_net = _create_low_ac_net('low_ac_target_net')
    update_target_net_weights(
        self.low_ac_target_net.weights + self.high_ac_target_net.weights,
        self.low_ac_net.weights + self.high_ac_net.weights)
    self.low_actor_lr, self.low_critic_lr = map(
        self.init_lr, [low_actor_lr, low_critic_lr])
    self.high_actor_lr, self.high_critic_lr = map(
        self.init_lr, [high_actor_lr, high_critic_lr])
    self.low_actor_optimizer, self.low_critic_optimizer = map(
        self.init_optimizer, [self.low_actor_lr, self.low_critic_lr])
    self.high_actor_optimizer, self.high_critic_optimizer = map(
        self.init_optimizer, [self.high_actor_lr, self.high_critic_lr])
    self.counts = 0
    self._high_s = [[] for _ in range(self.n_agents)]
    self._noop_subgoal = np.random.uniform(
        -self.high_scale, self.high_scale,
        size=(self.n_agents, self.sub_goal_dim))
    self.get_ir = self.generate_ir_func(mode=intrinsic_reward_mode)
    self._worker_params_dict.update(self.high_ac_net._policy_models)
    self._worker_params_dict.update(self.low_ac_net._policy_models)
    self._all_params_dict.update(self.high_ac_net._all_models)
    self._all_params_dict.update(self.low_ac_net._all_models)
    self._all_params_dict.update(
        low_actor_optimizer=self.low_actor_optimizer,
        low_critic_optimizer=self.low_critic_optimizer,
        high_actor_optimizer=self.high_actor_optimizer,
        high_critic_optimizer=self.high_critic_optimizer)
    self._model_post_process()
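Both HIRO constructors above keep a standalone `tfd.Gumbel(0, 1)` only in the discrete-action branch. Below is a self-contained sketch of the Gumbel-max / Gumbel-softmax reparameterisation such a distribution is commonly kept around for; the logits, temperature, and straight-through combination are illustrative assumptions, not the repository's exact training code:

import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

gumbel_dist = tfd.Gumbel(0., 1.)

def gumbel_softmax_sample(logits, temperature=1.0):
    # Perturb logits with Gumbel(0, 1) noise (Gumbel-max trick).
    noisy = logits + gumbel_dist.sample(tf.shape(logits))
    # Hard one-hot action (non-differentiable path).
    hard = tf.one_hot(tf.argmax(noisy, axis=-1), tf.shape(logits)[-1])
    # Soft, differentiable relaxation of the same sample.
    soft = tf.nn.softmax(noisy / temperature, axis=-1)
    # Straight-through estimator: forward pass uses `hard`,
    # gradients flow through `soft`.
    return tf.stop_gradient(hard - soft) + soft

logits = tf.constant([[2.0, 0.5, -1.0]])   # illustrative actor logits
action = gumbel_softmax_sample(logits, temperature=0.5)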
def call(self, prob):
    # Use tfp.distributions.Gumbel; tfp does not expose Gumbel at the top level.
    gumbel_dist = tfp.distributions.Gumbel(0, 1)
    gumbel_sample = gumbel_dist.sample(K.shape(prob))
    # Gumbel-softmax: perturb log-probabilities with Gumbel noise, then apply a
    # temperature-controlled softmax to obtain a relaxed categorical sample.
    categ_sample = K.exp((K.log(prob) + gumbel_sample) / self.temp)
    categ_sample = categ_sample / K.sum(categ_sample, axis=-1, keepdims=True)
    return categ_sample
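A minimal sketch assembling the two layer fragments above (the `__init__` holding a temperature and the Gumbel-softmax `call`) into one standalone Keras layer; the class name `GumbelSoftmaxSampler` and the example probabilities are hypothetical:

import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras import backend as K
from tensorflow.keras import layers

class GumbelSoftmaxSampler(layers.Layer):  # hypothetical name for the fragments above
    def __init__(self, temp):
        super().__init__()
        self.temp = temp
        self.gumbel = tfp.distributions.Gumbel(0., 1.)

    def call(self, prob):
        # Reuse the layer's Gumbel(0, 1) distribution instead of rebuilding it per call.
        gumbel_sample = self.gumbel.sample(K.shape(prob))
        categ_sample = K.exp((K.log(prob) + gumbel_sample) / self.temp)
        return categ_sample / K.sum(categ_sample, axis=-1, keepdims=True)

probs = tf.constant([[0.7, 0.2, 0.1]])     # illustrative categorical probabilities
relaxed_sample = GumbelSoftmaxSampler(temp=0.5)(probs)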