# Policy base class: keeps on-policy transitions in a pandas DataFrame and
# off-policy transitions in a (optionally prioritized / n-step) replay buffer.
class Policy(Base):
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim_or_list,
                 action_type,
                 gamma,
                 max_episode,
                 base_dir,
                 policy_mode=None,
                 batch_size=1,
                 buffer_size=1,
                 use_priority=False,
                 n_step=False):
        super().__init__(a_dim_or_list, action_type, base_dir)
        self.s_dim = s_dim
        self.visual_sources = visual_sources
        self.visual_dim = [visual_sources, *visual_resolution] if visual_sources else [0]
        self.a_dim_or_list = a_dim_or_list
        self.gamma = gamma
        self.max_episode = max_episode
        self.policy_mode = policy_mode
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        '''
        The biggest difference between the policy modes ('ON' and 'OFF') is that
        'OFF' mode needs to raise the dimension of 'r' and 'done'.
        'ON' mode means the program calls on_store and keeps data in a pandas DataFrame.
        'OFF' mode calls off_store and keeps data in a replay buffer.
        '''
        if self.policy_mode == 'ON':
            self.data = pd.DataFrame(columns=['s', 'a', 'r', 's_', 'done'])
        elif self.policy_mode == 'OFF':
            if use_priority:
                if n_step:
                    print('N-Step PER')
                    self.data = NStepPrioritizedExperienceReplay(
                        self.batch_size,
                        self.buffer_size,
                        max_episode=self.max_episode,
                        gamma=self.gamma,
                        alpha=0.6,
                        beta=0.2,
                        epsilon=0.01,
                        agents_num=20,
                        n=4)
                else:
                    print('PER')
                    self.data = PrioritizedExperienceReplay(
                        self.batch_size,
                        self.buffer_size,
                        max_episode=self.max_episode,
                        alpha=0.6,
                        beta=0.2,
                        epsilon=0.01)
            else:
                if n_step:
                    print('N-Step ER')
                    self.data = NStepExperienceReplay(
                        self.batch_size,
                        self.buffer_size,
                        gamma=self.gamma,
                        agents_num=20,
                        n=4)
                else:
                    print('ER')
                    self.data = ExperienceReplay(self.batch_size, self.buffer_size)
        else:
            raise Exception('Please specify a policy mode!')

    def on_store(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        for on-policy training, use this function to store <s, a, r, s_, done> into a pandas DataFrame.
        """
        assert isinstance(a, np.ndarray), "on_store needs the action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "on_store needs the reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "on_store needs done to be an np.ndarray"
        self.data = self.data.append({
            's': s,
            'visual_s': visual_s,
            'a': a,
            'r': r,
            's_': s_,
            'visual_s_': visual_s_,
            'done': done
        }, ignore_index=True)

    def off_store(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        for off-policy training, use this function to store <s, a, r, s_, done> into the replay buffer.
        """
        assert isinstance(a, np.ndarray), "off_store needs the action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "off_store needs the reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "off_store needs done to be an np.ndarray"
        self.data.add(s, visual_s, a, r, s_, visual_s_, done)

    def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(a, np.ndarray), "no_op_store needs the action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "no_op_store needs the reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "no_op_store needs done to be an np.ndarray"
        if self.policy_mode == 'OFF':
            self.data.add(s, visual_s, a, r[:, np.newaxis], s_, visual_s_, done[:, np.newaxis])

    def clear(self):
        """
        clear the DataFrame.
        """
        self.data.drop(self.data.index, inplace=True)

    def get_max_episode(self):
        """
        get the max episode of this training model.
        """
        return self.max_episode
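# A minimal sketch (not part of the original class) of the array shapes the store helpers
# above expect; the agent count and dimensions below are hypothetical illustration values.
# It shows the only mode-specific difference: 'OFF' mode (no_op_store / the replay buffer)
# raises the 1-D reward and done arrays to shape (agents, 1) via [:, np.newaxis].
import numpy as np

agents, s_dim, a_counts = 4, 8, 2
s = np.zeros((agents, s_dim), dtype=np.float32)       # vector observations
a = np.zeros((agents, a_counts), dtype=np.float32)    # actions
r = np.zeros(agents, dtype=np.float32)                # per-agent rewards, shape (agents,)
done = np.zeros(agents, dtype=np.float32)             # per-agent done flags, shape (agents,)

assert r[:, np.newaxis].shape == (agents, 1)          # what 'OFF' mode stores
assert done[:, np.newaxis].shape == (agents, 1)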
# Policy base class, revised to split initialization into init_data_memory() and
# init_placeholders() (TF1 placeholders) and to read replay-buffer hyperparameters
# from the er_config dictionary instead of hard-coded values.
class Policy(Base):
    def __init__(self,
                 a_dim_or_list,
                 action_type,
                 base_dir,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 gamma,
                 max_episode,
                 policy_mode=None,
                 batch_size=1,
                 buffer_size=1,
                 use_priority=False,
                 n_step=False):
        super().__init__(a_dim_or_list=a_dim_or_list, action_type=action_type, base_dir=base_dir)
        self.s_dim = s_dim
        self.visual_sources = visual_sources
        self.visual_dim = [visual_sources, *visual_resolution] if visual_sources else [0]
        self.a_dim_or_list = a_dim_or_list
        self.gamma = gamma
        self.max_episode = max_episode
        self.policy_mode = policy_mode
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.use_priority = use_priority
        self.n_step = n_step
        self.init_data_memory()
        self.init_placeholders()

    def init_data_memory(self):
        '''
        The biggest difference between the policy modes ('ON' and 'OFF') is that
        'OFF' mode needs to raise the dimension of 'r' and 'done'.
        'ON' mode means the program calls on_store and keeps data in a pandas DataFrame.
        'OFF' mode calls off_store and keeps data in a replay buffer.
        '''
        if self.policy_mode == 'ON':
            self.data = pd.DataFrame(columns=['s', 'a', 'r', 'done'])
        elif self.policy_mode == 'OFF':
            if self.use_priority:
                if self.n_step:
                    print('N-Step PER')
                    self.data = NStepPrioritizedExperienceReplay(
                        self.batch_size,
                        self.buffer_size,
                        max_episode=self.max_episode,
                        gamma=self.gamma,
                        alpha=er_config['nper_config']['alpha'],
                        beta=er_config['nper_config']['beta'],
                        epsilon=er_config['nper_config']['epsilon'],
                        agents_num=er_config['nper_config']['max_agents'],
                        n=er_config['nper_config']['n'],
                        global_v=er_config['nper_config']['global_v'])
                else:
                    print('PER')
                    self.data = PrioritizedExperienceReplay(
                        self.batch_size,
                        self.buffer_size,
                        max_episode=self.max_episode,
                        alpha=er_config['per_config']['alpha'],
                        beta=er_config['per_config']['beta'],
                        epsilon=er_config['per_config']['epsilon'],
                        global_v=er_config['nper_config']['global_v'])
            else:
                if self.n_step:
                    print('N-Step ER')
                    self.data = NStepExperienceReplay(
                        self.batch_size,
                        self.buffer_size,
                        gamma=self.gamma,
                        agents_num=er_config['ner_config']['max_agents'],
                        n=er_config['ner_config']['n'])
                else:
                    print('ER')
                    self.data = ExperienceReplay(self.batch_size, self.buffer_size)
        else:
            raise Exception('Please specify a policy mode!')

    def init_placeholders(self):
        with self.graph.as_default():
            self.pl_s = tf.placeholder(tf.float32, [None, self.s_dim], 'vector_observation')
            self.pl_a = tf.placeholder(tf.float32, [None, self.a_counts], 'pl_action')
            self.pl_r = tf.placeholder(tf.float32, [None, 1], 'reward')
            self.pl_s_ = tf.placeholder(tf.float32, [None, self.s_dim], 'next_state')
            self.pl_done = tf.placeholder(tf.float32, [None, 1], 'done')
            self.pl_visual_s = tf.placeholder(tf.float32, [None] + self.visual_dim, 'visual_observation')
            self.pl_visual_s_ = tf.placeholder(tf.float32, [None] + self.visual_dim, 'next_visual_observation')

    def on_store(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        for on-policy training, use this function to store <s, a, r, s_, done> into a pandas DataFrame.
        """
        assert isinstance(a, np.ndarray), "on_store needs the action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "on_store needs the reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "on_store needs done to be an np.ndarray"
        self.data = self.data.append({
            's': s,
            'visual_s': visual_s,
            'a': a,
            'r': r,
            's_': s_,
            'visual_s_': visual_s_,
            'done': done
        }, ignore_index=True)

    def off_store(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        for off-policy training, use this function to store <s, a, r, s_, done> into the replay buffer.
        """
        assert isinstance(a, np.ndarray), "off_store needs the action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "off_store needs the reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "off_store needs done to be an np.ndarray"
        self.data.add(s, visual_s, a, r, s_, visual_s_, done)

    def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(a, np.ndarray), "no_op_store needs the action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "no_op_store needs the reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "no_op_store needs done to be an np.ndarray"
        if self.policy_mode == 'OFF':
            self.data.add(s, visual_s, a, r[:, np.newaxis], s_, visual_s_, done[:, np.newaxis])

    def clear(self):
        """
        clear the DataFrame.
        """
        self.data.drop(self.data.index, inplace=True)

    def get_max_episode(self):
        """
        get the max episode of this training model.
        """
        return self.max_episode
# Policy base class (TF2 style): no graph placeholders; instead get_TensorSpecs()
# builds tf.TensorSpec signatures to avoid tf.function retracing, discrete actions
# are converted to one-hot before storage, and a set of static helpers implements
# the squashed-Gaussian policy math (sampling, likelihood, entropy, tanh squashing).
class Policy(Base):
    def __init__(self,
                 a_dim_or_list,
                 action_type,
                 base_dir,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 gamma,
                 max_episode,
                 policy_mode=None,
                 batch_size=1,
                 buffer_size=1,
                 use_priority=False,
                 n_step=False):
        super().__init__(a_dim_or_list=a_dim_or_list, action_type=action_type, base_dir=base_dir)
        self.s_dim = s_dim
        self.visual_sources = visual_sources
        self.visual_dim = [visual_sources, *visual_resolution] if visual_sources else [0]
        self.a_dim_or_list = a_dim_or_list
        self.gamma = gamma
        self.max_episode = max_episode
        self.policy_mode = policy_mode
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.use_priority = use_priority
        self.n_step = n_step
        self.init_data_memory()

    def init_data_memory(self):
        '''
        The biggest difference between the policy modes ('ON' and 'OFF') is that
        'OFF' mode needs to raise the dimension of 'r' and 'done'.
        'ON' mode means the program calls on_store and keeps data in a pandas DataFrame.
        'OFF' mode calls off_store and keeps data in a replay buffer.
        '''
        if self.policy_mode == 'ON':
            self.data = pd.DataFrame(columns=['s', 'a', 'r', 'done'])
        elif self.policy_mode == 'OFF':
            if self.use_priority:
                if self.n_step:
                    print('N-Step PER')
                    self.data = NStepPrioritizedExperienceReplay(
                        self.batch_size,
                        self.buffer_size,
                        max_episode=self.max_episode,
                        gamma=self.gamma,
                        alpha=er_config['nper_config']['alpha'],
                        beta=er_config['nper_config']['beta'],
                        epsilon=er_config['nper_config']['epsilon'],
                        agents_num=er_config['nper_config']['max_agents'],
                        n=er_config['nper_config']['n'],
                        global_v=er_config['nper_config']['global_v'])
                else:
                    print('PER')
                    self.data = PrioritizedExperienceReplay(
                        self.batch_size,
                        self.buffer_size,
                        max_episode=self.max_episode,
                        alpha=er_config['per_config']['alpha'],
                        beta=er_config['per_config']['beta'],
                        epsilon=er_config['per_config']['epsilon'],
                        global_v=er_config['nper_config']['global_v'])
            else:
                if self.n_step:
                    print('N-Step ER')
                    self.data = NStepExperienceReplay(
                        self.batch_size,
                        self.buffer_size,
                        gamma=self.gamma,
                        agents_num=er_config['ner_config']['max_agents'],
                        n=er_config['ner_config']['n'])
                else:
                    print('ER')
                    self.data = ExperienceReplay(self.batch_size, self.buffer_size)
        else:
            raise Exception('Please specify a policy mode!')

    def on_store(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        for on-policy training, use this function to store <s, a, r, s_, done> into a pandas DataFrame.
        """
        assert isinstance(a, np.ndarray), "on_store needs the action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "on_store needs the reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "on_store needs done to be an np.ndarray"
        if self.action_type != 'continuous':
            a = sth.action_index2one_hot(a, self.a_dim_or_list)
        self.data = self.data.append({
            's': s.astype(np.float32),
            'visual_s': visual_s.astype(np.float32),
            'a': a.astype(np.float32),
            'r': r.astype(np.float32),
            's_': s_.astype(np.float32),
            'visual_s_': visual_s_.astype(np.float32),
            'done': done.astype(np.float32)
        }, ignore_index=True)

    def off_store(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        for off-policy training, use this function to store <s, a, r, s_, done> into the replay buffer.
        """
        assert isinstance(a, np.ndarray), "off_store needs the action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "off_store needs the reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "off_store needs done to be an np.ndarray"
        if self.action_type != 'continuous':
            a = sth.action_index2one_hot(a, self.a_dim_or_list)
        self.data.add(
            s.astype(np.float32),
            visual_s.astype(np.float32),
            a.astype(np.float32),
            r.astype(np.float32),
            s_.astype(np.float32),
            visual_s_.astype(np.float32),
            done.astype(np.float32))

    def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(a, np.ndarray), "no_op_store needs the action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "no_op_store needs the reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "no_op_store needs done to be an np.ndarray"
        if self.policy_mode == 'OFF':
            if self.action_type != 'continuous':
                a = sth.action_index2one_hot(a, self.a_dim_or_list)
            self.data.add(
                s.astype(np.float32),
                visual_s.astype(np.float32),
                a.astype(np.float32),
                r[:, np.newaxis].astype(np.float32),
                s_.astype(np.float32),
                visual_s_.astype(np.float32),
                done[:, np.newaxis].astype(np.float32))

    def clear(self):
        """
        clear the DataFrame.
        """
        self.data.drop(self.data.index, inplace=True)

    def get_max_episode(self):
        """
        get the max episode of this training model.
        """
        return self.max_episode

    def get_TensorSpecs(self, *args):
        """
        get every input's shape as a tf.TensorSpec, to avoid tf.function retracing in TF 2.0.
        """
        return [tf.TensorSpec(shape=[None] + i, dtype=tf.float32) for i in args]

    @staticmethod
    def clip_nn_log_std(log_std, _min=-20, _max=2):
        """
        scale log_std from [-1, 1] to [_min, _max]
        """
        return _min + 0.5 * (_max - _min) * (log_std + 1)

    @staticmethod
    def gaussian_reparam_sample(mu, log_std):
        """
        reparameterized sampling: pi = mu + noise * std, plus its log-likelihood under N(mu, std^2)
        """
        std = tf.exp(log_std)
        pi = mu + tf.random.normal(mu.shape) * std
        log_pi = Policy.gaussian_likelihood(pi, mu, log_std)
        return pi, log_pi

    @staticmethod
    def gaussian_likelihood(x, mu, log_std):
        pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + 1e-8))**2 + 2 * log_std + np.log(2 * np.pi))
        return tf.reduce_sum(pre_sum, axis=1, keepdims=True)

    @staticmethod
    def gaussian_entropy(log_std):
        return tf.reduce_mean(0.5 * (1 + tf.math.log(2 * np.pi * tf.exp(log_std)**2)))

    @staticmethod
    def squash_action(pi, log_pi=None):
        """
        enforce action bounds: squash the action into [-1, 1] and apply the matching
        correction to the log probability.
        """
        pi = tf.tanh(pi)
        if log_pi is not None:
            sub = tf.reduce_sum(tf.math.log(Policy.clip_but_pass_gradient(1 - pi**2, l=0, h=1) + 1e-6), axis=1, keepdims=True)
            log_pi -= sub
        return pi, log_pi

    @staticmethod
    def unsquash_action(mu, pi, log_std):
        """
        unsquash the action from [-1, 1] back to (-inf, inf) and return its corrected log probability.
        """
        _pi = tf.atanh(pi)
        log_pi = Policy.gaussian_likelihood(_pi, mu, log_std)
        sub = tf.reduce_sum(tf.math.log(Policy.clip_but_pass_gradient(1 - pi**2, l=0, h=1) + 1e-6), axis=1, keepdims=True)
        log_pi -= sub
        return log_pi

    @staticmethod
    def clip_but_pass_gradient(x, l=-1., h=1.):
        """
        clip x to [l, h] in the forward pass while letting gradients pass through
        (borrowed from OpenAI Spinning Up).
        """
        clip_up = tf.cast(x > h, tf.float32)
        clip_low = tf.cast(x < l, tf.float32)
        return x + tf.stop_gradient((h - x) * clip_up + (l - x) * clip_low)