def worker(env_name, proc_num, state_sender, result_sender, action_receiver,
           reset_receiver, motion_receiver):
    """
    Serve one ``HpDartEnv`` instance to a parent process over connections.

    Every loop iteration performs, in order: receive a reset flag, reset the
    environment if requested, send the current state, receive an action,
    step the environment, and send ``(reward, is_done)``.

    Reset flag values read from ``reset_receiver``:
        0 : go on (no reset)
        1 : soft reset (without motion change)
        2 : hard reset (with motion change); a ``(goals, qs)`` pair is read
            from ``motion_receiver`` and handed to ``env.update_target``
            before the reset

    NOTE(review): a second ``worker`` with a 5-parameter signature is defined
    later in this file; if both live at module level the later one rebinds
    the name -- confirm which definition is intended.

    :type env_name: str
    :type proc_num: int
    :param proc_num: index of this worker (not used inside the loop)
    :param state_sender: object whose ``send`` receives the current state
    :type result_sender: Connection
    :type action_receiver: Connection
    :param reset_receiver: object whose ``recv`` yields the reset flag
    :param motion_receiver: object whose ``recv`` yields ``(goals, qs)``
    :return: does not return normally; the loop has no exit condition
    """
    soft_reset, hard_reset = 1, 2

    simulator = HpDartEnv(env_name)
    # Until the first reset is requested there is no state to report.
    observation = None

    while True:
        flag = reset_receiver.recv()
        if flag in (soft_reset, hard_reset):
            if flag == hard_reset:
                # A hard reset swaps the target motion before resetting.
                goals, qs = motion_receiver.recv()
                simulator.update_target(goals, qs)
            observation = simulator.reset()

        state_sender.send(observation)

        chosen_action = action_receiver.recv()
        observation, reward, is_done, _ = simulator.step(chosen_action)
        result_sender.send((reward, is_done))
def worker(env_name, proc_num, result_sender, action_receiver, reset_receiver):
    """
    Serve one ``HpDartEnv`` instance to a parent process over connections.

    Protocol, as implemented below:
      1. Wait for one message on ``reset_receiver``; reset the environment
         if it is truthy.
      2. Loop forever: send ``env.state()`` on ``result_sender``, receive an
         action, step, then send ``(reward, is_done)`` on the same
         ``result_sender``.
      3. Whenever ``is_done`` is truthy, wait for another message on
         ``reset_receiver`` and reset if it is truthy.

    NOTE(review): an earlier ``worker`` with a 7-parameter signature is
    defined above in this file; if both live at module level this definition
    rebinds the name -- confirm which one callers expect.

    :type env_name: str
    :type proc_num: int
    :param proc_num: index of this worker (not used by the active code)
    :type result_sender: Connection
    :type action_receiver: Connection
    :param reset_receiver: object whose ``recv`` yields the reset request
    :return: does not return normally; the loop has no exit condition
    """
    simulator = HpDartEnv(env_name)

    def reset_if_requested():
        # Blocks until the parent answers; only a truthy answer resets.
        if reset_receiver.recv():
            simulator.reset()

    reset_if_requested()

    # Step counter kept from the original; only useful while debugging.
    steps_taken = 0
    while True:
        # States and step results share the same outgoing connection.
        result_sender.send(simulator.state())

        simulator.step(action_receiver.recv())
        steps_taken += 1

        status = (simulator.reward(), simulator.is_done())
        result_sender.send(status)

        if status[1]:
            reset_if_requested()
class HpPPO(object):
    """
    PPO-style actor-critic agent built on a TensorFlow 1.x graph.

    The constructor creates the placeholders, a value network, a trainable
    actor network plus a non-trainable "old" copy, the clipped-surrogate and
    value losses, and one Adam optimizer for each of them. Transitions are
    collected from ``self.env`` (an ``HpDartEnv``); helpers for driving worker
    processes over pipes are provided but are not called by
    ``generate_transitions``.
    """

    def __init__(self, session, env_name='walk', num_slaves=1):
        """
        :param session: TensorFlow session used to run the graph
        :param env_name: name handed to ``HpDartEnv`` and to worker processes
        :type env_name: str
        :param num_slaves: number of environment slots / worker processes
        :type num_slaves: int
        """
        self.sess = session
        # Kept so that init_envs can start workers on the same environment.
        self.env_name = env_name
        self.env = HpDartEnv(env_name)
        self.num_slaves = num_slaves

        self.num_state = self.env.observation_space.shape[0]
        self.num_action = self.env.action_space.shape[0]
        self.action_bound = [
            self.env.action_space.low,
            self.env.action_space.high,
        ]

        self.num_train = 0

        self.layer_size = [128, 64]
        self.error_mag = 0.1
        self.num_epoches = 10
        self.sample_size = 2048
        self.batch_size = 128
        self.gamma = 0.95
        self.td_lambda = 0.95
        self.clip_ratio = 0.2

        # set memory and episodes
        self.replay_buffer = Replay()
        self.total_episodes = list()  # type: list[Episode]

        # set variables
        with tf.variable_scope('state'):
            self.state = tf.placeholder(tf.float32,
                                        shape=[None, self.num_state])
        with tf.variable_scope('action'):
            self.action = tf.placeholder(tf.float32,
                                         shape=[None, self.num_action])
        with tf.variable_scope('target_value'):
            self.y = tf.placeholder(tf.float32, shape=[None, 1])
        with tf.variable_scope('advantages'):
            self.advantages = tf.placeholder(tf.float32, shape=[None, 1])

        # build networks
        self.value = self.build_value_net()
        self.actor, self.actor_param = self.build_actor_net(
            'actor_net', trainable=True)
        self.actor_old, self.actor_old_param = self.build_actor_net(
            'actor_old', trainable=False)

        # Ops copying the current actor parameters into the old actor.
        self.syn_old_pi = [
            oldp.assign(p)
            for p, oldp in zip(self.actor_param, self.actor_old_param)
        ]

        self.sample_op = tf.clip_by_value(
            tf.squeeze(self.actor.sample(1), axis=0),
            self.action_bound[0], self.action_bound[1])

        # set loss function
        with tf.variable_scope('critic_loss'):
            self.adv = self.y - self.value
            self.critic_loss = tf.reduce_mean(tf.square(self.adv))

        with tf.variable_scope('actor_loss'):
            ratio = self.actor.prob(self.action) / self.actor_old.prob(
                self.action)
            clipped_ratio = tf.clip_by_value(
                ratio, 1. - self.clip_ratio, 1. + self.clip_ratio)
            # Both branches of the minimum are weighted by the advantage.
            surrogate = tf.minimum(ratio * self.advantages,
                                   clipped_ratio * self.advantages)
            # The surrogate is an objective to maximize, so the loss handed
            # to the optimizer is its negation.
            self.actor_loss = -tf.reduce_mean(surrogate)

        # set optimizer
        self.value_step_size = 1e-2
        self.value_optimizer = tf.train.AdamOptimizer(self.value_step_size)
        self.train_critic = self.value_optimizer.minimize(self.critic_loss)

        self.policy_step_size = 1e-4
        self.policy_optimizer = tf.train.AdamOptimizer(self.policy_step_size)
        # The policy update uses its own optimizer and step size.
        self.train_policy = self.policy_optimizer.minimize(self.actor_loss)

        # for evaluation
        self.num_eval = 0

        # for multiprocessing
        self.state_sender = []  # type: list[Connection]
        self.result_sender = []  # type: list[Connection]
        self.state_receiver = []  # type: list[Connection]
        self.result_receiver = []  # type: list[Connection]
        self.action_sender = []  # type: list[Connection]
        self.reset_sender = []  # type: list[Connection]
        self.motion_sender = []  # type: list[Connection]
        self.envs = []  # type: list[Process]

    def init_envs(self):
        """
        Start ``num_slaves`` worker processes and keep their pipe ends.

        NOTE(review): the argument tuple matches the 7-parameter ``worker``
        (env_name, proc_num, state_sender, result_sender, action_receiver,
        reset_receiver, motion_receiver). This file also defines a
        5-parameter ``worker`` afterwards -- confirm which definition is
        bound when this runs.
        """
        for slave_idx in range(self.num_slaves):
            s_s, s_r = Pipe()
            r_s, r_r = Pipe()
            a_s, a_r = Pipe()
            reset_s, reset_r = Pipe()
            motion_s, motion_r = Pipe()

            p = Process(target=worker,
                        args=(self.env_name, slave_idx, s_s, r_s, a_r,
                              reset_r, motion_r))

            self.state_sender.append(s_s)
            self.result_sender.append(r_s)
            self.state_receiver.append(s_r)
            self.result_receiver.append(r_r)
            self.action_sender.append(a_s)
            self.reset_sender.append(reset_s)
            self.motion_sender.append(motion_s)
            self.envs.append(p)
            p.start()

    def envs_get_states(self, terminated):
        """
        Collect one state per worker.

        :param terminated: per-worker flags; a truthy flag skips the receive
        :return: list of states, with a zero vector of length ``num_state``
                 for every terminated worker
        """
        states = []
        for idx, receiver in enumerate(self.state_receiver):
            if terminated[idx]:
                states.append([0.] * self.num_state)
            else:
                states.append(receiver.recv())
        return states

    def envs_send_actions(self, actions, terminated):
        """Send ``actions[i]`` to every worker whose flag is not set."""
        for idx, sender in enumerate(self.action_sender):
            if not terminated[idx]:
                sender.send(actions[idx])

    def envs_get_status(self, terminated):
        """
        Collect one ``(reward, is_done)`` pair per worker.

        :param terminated: per-worker flags; a truthy flag yields
                           ``(0., True)`` without receiving
        :return: ``zip`` of the transposed pairs (rewards, then done flags)
        """
        status = []
        for idx, receiver in enumerate(self.result_receiver):
            if terminated[idx]:
                status.append((0., True))
            else:
                status.append(receiver.recv())
        return zip(*status)

    def envs_resets(self, reset_flag):
        """Send ``reset_flag`` to every worker."""
        for sender in self.reset_sender:
            sender.send(reset_flag)

    def envs_reset(self, i, reset_flag):
        """Send ``reset_flag`` to worker ``i`` only."""
        self.reset_sender[i].send(reset_flag)

    def build_value_net(self):
        """
        Build the state-value network on ``self.state``.

        :return: output tensor of the final single-unit linear layer
        """
        with tf.variable_scope('value_net'):
            value_dl1 = tf.contrib.layers.fully_connected(
                inputs=self.state,
                num_outputs=self.layer_size[0],
                activation_fn=tf.nn.relu,
                scope='value_dl1')
            value_dl2 = tf.contrib.layers.fully_connected(
                inputs=value_dl1,
                num_outputs=self.layer_size[1],
                activation_fn=tf.nn.relu,
                scope='value_dl2')
            value = tf.contrib.layers.fully_connected(
                inputs=value_dl2,
                num_outputs=1,
                activation_fn=None,
                scope='value')
        return value

    def build_actor_net(self, scope, trainable):
        """
        Build a Gaussian policy network on ``self.state``.

        :param scope: variable scope name for the network
        :param trainable: forwarded to every layer
        :return: ``(distribution, variables)`` where the distribution is a
                 ``Normal`` with learned mean and softplus-activated scale,
                 and the variables are the global variables under ``scope``
        """
        with tf.variable_scope(scope):
            actor_dl1 = tf.contrib.layers.fully_connected(
                inputs=self.state,
                num_outputs=self.layer_size[0],
                activation_fn=tf.nn.relu,
                trainable=trainable,
                scope='dl1')
            actor_dl2 = tf.contrib.layers.fully_connected(
                inputs=actor_dl1,
                num_outputs=self.layer_size[1],
                activation_fn=tf.nn.relu,
                trainable=trainable,
                scope='dl2')
            mu = tf.contrib.layers.fully_connected(
                inputs=actor_dl2,
                num_outputs=self.num_action,
                activation_fn=None,
                trainable=trainable,
                scope='mu')
            sigma = tf.contrib.layers.fully_connected(
                inputs=actor_dl2,
                num_outputs=self.num_action,
                activation_fn=tf.nn.softplus,
                trainable=trainable,
                scope='sigma')
            actor_dist = tf.contrib.distributions.Normal(loc=mu, scale=sigma)
        param = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope)
        return actor_dist, param

    def get_action(self, s):
        """
        Sample a clipped action for ``s``.

        A leading batch axis is added to ``s`` before it is fed to the graph.
        """
        return self.sess.run(self.sample_op,
                             feed_dict={self.state: s[np.newaxis, :]})

    def get_v(self, s):
        """
        Evaluate the value network on ``s``.

        :return: element ``[0, 0]`` of the network output; inputs with fewer
                 than two dimensions get a leading batch axis first
        """
        if s.ndim < 2:
            s = s[np.newaxis, :]
        return self.sess.run(self.value, feed_dict={self.state: s})[0, 0]

    def train(self):
        """Run one collect-then-optimize iteration."""
        self.generate_transitions()
        self.optimize_model()
        self.num_train += 1

    def generate_transitions(self):
        """
        Roll out ``self.env`` until at least ``sample_size`` transitions have
        been stored and every slot has finished its current episode.

        Finished episodes are appended (as deep copies) to
        ``self.total_episodes``, which is emptied first. The pipe-based
        ``envs_*`` helpers are not used here; stepping goes through
        ``self.env`` directly.

        NOTE(review): ``get_action`` adds a batch axis to ``states`` and
        ``get_v`` returns a single element, yet the loop below indexes
        ``states[j]`` and ``values[j]`` per slot -- confirm the shapes
        returned by ``self.env.GetStates()``.
        """
        del self.total_episodes[:]
        episodes = [Episode() for _ in range(self.num_slaves)]
        terminated = [False] * self.num_slaves

        self.env.Resets(True)
        local_step = 0
        while True:
            states = self.env.GetStates()
            actions = np.asarray(self.get_action(states))
            values = self.get_v(states)
            # NOTE(review): this builds a new symbolic op on every iteration
            # and is never run through the session, so the stored entries
            # are tensors of probabilities rather than numeric
            # log-probabilities -- confirm the intended op.
            logprobs = self.actor.prob(actions)

            __, reward, is_done, info = self.env.step(actions.flatten())
            rewards = [reward]
            is_dones = [is_done]

            for j in range(self.num_slaves):
                if terminated[j]:
                    continue

                nan_occur = np.any(np.isnan(states[j])) or np.any(
                    np.isnan(actions[j]))
                if not nan_occur:
                    episodes[j].append((states[j], actions[j], rewards[j],
                                        values[j], logprobs[j]))
                    # Count stored transitions; without this the exit
                    # condition below can never become true.
                    local_step += 1

                if is_dones[j] or nan_occur:
                    self.total_episodes.append(deepcopy(episodes[j]))
                    if local_step < self.sample_size:
                        # Budget not reached yet: start a fresh episode.
                        episodes[j] = Episode()
                        self.env.reset()
                    else:
                        terminated[j] = True

            if local_step >= self.sample_size and all(terminated):
                break

    def optimize_model(self):
        """
        Fill the replay buffer from the collected episodes, then run
        ``num_epoches`` rounds of value and policy updates.
        """
        self.compute_td_gae()

        for _ in range(self.num_epoches):
            transitions = self.replay_buffer.sample(self.batch_size)
            batch = list(zip(*transitions))
            # TD targets of the sampled batch; not consumed yet because the
            # update methods below are still stubs.
            td = batch[3]

            self.update_value()
            self.update_policy()

    def compute_td_gae(self):
        """
        Compute TD targets and advantages for every collected episode and
        append ``(state, action, logprob, td_target, advantage)`` tuples to
        ``self.replay_buffer``.

        The advantage recursion is
        ``ad_t = delta_t + gamma * td_lambda * ad_{t+1}`` with
        ``delta_t = r_t + gamma * v_{t+1} - v_t`` and a value of zero after
        the last transition. Episodes without transitions are skipped.
        """
        for epi in self.total_episodes:
            len_epi = len(epi)
            if len_epi == 0:
                # An episode can be stored empty when NaN shows up on its
                # first step; there is nothing to unpack in that case.
                continue

            states, actions, rewards, values, logprobs = zip(*epi)
            # Append a zero as the value following the final transition.
            values = np.concatenate((values, np.zeros(1)), axis=0)
            advantages = np.zeros(len_epi)

            ad_t = 0
            for i in reversed(range(len_epi)):
                delta = rewards[i] + values[i + 1] * self.gamma - values[i]
                ad_t = delta + self.gamma * self.td_lambda * ad_t
                advantages[i] = ad_t

            td_targets = values[:len_epi] + advantages
            for transition in zip(states, actions, logprobs, td_targets,
                                  advantages):
                self.replay_buffer.append(transition)

    def update_value(self):
        """Value-network update step (not implemented yet)."""
        pass

    def update_policy(self):
        """Policy-network update step (not implemented yet)."""
        pass

    def evaluate(self):
        """
        Run one episode with the mean action of the policy and print a
        summary.

        :return: ``(total_reward, total_step)`` accumulated over the steps
                 taken before the environment reported done
        """
        self.num_eval += 1
        total_reward = 0
        total_step = 0

        # Build the ops once instead of adding new graph nodes every step.
        mean_op = self.actor.mean()
        stddev_op = self.actor.stddev()

        self.env.Reset(False, 0)
        state = self.env.GetState(0)
        while True:
            action = np.asarray(
                self.sess.run(mean_op,
                              feed_dict={self.state: [state]})).flatten()
            state, reward, is_done, info = self.env.step(action)
            if is_done:
                break
            total_step += 1
            total_reward += reward

        print('noise: ',
              self.sess.run(stddev_op, feed_dict={self.state: [state]}))
        if total_step > 0:
            print('Epi reward : {:.2f}, Step reward : {:.2f} Total step : {}'.
                  format(total_reward, total_reward / total_step, total_step))
        else:
            print('bad')
        return total_reward, total_step