import gym
import numpy as np
import tensorflow as tf

# NOTE: GAME, N_FEATURE, N_ACTION, A_BOUND, N_WORKER, MAX_EPISODE, MAX_STEP,
# UPDATE_CIRCLE and the shared objects GLOBAL_EPISODE, GLOBAL_REWARD and
# LOCK_STEP come from the surrounding script; a sketch of that wiring appears
# at the end of this section.


class Brain(object):
    """Global network: gathers gradients from the workers and hands back
    the updated parameters."""

    def __init__(self):
        self.env = gym.make(GAME)
        self.actor = Actor(N_FEATURE, N_ACTION, A_BOUND)
        self.critic = Critic(N_FEATURE, N_ACTION)
        self.actor.build(input_shape=(None, self.actor.n_feature))
        self.critic.build(input_shape=(None, self.critic.n_feature))

    def work(self, queues_para, queues_buffer):
        # broadcast the initial parameters so every worker starts in sync
        self.syn_global_para(queues_para)
        while GLOBAL_EPISODE.value < MAX_EPISODE:
            for i in range(N_WORKER):
                # blocking round-robin handshake: take one gradient batch from
                # worker i, apply it, and send the fresh parameters back
                grad_ac, grad_ct = queues_buffer[i].get()
                self.actor.apply_grad(grad_ac)
                self.critic.apply_grad(grad_ct)
                para_ac = self.actor.get_weights()
                para_ct = self.critic.get_weights()
                queues_para[i].put([para_ac, para_ct])

    def syn_global_para(self, queues_para):
        para_ac = self.actor.get_weights()
        para_ct = self.critic.get_weights()
        for i in range(N_WORKER):
            queues_para[i].put([para_ac, para_ct])
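# ---------------------------------------------------------------------------
# The Actor and Critic networks are defined elsewhere in the original script.
# Below is a minimal sketch of the interface Brain and Worker assume
# (choose_action, compute_loss, apply_grad, plus Keras get/set_weights).
# The layer sizes, learning rates, the GAMMA constant and the entropy weight
# are illustrative assumptions, and this version uses a one-step TD target
# rather than whatever return the original computes.
# ---------------------------------------------------------------------------
GAMMA = 0.9  # assumed discount factor


class Actor(tf.keras.Model):
    """Gaussian policy for a continuous action space."""

    def __init__(self, n_feature, n_action, a_bound):
        super().__init__()
        self.n_feature = n_feature
        self.a_bound = a_bound
        self.l1 = tf.keras.layers.Dense(64, activation='relu')
        self.mu = tf.keras.layers.Dense(n_action, activation='tanh')
        self.sigma = tf.keras.layers.Dense(n_action, activation='softplus')
        self.opt = tf.keras.optimizers.RMSprop(1e-4)

    def call(self, s):
        h = self.l1(s)
        return self.mu(h) * self.a_bound, self.sigma(h) + 1e-5

    def choose_action(self, s):
        mu, sigma = self(s)
        a = mu + sigma * tf.random.normal(tf.shape(mu))
        return tf.clip_by_value(a, -self.a_bound, self.a_bound).numpy()

    def compute_loss(self, s, a, td_error):
        with tf.GradientTape() as tape:
            mu, sigma = self(s)
            # log-probability of the taken actions under the Gaussian policy
            log_prob = (-0.5 * tf.square((a[:, tf.newaxis] - mu) / sigma)
                        - tf.math.log(sigma) - 0.5 * np.log(2.0 * np.pi))
            entropy = 0.5 * tf.math.log(2.0 * np.pi * np.e * tf.square(sigma))
            # policy gradient: maximise log-prob weighted by the advantage
            loss = -tf.reduce_mean(log_prob * tf.stop_gradient(td_error)
                                   + 0.01 * entropy)
        # convert to numpy so the gradients can be pickled through mp.Queue
        return [g.numpy() for g in tape.gradient(loss, self.trainable_weights)]

    def apply_grad(self, grads):
        self.opt.apply_gradients(
            [(tf.convert_to_tensor(g), w)
             for g, w in zip(grads, self.trainable_weights)])


class Critic(tf.keras.Model):
    """State-value network; its TD error doubles as the actor's advantage."""

    def __init__(self, n_feature, n_action):
        super().__init__()
        self.n_feature = n_feature
        self.l1 = tf.keras.layers.Dense(64, activation='relu')
        self.v = tf.keras.layers.Dense(1)
        self.opt = tf.keras.optimizers.RMSprop(2e-4)

    def call(self, s):
        return self.v(self.l1(s))

    def compute_loss(self, s, r, s_):
        with tf.GradientTape() as tape:
            v = self(s)                      # V(s)
            v_ = tf.stop_gradient(self(s_))  # bootstrap target V(s')
            td_error = r[:, tf.newaxis] + GAMMA * v_ - v
            loss = tf.reduce_mean(tf.square(td_error))
        grads = [g.numpy() for g in tape.gradient(loss, self.trainable_weights)]
        return tf.stop_gradient(td_error), grads

    def apply_grad(self, grads):
        self.opt.apply_gradients(
            [(tf.convert_to_tensor(g), w)
             for g, w in zip(grads, self.trainable_weights)])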
class Worker(object):
    """Local actor-learner: interacts with its own copy of the environment,
    pushes gradients to the Brain and pulls back the latest parameters."""

    def __init__(self, name):
        self.name = name
        self.env = gym.make(GAME)
        self.actor = Actor(N_FEATURE, N_ACTION, A_BOUND)
        self.critic = Critic(N_FEATURE, N_ACTION)
        self.actor.build(input_shape=(None, self.actor.n_feature))
        self.critic.build(input_shape=(None, self.critic.n_feature))

    def work(self, queue_para, queue_buffer):
        buffer_s, buffer_r, buffer_a, buffer_s_ = [], [], [], []
        # pull the initial global parameters
        self.update_para(queue_para)
        while GLOBAL_EPISODE.value < MAX_EPISODE:
            episode_reward = 0
            observation = self.env.reset()
            for episode_step in range(1, MAX_STEP):
                if self.name == 'worker_0':
                    self.env.render()
                observation = observation[np.newaxis, :]
                action = self.actor.choose_action(observation)
                observation_, reward, done, _ = self.env.step(action)
                # Pendulum never signals termination on its own; end the
                # episode at MAX_STEP - 1 instead
                done = (episode_step == MAX_STEP - 1)
                observation = np.squeeze(observation)
                action = np.squeeze(action, axis=(0, 1))
                reward = np.squeeze(reward, axis=0)
                observation_ = np.squeeze(observation_)
                episode_reward += reward
                buffer_s.append(observation)
                buffer_a.append(action)
                buffer_r.append((reward + 8) / 8)  # scale Pendulum's reward (roughly [-16, 0]) towards [-1, 1]
                buffer_s_.append(observation_)
                if episode_step % UPDATE_CIRCLE == 0 or done:
                    s = tf.convert_to_tensor(buffer_s, dtype=tf.float32)
                    a = tf.convert_to_tensor(buffer_a, dtype=tf.float32)
                    r = tf.convert_to_tensor(buffer_r, dtype=tf.float32)
                    s_ = tf.convert_to_tensor(buffer_s_, dtype=tf.float32)
                    # ship local gradients to the Brain, then pull back the
                    # freshly updated global parameters
                    td_error, grad_ct = self.critic.compute_loss(s, r, s_)
                    grad_ac = self.actor.compute_loss(s, a, td_error)
                    queue_buffer.put([grad_ac, grad_ct])
                    self.update_para(queue_para)
                    buffer_s.clear()
                    buffer_a.clear()
                    buffer_r.clear()
                    buffer_s_.clear()
                if done:
                    with LOCK_STEP:
                        i = GLOBAL_EPISODE.value
                        if i == 0:
                            GLOBAL_REWARD[i] = episode_reward
                        else:
                            # exponential moving average of the episode reward
                            GLOBAL_REWARD[i] = 0.9 * GLOBAL_REWARD[i - 1] + 0.1 * episode_reward
                        print('name: {}, global episode: {}, average reward: {}'.format(
                            self.name, i, GLOBAL_REWARD[i]))
                        GLOBAL_EPISODE.value += 1
                    break
                observation = observation_

    def update_para(self, queue_para):
        para_ac, para_ct = queue_para.get()
        self.actor.set_weights(para_ac)
        self.critic.set_weights(para_ct)
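# ---------------------------------------------------------------------------
# A minimal sketch of the main-process wiring the classes above expect; the
# concrete values are illustrative assumptions, not taken from the original
# script. With the fork start method (Linux/macOS) the shared objects created
# here are visible as module globals inside the worker processes.
# ---------------------------------------------------------------------------
import multiprocessing as mp

GAME = 'Pendulum-v0'
N_FEATURE = 3          # Pendulum observation: cos(theta), sin(theta), theta_dot
N_ACTION = 1
A_BOUND = 2.0          # Pendulum torque limit
N_WORKER = 4
MAX_EPISODE = 2000
MAX_STEP = 200
UPDATE_CIRCLE = 10


def worker_job(name, queue_para, queue_buffer):
    Worker(name).work(queue_para, queue_buffer)


if __name__ == '__main__':
    GLOBAL_EPISODE = mp.Value('i', 0)           # shared episode counter
    GLOBAL_REWARD = mp.Array('d', MAX_EPISODE)  # shared moving-average reward log
    LOCK_STEP = mp.Lock()

    # one parameter queue (Brain -> worker) and one gradient queue
    # (worker -> Brain) per worker
    queues_para = [mp.Queue(1) for _ in range(N_WORKER)]
    queues_buffer = [mp.Queue(1) for _ in range(N_WORKER)]

    workers = [mp.Process(target=worker_job,
                          args=('worker_%d' % i, queues_para[i], queues_buffer[i]))
               for i in range(N_WORKER)]
    for w in workers:
        w.start()

    # the global network runs in the main process
    Brain().work(queues_para, queues_buffer)

    for w in workers:
        w.join()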