        reward = env.Act(action, 1)
        reward_total += reward

        if (not env.IsRunning()):
            break

        state_raw = env.Observation()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu", help="the GPU to use")
    args = parser.parse_args()
    if (args.gpu):
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    if (lab):
        env = EnvLab(80, 80, 60, "seekavoid_arena_01")
    else:
        env = EnvVizDoom(vizdoom_scenario)

    agent = Agent(env.NumActions())

    if (train):
        agent.Train()

    Test(agent)
        if (test_write_video):
            out_video.write(state_raw)

        reward = env.Act(action, 1)
        reward_total += reward

        if (not env.IsRunning()):
            break

        state_raw = env.Observation()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu", help="the GPU to use")
    args = parser.parse_args()
    if (args.gpu):
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    env = EnvVizDoom(vizdoom_scenario)

    agent = Agent(env.NumActions())

    if (train):
        agent.Train()

    Test(agent)
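The Worker class below hands every raw observation to a Preprocess helper that is defined elsewhere in the project and not shown in this listing. As a rough idea of what such a helper does, here is a minimal sketch, assuming an 80x80 grayscale input; the exact resolution, color conversion, and normalization are assumptions, not the project's actual implementation:

import cv2
import numpy as np

# Hypothetical stand-in for the Preprocess helper used by Worker.Process();
# the real version may use a different resolution or image library.
def Preprocess(frame, width=80, height=80):
    # Convert the RGB observation to grayscale.
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    # Downscale to the network input size and rescale pixels to [0, 1].
    resized = cv2.resize(gray, (width, height))
    return resized.astype(np.float32) / 255.0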
class Worker(object):
    def __init__(self, number, num_actions, trainer, model_name):
        self.name = "worker_" + str(number)
        self.number = number
        self.model_name = model_name

        # Create the local copy of the network and the TensorFlow op to copy
        # global parameters to the local network.
        self.local_ac = ACNet(num_actions, self.name, trainer)
        self.update_target_graph = self.update_target(global_scope_name, self.name)

        self.env = EnvVizDoom(vizdoom_scenario)

    # Copies one set of variables to another.
    # Used to set worker network parameters to those of the global network.
    def update_target(self, from_scope, to_scope):
        from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
        to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

        op_holder = []
        for from_var, to_var in zip(from_vars, to_vars):
            op_holder.append(to_var.assign(from_var))
        return op_holder

    # Calculate discounted returns.
    def Discount(self, x, gamma):
        for idx in reversed(range(len(x) - 1)):
            x[idx] += x[idx + 1] * gamma
        return x

    def Start(self, session, saver, coord):
        worker_process = lambda: self.Process(session, saver, coord)
        thread = threading.Thread(target=worker_process)
        thread.start()
        global start_time
        start_time = time.time()
        return thread

    def Train(self, episode_buffer, sess, bootstrap_value):
        episode_buffer = np.array(episode_buffer)
        states = episode_buffer[:, 0]
        actions = episode_buffer[:, 1]
        rewards = episode_buffer[:, 2]
        values = episode_buffer[:, 3]

        # Here we take the rewards and values from the episode_buffer, and use them to
        # generate the advantage and discounted returns.
        # The advantage function uses "Generalized Advantage Estimation".
        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = self.Discount(rewards_plus, gamma)[:-1]
        value_plus = np.asarray(values.tolist() + [bootstrap_value])
        advantages = rewards + gamma * value_plus[1:] - value_plus[:-1]
        advantages = self.Discount(advantages, gamma)

        # Update the global network using gradients from the loss.
        # Generate network statistics to periodically save.
        self.local_ac.Train(sess, discounted_rewards, states, actions, advantages)

    def Process(self, sess, saver, coord):
        global step, train_scores, start_time, lock

        print("Starting worker " + str(self.number))
        while (not coord.should_stop()):
            sess.run(self.update_target_graph)
            episode_buffer = []
            episode_reward = 0

            self.env.Reset()
            s = self.env.Observation()
            s = Preprocess(s)
            self.local_ac.ResetLstm()

            while (self.env.IsRunning()):
                # Take an action using probabilities from the policy network output.
                a, v = self.local_ac.GetAction(sess, s)
                r = self.env.Act(a, frame_repeat)
                finished = not self.env.IsRunning()
                if (not finished):
                    s1 = self.env.Observation()
                    s1 = Preprocess(s1)
                else:
                    s1 = None

                episode_buffer.append([s, a, r, v])
                episode_reward += r
                s = s1

                lock.acquire()
                step += 1
                if (step % save_each == 0):
                    model_name_curr = self.model_name + "_{:04}".format(int(step / save_each))
                    print("\nSaving the network weights to:", model_name_curr, file=sys.stderr)
                    saver.save(sess, model_name_curr)

                    PrintStat(time.time() - start_time, step, step_num, train_scores)
                    train_scores = []

                if (step == step_num):
                    coord.request_stop()
                lock.release()

                # If the episode hasn't ended, but the experience buffer is full, then we
                # make an update step using that experience rollout.
                if (len(episode_buffer) == t_max or (finished and len(episode_buffer) > 0)):
                    # Since we don't know what the true final return is,
                    # we "bootstrap" from our current value estimation.
                    if (not finished):
                        v1 = self.local_ac.GetValue(sess, s)
                        self.Train(episode_buffer, sess, v1)
                        episode_buffer = []
                        sess.run(self.update_target_graph)
                    else:
                        self.Train(episode_buffer, sess, 0.0)

            lock.acquire()
            train_scores.append(episode_reward)
            lock.release()
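For context, here is a minimal sketch of how these workers could be wired into a complete A3C training run. ACNet, Worker, global_scope_name, num_actions, and step_num come from the listing above; the optimizer choice, worker count, and checkpoint prefix are assumptions:

import tensorflow as tf

num_workers = 4                      # assumption: typically one worker per CPU core
model_savefile = "model_a3c/agent"   # hypothetical checkpoint prefix

tf.reset_default_graph()
trainer = tf.train.AdamOptimizer(learning_rate=1e-4)  # assumed optimizer and learning rate

# Build the shared (global) network first, then one Worker (each with its own
# local network copy and environment) per thread.
global_network = ACNet(num_actions, global_scope_name, trainer)
workers = [Worker(i, num_actions, trainer, model_savefile) for i in range(num_workers)]

saver = tf.train.Saver(max_to_keep=5)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    # Worker.Start() spawns a thread running Worker.Process() until the
    # coordinator requests a stop (after step_num training steps).
    threads = [worker.Start(sess, saver, coord) for worker in workers]
    coord.join(threads)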