def __init__(self, config):
    super(A3C, self).__init__(config)
    self.num_processes = multiprocessing.cpu_count()
    self.worker_processes = max(1, self.num_processes - 2)
    print(f"running {self.worker_processes} worker processes plus optimizer process plus main process")
    self.actor_critic = DRRLnet(10, 10, self.action_size,
                                n_f_conv1=12, n_f_conv2=12,
                                att_emb_size=32, n_heads=2,
                                n_att_stack=1, n_fc_layers=4,
                                pad=True, baseline_mode=False, n_baseMods=3)
    self.actor_critic_optimizer = SharedAdam(self.actor_critic.parameters(),
                                             lr=self.hyperparameters["learning_rate"], eps=1e-4)
    self.actor_critic_optimizer.zero_grad()
def __init__(self, config, agent_name_=agent_name):
    super(A3C, self).__init__(config, agent_name_=agent_name_)
    self.num_processes = multiprocessing.cpu_count()
    self.worker_processes = max(1, self.num_processes - 2)
    self.actor_critic = self.create_NN(input_dim=self.state_size, output_dim=[self.action_size, 1])
    self.actor_critic_optimizer = SharedAdam(self.actor_critic.parameters(),
                                             lr=self.hyperparameters["learning_rate"], eps=1e-4)
    self.wandb_watch(self.actor_critic, log_freq=self.config.wandb_model_log_freq)
def __init__(self, config):
    super(A3C, self).__init__(config)
    self.num_processes = multiprocessing.cpu_count()
    self.worker_processes = max(1, self.num_processes - 2)
    self.actor_critic = self.create_NN(input_dim=self.state_size, output_dim=[self.action_size, 1])
    self.actor_critic_optimizer = SharedAdam(self.actor_critic.parameters(),
                                             lr=self.hyperparameters["learning_rate"])
def __init__(self, config):
    super(A3C, self).__init__(config)
    self.num_processes = multiprocessing.cpu_count()
    self.worker_processes = max(1, self.num_processes - 2)
    model_path = self.config.model_path if self.config.model_path else 'Models'
    self.actor_critic_path = os.path.join(model_path, "{}_actor_critic.pt".format(self.agent_name))
    self.actor_critic = self.create_NN(input_dim=self.state_size, output_dim=[self.action_size, 1])
    if self.config.load_model:
        self.locally_load_policy()
    self.actor_critic_optimizer = SharedAdam(self.actor_critic.parameters(),
                                             lr=self.hyperparameters["learning_rate"], eps=1e-4)
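# Each __init__ variant above hands the network's parameters to a SharedAdam optimizer. The
# class below is a minimal sketch, in the style of common open-source A3C implementations, of
# what such an optimizer is assumed to look like: an Adam subclass whose per-parameter state is
# pre-allocated and can be moved into shared memory, so a separate optimizer process can step
# it. The SharedAdam actually used in this codebase may differ in detail.
import math

import torch


class SharedAdam(torch.optim.Adam):
    """Adam whose state tensors can live in shared memory (sketch, not the repository version)."""

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        # Create the state up front so it already exists when worker processes are started.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        # Move optimizer state into shared memory so every process sees the same moments/steps.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self, closure=None):
        # Plain Adam update written directly against the shared state tensors created above.
        loss = closure() if closure is not None else None
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step'].item()
                bias_correction2 = 1 - beta2 ** state['step'].item()
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss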
class A3C(Base_Agent):
    """Actor critic A3C algorithm from deepmind paper https://arxiv.org/pdf/1602.01783.pdf"""
    agent_name = "A3C"

    def __init__(self, config):
        super(A3C, self).__init__(config)
        self.num_processes = multiprocessing.cpu_count()
        self.worker_processes = max(1, self.num_processes - 2)
        self.actor_critic = self.create_NN(input_dim=self.state_size, output_dim=[self.action_size, 1])
        self.actor_critic_optimizer = SharedAdam(self.actor_critic.parameters(),
                                                 lr=self.hyperparameters["learning_rate"], eps=1e-4)

    def run_n_episodes(self):
        """Runs game to completion n times and then summarises results and saves model (if asked to)"""
        start = time.time()
        results_queue = Queue()
        gradient_updates_queue = Queue()
        episode_number = multiprocessing.Value('i', 0)
        self.optimizer_lock = multiprocessing.Lock()
        episodes_per_process = int(self.config.num_episodes_to_run / self.worker_processes) + 1
        processes = []
        self.actor_critic.share_memory()
        self.actor_critic_optimizer.share_memory()
        optimizer_worker = multiprocessing.Process(target=self.update_shared_model,
                                                   args=(gradient_updates_queue,))
        optimizer_worker.start()
        for process_num in range(self.worker_processes):
            worker = Actor_Critic_Worker(process_num, copy.deepcopy(self.environment), self.actor_critic,
                                         episode_number, self.optimizer_lock, self.actor_critic_optimizer,
                                         self.config, episodes_per_process,
                                         self.hyperparameters["epsilon_decay_rate_denominator"],
                                         self.action_size, self.action_types, results_queue,
                                         copy.deepcopy(self.actor_critic), gradient_updates_queue)
            worker.start()
            processes.append(worker)
        self.print_results(episode_number, results_queue)
        for worker in processes:
            worker.join()
        optimizer_worker.kill()
        time_taken = time.time() - start
        return self.game_full_episode_scores, self.rolling_results, time_taken

    def print_results(self, episode_number, results_queue):
        """Worker that prints out results as they get put into a queue"""
        while True:
            with episode_number.get_lock():
                carry_on = episode_number.value < self.config.num_episodes_to_run
            if carry_on:
                if not results_queue.empty():
                    self.total_episode_score_so_far = results_queue.get()
                    self.save_and_print_result()
            else:
                break

    def update_shared_model(self, gradient_updates_queue):
        """Worker that updates the shared model with gradients as they get put into the queue"""
        while True:
            gradients = gradient_updates_queue.get()
            with self.optimizer_lock:
                self.actor_critic_optimizer.zero_grad()
                for grads, params in zip(gradients, self.actor_critic.parameters()):
                    params._grad = grads  # maybe need to do grads.clone()
                self.actor_critic_optimizer.step()
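# update_shared_model() above is the consumer end of gradient_updates_queue. The helper below
# is a hypothetical sketch of the producer end, i.e. roughly what an Actor_Critic_Worker is
# assumed to do (that class is defined elsewhere and not shown here): backpropagate the
# actor-critic loss through the worker's local copy of the network, clone the resulting .grad
# tensors, and enqueue them for the optimizer process to apply to the shared model.
import torch


def push_gradients(local_model, loss, gradient_updates_queue):
    """Hypothetical helper: ship one set of gradients to the shared-model optimizer process."""
    local_model.zero_grad()
    loss.backward()
    # Clone so the queued tensors stay valid independently of the worker's next backward pass;
    # parameters untouched by this loss get zero gradients instead of None.
    gradients = [p.grad.clone() if p.grad is not None else torch.zeros_like(p)
                 for p in local_model.parameters()]
    gradient_updates_queue.put(gradients)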
class A3C(Base_Agent):
    """Actor critic A3C algorithm from deepmind paper https://arxiv.org/pdf/1602.01783.pdf"""
    agent_name = "A3C"

    def __init__(self, config):
        super(A3C, self).__init__(config)
        self.num_processes = multiprocessing.cpu_count()
        self.worker_processes = max(1, self.num_processes - 2)
        model_path = self.config.model_path if self.config.model_path else 'Models'
        self.actor_critic_path = os.path.join(model_path, "{}_actor_critic.pt".format(self.agent_name))
        self.actor_critic = self.create_NN(input_dim=self.state_size, output_dim=[self.action_size, 1])
        if self.config.load_model:
            self.locally_load_policy()
        self.actor_critic_optimizer = SharedAdam(self.actor_critic.parameters(),
                                                 lr=self.hyperparameters["learning_rate"], eps=1e-4)

    def run_n_episodes(self):
        """Runs game to completion n times and then summarises results and saves model (if asked to)"""
        start = time.time()
        results_queue = Queue()
        gradient_updates_queue = Queue()
        episode_number = multiprocessing.Value('i', 0)
        self.optimizer_lock = multiprocessing.Lock()
        episodes_per_process = int(self.config.num_episodes_to_run / self.worker_processes) + 1
        processes = []
        self.actor_critic.share_memory()
        self.actor_critic_optimizer.share_memory()
        optimizer_worker = multiprocessing.Process(target=self.update_shared_model,
                                                   args=(gradient_updates_queue,))
        optimizer_worker.start()
        for process_num in range(self.worker_processes):
            worker = Actor_Critic_Worker(process_num, copy.deepcopy(self.environment), self.actor_critic,
                                         episode_number, self.optimizer_lock, self.actor_critic_optimizer,
                                         self.config, episodes_per_process,
                                         self.hyperparameters["epsilon_decay_rate_denominator"],
                                         self.action_size, self.action_types, results_queue,
                                         copy.deepcopy(self.actor_critic), gradient_updates_queue)
            worker.start()
            processes.append(worker)
        self.print_results(episode_number, results_queue)
        for worker in processes:
            worker.join()
        # optimizer_worker.kill()
        optimizer_worker.terminate()
        if self.config.save_model:
            self.locally_save_policy()
        time_taken = time.time() - start
        return self.game_full_episode_scores, self.rolling_results, time_taken

    def run_test(self, show_whether_achieved_goal=True, save_and_print_results=True):
        """Runs a single test episode with one worker, then renders the environment and optionally
        saves the plot and the model"""
        start = time.time()
        results_queue = Queue()
        gradient_updates_queue = Queue()
        episode_number = multiprocessing.Value('i', 0)
        self.optimizer_lock = multiprocessing.Lock()
        episodes_per_process = int(self.config.num_episodes_to_run / self.worker_processes) + 1
        processes = []
        self.actor_critic.share_memory()
        self.actor_critic_optimizer.share_memory()
        optimizer_worker = multiprocessing.Process(target=self.update_shared_model,
                                                   args=(gradient_updates_queue,))
        optimizer_worker.start()
        for process_num in range(1):
            worker = Actor_Critic_Worker(process_num, copy.deepcopy(self.environment), self.actor_critic,
                                         episode_number, self.optimizer_lock, self.actor_critic_optimizer,
                                         self.config, 1,
                                         self.hyperparameters["epsilon_decay_rate_denominator"],
                                         self.action_size, self.action_types, results_queue,
                                         copy.deepcopy(self.actor_critic), gradient_updates_queue)
            worker.start()
            processes.append(worker)
        self.print_results(episode_number, results_queue)
        for worker in processes:
            worker.join()
        # optimizer_worker.kill()
        optimizer_worker.terminate()
        if self.config.save_model:
            self.locally_save_policy()
        plt.cla()
        print("id", self.config.environment._position_history)
        self.config.environment.render_all()
        if self.config.run_test_path:
            plt.savefig(self.config.run_test_path.format(self.agent_name))  # , bbox_inches="tight"
        plt.show()
        time_taken = time.time() - start
        return self.game_full_episode_scores, self.rolling_results, time_taken

    def print_results(self, episode_number, results_queue):
        """Worker that prints out results as they get put into a queue"""
        while True:
            with episode_number.get_lock():
                carry_on = episode_number.value < self.config.num_episodes_to_run
            if carry_on:
                if not results_queue.empty():
                    self.total_episode_score_so_far = results_queue.get()
                    self.save_and_print_result()
            else:
                break

    def update_shared_model(self, gradient_updates_queue):
        """Worker that updates the shared model with gradients as they get put into the queue"""
        while True:
            gradients = gradient_updates_queue.get()
            with self.optimizer_lock:
                self.actor_critic_optimizer.zero_grad()
                for grads, params in zip(gradients, self.actor_critic.parameters()):
                    params._grad = grads  # maybe need to do grads.clone()
                self.actor_critic_optimizer.step()

    def locally_save_policy(self):
        """Saves the policy"""
        # Save the policy; further handling still to be added
        torch.save(self.actor_critic.state_dict(), self.actor_critic_path)

    def locally_load_policy(self):
        print("locally_load_policy")
        if os.path.isfile(self.actor_critic_path):
            print("load actor_critic_path")
            self.actor_critic.load_state_dict(torch.load(self.actor_critic_path))
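# A self-contained demonstration of the two sharing primitives the class above relies on (this
# example is illustrative and uses nothing from the repository): model.share_memory() puts the
# network's parameters into shared memory so a child process's in-place writes are visible to
# the parent, and multiprocessing.Value('i', 0) with get_lock() gives the processes a shared,
# race-free counter like episode_number above.
import torch
import torch.multiprocessing as mp


def child(shared_model, counter):
    with torch.no_grad():
        for p in shared_model.parameters():
            p.add_(1.0)          # in-place write lands in the shared storage
    with counter.get_lock():
        counter.value += 1       # same increment pattern the workers use for episode_number


if __name__ == "__main__":
    model = torch.nn.Linear(2, 2)
    model.share_memory()         # same call as self.actor_critic.share_memory() above
    counter = mp.Value('i', 0)
    before = model.weight.clone()
    proc = mp.Process(target=child, args=(model, counter))
    proc.start()
    proc.join()
    print("weights changed by child:", not torch.equal(before, model.weight))
    print("episodes counted:", counter.value)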