def _sp_train(self, max_steps, instances, visualize, plot):
    """Trains using a single process."""
    # Keep track of rewards per episode per instance
    episode_reward_sequences = [[] for i in range(instances)]
    episode_step_sequences = [[] for i in range(instances)]
    episode_rewards = [0] * instances

    # Create and initialize environment instances
    envs = [self.create_env() for i in range(instances)]
    states = [env.reset() for env in envs]

    for step in range(max_steps):
        for i in range(instances):
            if visualize:
                envs[i].render()
            action = self.agent.act(states[i], i)
            next_state, reward, done, _ = envs[i].step(action)
            self.agent.push(
                Transition(states[i], action, reward, None if done else next_state), i)
            episode_rewards[i] += reward
            if done:
                episode_reward_sequences[i].append(episode_rewards[i])
                episode_step_sequences[i].append(step)
                episode_rewards[i] = 0
                if plot:
                    plot(episode_reward_sequences, episode_step_sequences)
                states[i] = envs[i].reset()
            else:
                states[i] = next_state
        # Perform one step of the optimization
        self.agent.train(step)

    if plot:
        plot(episode_reward_sequences, episode_step_sequences, done=True)
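# Illustrative sketch (assumption, not part of the original module): _sp_train above
# relies on a Transition container with (state, action, reward, next_state) fields,
# and the multiprocess variant further below also relies on a RewardState pair.
# If these are not imported from elsewhere in the library, minimal definitions
# could look like this:
from collections import namedtuple

# next_state is None when the episode terminated on this transition
Transition = namedtuple('Transition', ['state', 'action', 'reward', 'next_state'])
# Reward obtained together with the resulting state, as sent over pipes by subprocesses
RewardState = namedtuple('RewardState', ['reward', 'state'])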
def _train(self, max_steps):
    # Keep track of completed games and of the learning agent's buffered transitions
    n_games = 0
    SARS_buffer = []

    # Create and initialize the environment instance
    env = self.create_env(self.num_players)
    state = env.reset()
    agents = [self.agent] + [self.other_agent for _ in range(self.num_players - 1)]
    single_player = self.num_players == 1

    for step in range(max_steps):
        turn = env.turn
        action = agents[turn].act(state)
        next_state, reward, done, _ = env.step(action)
        if single_player:
            self.agent.push(Transition(state, action, reward, None if done else next_state))
        elif turn == 0:
            # The reward field temporarily stores which player produced this SARS tuple;
            # it is relabeled with the game outcome (+1 win / -1 loss) once the game ends
            SARS_buffer.append(Transition(state, action, turn, None if done else next_state))
        if done:
            if not single_player:
                reward = 1 if env.winner() == 0 else -1
                for SARS in SARS_buffer:
                    # +1 if the player who produced this pair won the game, -1 otherwise
                    win_reward = 1 if env.winner() == SARS.reward else -1
                    self.agent.push(Transition(SARS.state, SARS.action, win_reward, SARS.next_state))
                SARS_buffer = []
            n_games += 1
            self.update(n_games)
            state = env.reset()
        else:
            state = next_state
        # Perform one step of the optimization
        self.agent.train(step)

    self.update(n_games, done=True)
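# Illustrative sketch (assumption, not part of the original module): _train above
# expects a turn-based environment created via self.create_env(num_players) that,
# besides the usual reset/step API, exposes `turn` (index of the player to move)
# and `winner()` (index of the winning player once the game is over). A hypothetical
# interface stub:
class TurnBasedEnv:
    """Hypothetical interface assumed by _train for multi-player games."""

    def __init__(self, num_players):
        self.num_players = num_players
        self.turn = 0  # Index of the player whose move it is

    def reset(self):
        """Return the initial state and reset the turn counter."""
        ...

    def step(self, action):
        """Apply the current player's action; return (next_state, reward, done, info)."""
        ...

    def winner(self):
        """Return the index of the winning player; only meaningful once the game is done."""
        ...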
def _mp_train(self, max_steps, instances, visualize, plot, max_subprocesses):
    """Trains using multiple processes.

    Useful to parallelize the computation of heavy environments.
    """
    # Unless specified, set the maximum number of processes to the number of cores in the machine
    if max_subprocesses is None:
        max_subprocesses = mp.cpu_count()
    nprocesses = min(instances, max_subprocesses)

    # Split instances across processes as homogeneously as possible
    instances_per_process = [instances // nprocesses] * nprocesses
    leftover = instances % nprocesses
    if leftover > 0:
        for i in range(leftover):
            instances_per_process[i] += 1

    # Create a unique id (index) for each instance, grouped by process
    instance_ids = [
        list(range(i, instances, nprocesses))[:ipp]
        for i, ipp in enumerate(instances_per_process)
    ]

    # Create processes and pipes (one pipe for each environment instance)
    pipes = []
    processes = []
    for i in range(nprocesses):
        child_pipes = []
        for j in range(instances_per_process[i]):
            parent, child = mp.Pipe()
            pipes.append(parent)
            child_pipes.append(child)
        pargs = (cloudpickle.dumps(self.create_env), instance_ids[i], max_steps, child_pipes, visualize)
        processes.append(mp.Process(target=_train, args=pargs))

    # Start all processes
    print(f"Starting {nprocesses} process(es) for {instances} environment instance(s)... {instance_ids}")
    for p in processes:
        p.start()

    # Keep track of rewards per episode per instance
    episode_reward_sequences = [[] for i in range(instances)]
    episode_step_sequences = [[] for i in range(instances)]
    episode_rewards = [0] * instances

    # Temporarily record RewardState instances received from each subprocess
    # Each Transition instance requires two RewardState instances to be created
    rss = [None] * instances
    # Keep track of last actions sent to subprocesses
    last_actions = [None] * instances

    for step in range(max_steps):
        # Keep track of the environments for which we have already constructed a full
        # Transition instance and sent it to the agent; this synchronizes steps
        step_done = [False] * instances
        # Steps across environments are synchronized; within each step, Transitions
        # are received and processed on a first-come, first-served basis
        while sum(step_done) < instances:
            awaiting_pipes = [p for iid, p in enumerate(pipes) if not step_done[iid]]
            ready_pipes = mp.connection.wait(awaiting_pipes, timeout=None)
            pipe_indexes = [pipes.index(rp) for rp in ready_pipes]
            # Do a round-robin over processes to best divide computation
            pipe_indexes.sort()
            for iid in pipe_indexes:
                rs = pipes[iid].recv()  # Receive a RewardState
                # If we already had a RewardState for this environment then we are able
                # to create and push a Transition
                if rss[iid] is not None:
                    exp = Transition(rss[iid].state, last_actions[iid], rs.reward, rs.state)
                    self.agent.push(exp, iid)
                    step_done[iid] = True
                rss[iid] = rs
                # Check if episode is done
                if rs.state is None:
                    # Episode is done - store rewards and update plot
                    rss[iid] = None
                    episode_reward_sequences[iid].append(episode_rewards[iid])
                    episode_step_sequences[iid].append(step)
                    episode_rewards[iid] = 0
                    if plot:
                        plot(episode_reward_sequences, episode_step_sequences)
                else:
                    # Episode is NOT done - act according to state and send action to the subprocess
                    action = self.agent.act(rs.state, iid)
                    last_actions[iid] = action
                    try:
                        pipes[iid].send(action)
                    except BrokenPipeError as bpe:
                        # Disregard BrokenPipeError on the last step
                        if step < (max_steps - 1):
                            raise bpe
                    if rs.reward:
                        episode_rewards[iid] += rs.reward
        # Train the agent at the end of every synchronized step
        self.agent.train(step)

    if plot:
        plot(episode_reward_sequences, episode_step_sequences, done=True)
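# Illustrative sketch (assumption, not part of the original module): the worker
# passed as mp.Process(target=_train, ...) above is assumed to be a module-level
# function, distinct from the _train method, that unpickles the environment factory,
# steps its assigned instances, sends a RewardState after every step, and blocks on
# its pipe for the next action chosen by the parent process. The name _mp_env_worker
# and the exact message protocol are hypothetical.
import cloudpickle

def _mp_env_worker(create_env, instance_ids, max_steps, pipes, visualize):
    # RewardState is the (reward, state) pair sketched near the top of this file
    create_env = cloudpickle.loads(create_env)
    pipes = {iid: p for iid, p in zip(instance_ids, pipes)}
    envs = {iid: create_env() for iid in instance_ids}

    # Announce the initial state of each instance (no reward yet) so the parent
    # process can choose the first action
    for iid in instance_ids:
        state = envs[iid].reset()
        pipes[iid].send(RewardState(None, state))

    for step in range(max_steps):
        for iid in instance_ids:
            if visualize:
                envs[iid].render()
            action = pipes[iid].recv()  # Block until the parent sends an action
            state, reward, done, _ = envs[iid].step(action)
            # A None state signals episode termination to the parent
            pipes[iid].send(RewardState(reward, None if done else state))
            if done:
                # Start a new episode immediately and announce its initial state
                state = envs[iid].reset()
                pipes[iid].send(RewardState(None, state))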