Code example #1
File: simulation.py  Project: zyuchuan/huskarl
    def _sp_train(self, max_steps, instances, visualize, plot):
        """Trains using a single process."""
        # Keep track of rewards per episode per instance
        episode_reward_sequences = [[] for i in range(instances)]
        episode_step_sequences = [[] for i in range(instances)]
        episode_rewards = [0] * instances

        # Create and initialize environment instances
        envs = [self.create_env() for i in range(instances)]
        states = [env.reset() for env in envs]

        for step in range(max_steps):
            for i in range(instances):
                if visualize: envs[i].render()
                action = self.agent.act(states[i], i)
                next_state, reward, done, _ = envs[i].step(action)
                self.agent.push(
                    Transition(states[i], action, reward,
                               None if done else next_state), i)
                episode_rewards[i] += reward
                if done:
                    episode_reward_sequences[i].append(episode_rewards[i])
                    episode_step_sequences[i].append(step)
                    episode_rewards[i] = 0
                    if plot:
                        plot(episode_reward_sequences, episode_step_sequences)
                    states[i] = envs[i].reset()
                else:
                    states[i] = next_state
            # Perform one step of the optimization
            self.agent.train(step)

        if plot:
            plot(episode_reward_sequences, episode_step_sequences, done=True)
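The loop above only relies on a small agent interface (act, push, train) and a Transition record. The self-contained sketch below shows minimal stand-ins for those pieces so the excerpt can be read on its own; the Transition namedtuple and the RandomAgent class are illustrative assumptions, not the actual huskarl implementations.

from collections import namedtuple
import random

# Minimal stand-ins (assumptions, not huskarl's classes) for the pieces _sp_train uses
Transition = namedtuple('Transition', ['state', 'action', 'reward', 'next_state'])

class RandomAgent:
    """Picks random actions and stores experience; stands in for a real agent."""
    def __init__(self, n_actions):
        self.n_actions = n_actions
        self.memory = []

    def act(self, state, instance=0):
        # Choose an action; a real agent would query its policy here
        return random.randrange(self.n_actions)

    def push(self, transition, instance=0):
        # Store the transition; a real agent would add it to a replay buffer
        self.memory.append(transition)

    def train(self, step):
        # No-op; a real agent would sample its memory and update its model here
        pass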
Code example #2
	def _train(self, max_steps):
		# Keep track of finished games; buffer transitions until the winner is known
		n_games = 0
		SARS_buffer = []

		# Create and initialize the environment and the list of agents
		env = self.create_env(self.num_players)
		state = env.reset()
		agents = [self.agent] + [self.other_agent for _ in range(self.num_players - 1)]
		single_player = self.num_players == 1

		for step in range(max_steps):
			turn = env.turn
			action = agents[turn].act(state)
			next_state, reward, done, _ = env.step(action)
			if single_player:
				self.agent.push(Transition(state, action, reward, None if done else next_state))
			elif turn == 0:
				SARS_buffer.append(Transition(state, action, turn, None if done else next_state))

			if done:
				if not single_player:
					# The reward slot of each buffered transition holds the turn index of the
					# player who acted; relabel it as +1 if that player won the game, else -1
					for SARS in SARS_buffer:
						win_reward = 1 if env.winner() == SARS.reward else -1
						self.agent.push(Transition(SARS.state, SARS.action, win_reward, SARS.next_state))
					SARS_buffer = []

				n_games += 1
				self.update(n_games)
				state = env.reset()
			else:
				state = next_state
			# Perform one step of the optimization
			self.agent.train(step)

		self.update(n_games, done=True)
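The relabeling step at the end of each game can be read in isolation. The sketch below assumes the same convention as above: buffered transitions carry the acting player's turn index in the reward field until the winner is known. The Transition namedtuple and the relabel helper are illustrative assumptions, not part of the original project.

from collections import namedtuple

Transition = namedtuple('Transition', ['state', 'action', 'reward', 'next_state'])

def relabel(sars_buffer, winner):
    # Replace the stored turn index with +1 if that player won the game, else -1
    return [Transition(t.state, t.action, 1 if winner == t.reward else -1, t.next_state)
            for t in sars_buffer]

# Two buffered moves by player 0; player 1 ends up winning, so both rewards become -1
buffer = [Transition('s0', 'a0', 0, 's1'), Transition('s2', 'a1', 0, None)]
print(relabel(buffer, winner=1))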
Code example #3
File: simulation.py  Project: zyuchuan/huskarl
    def _mp_train(self, max_steps, instances, visualize, plot,
                  max_subprocesses):
        """Trains using multiple processes.
		
		Useful to parallelize the computation of heavy environments.
		"""
        # Unless specified, set the maximum number of processes to the number of cores in the machine
        if max_subprocesses is None:
            max_subprocesses = mp.cpu_count()
        nprocesses = min(instances, max_subprocesses)

        # Split instances into processes as homogeneously as possible
        instances_per_process = [instances // nprocesses] * nprocesses
        leftover = instances % nprocesses
        for i in range(leftover):
            instances_per_process[i] += 1

        # Create a unique id (index) for each instance, grouped by process
        instance_ids = [
            list(range(i, instances, nprocesses))[:ipp]
            for i, ipp in enumerate(instances_per_process)
        ]
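        # e.g. 6 instances over 4 processes -> [[0, 4], [1, 5], [2], [3]]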

        # Create processes and pipes (one pipe for each environment instance)
        pipes = []
        processes = []
        for i in range(nprocesses):
            child_pipes = []
            for j in range(instances_per_process[i]):
                parent, child = mp.Pipe()
                pipes.append(parent)
                child_pipes.append(child)
            pargs = (cloudpickle.dumps(self.create_env), instance_ids[i],
                     max_steps, child_pipes, visualize)
            processes.append(mp.Process(target=_train, args=pargs))

        # Start all processes
        print(
            f"Starting {nprocesses} process(es) for {instances} environment instance(s)... {instance_ids}"
        )
        for p in processes:
            p.start()

        # Keep track of rewards per episode per instance
        episode_reward_sequences = [[] for i in range(instances)]
        episode_step_sequences = [[] for i in range(instances)]
        episode_rewards = [0] * instances

        # Temporarily record RewardState instances received from each subprocess
        # Each Transition instance requires two RewardState instances to be created
        rss = [None] * instances

        # Keep track of last actions sent to subprocesses
        last_actions = [None] * instances

        for step in range(max_steps):

            # Keep track from which environments we have already constructed a full Transition instance
            # and sent it to agent. This is to synchronize steps.
            step_done = [False] * instances

            # Steps across environments are synchronized
            while sum(step_done) < instances:

                # Within each step, Transitions are received and processed on a first-come first-served basis
                awaiting_pipes = [p for iid, p in enumerate(pipes) if not step_done[iid]]
                ready_pipes = mp.connection.wait(awaiting_pipes, timeout=None)
                pipe_indexes = [pipes.index(rp) for rp in ready_pipes]

                # Do a round-robin over processes to best divide computation
                pipe_indexes.sort()
                for iid in pipe_indexes:
                    rs = pipes[iid].recv()  # Receive a RewardState

                    # If we already had a RewardState for this environment then we are able to create and push a Transition
                    if rss[iid] is not None:
                        exp = Transition(rss[iid].state, last_actions[iid],
                                         rs.reward, rs.state)
                        self.agent.push(exp, iid)
                        step_done[iid] = True
                    rss[iid] = rs

                    # Check if episode is done
                    if rs.state is None:
                        # Episode is done - store rewards and update plot
                        rss[iid] = None
                        episode_reward_sequences[iid].append(
                            episode_rewards[iid])
                        episode_step_sequences[iid].append(step)
                        episode_rewards[iid] = 0
                        if plot:
                            plot(episode_reward_sequences,
                                 episode_step_sequences)
                    else:
                        # Episode is NOT done - act according to state and send action to the subprocess
                        action = self.agent.act(rs.state, iid)
                        last_actions[iid] = action
                        try:
                            pipes[iid].send(action)
                        # Disregard BrokenPipeError on last step
                        except BrokenPipeError as bpe:
                            if step < (max_steps - 1): raise bpe
                        if rs.reward: episode_rewards[iid] += rs.reward

            # Train the agent at the end of every synchronized step
            self.agent.train(step)

        if plot:
            plot(episode_reward_sequences, episode_step_sequences, done=True)
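The driver above relies on a worker function _train and a RewardState message type defined elsewhere in simulation.py. The sketch below shows the pipe handshake the driver assumes: each worker announces a state, waits for an action, then replies with a RewardState whose state is None when the episode ends. This is a hedged reconstruction of that protocol under those assumptions, not huskarl's actual worker code.

from collections import namedtuple
import cloudpickle

RewardState = namedtuple('RewardState', ['reward', 'state'])

def _train(create_env_pickle, instance_ids, max_steps, pipes, visualize):
    # Rebuild the environment factory that the parent serialized with cloudpickle
    create_env = cloudpickle.loads(create_env_pickle)
    envs = [create_env() for _ in instance_ids]

    # Announce the initial states (no reward yet) so the parent can choose actions
    for env, pipe in zip(envs, pipes):
        pipe.send(RewardState(None, env.reset()))

    for _ in range(max_steps):
        for env, pipe in zip(envs, pipes):
            if visualize:
                env.render()
            action = pipe.recv()  # wait for the parent's action for this instance
            state, reward, done, _ = env.step(action)
            # A state of None tells the parent the episode is over
            pipe.send(RewardState(reward, None if done else state))
            if done:
                # Immediately send the reset state so the next step can proceed
                pipe.send(RewardState(None, env.reset()))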