Example #1
    def collect_trajectories(self, tmax, nrand=5):

        # number of parallel instances
        n = len(self.envs.ps)

        # initialize returning lists and start the game!
        state_list = []
        reward_list = []
        prob_list = []
        action_list = []

        self.envs.reset()

        # start all parallel agents
        self.envs.step([1] * n)

        # perform nrand random steps
        for _ in range(nrand):
            fr1, re1, _, _ = self.envs.step(
                np.random.choice([pong_utils.RIGHT, pong_utils.LEFT], n))
            fr2, re2, _, _ = self.envs.step([0] * n)

        for t in range(tmax):

            # prepare the input
            # preprocess_batch properly converts two frames into
            # shape (n, 2, 80, 80), the proper input for the policy
            # this is required when building CNN with pytorch
            batch_input = pong_utils.preprocess_batch([fr1, fr2])

            # probs will only be used as pi_old;
            # no gradient propagation is needed,
            # so we detach it from the computational graph
            probs_tensor = self.policy(batch_input).detach()
            m = Categorical(probs_tensor)
            action = m.sample().unsqueeze(1)
            probs = torch.gather(probs_tensor, 1, action)
            action = action.cpu().numpy()

            # advance the game: take the sampled action (offset by 4 onto
            # RIGHT/LEFT), then a no-op (0) to skip the game forward one frame
            fr1, re1, is_done, _ = self.envs.step(action + 4)
            fr2, re2, is_done, _ = self.envs.step([0] * n)

            reward = re1 + re2

            # store the result
            state_list.append(batch_input)
            reward_list.append(reward)
            prob_list.append(probs)
            action_list.append(action)

            # stop if any of the trajectories is done
            # we want all the lists to be rectangular
            if is_done.any():
                break

        # return probabilities (pi_old), states, actions, rewards
        return prob_list, state_list, \
               action_list, reward_list
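
The probabilities detached above are intended to serve as pi_old when the trajectories are consumed by a policy update. As a rough illustration of such a consumer (not part of this example), here is a minimal clipped-surrogate sketch; the name clipped_surrogate, the (T, n) tensor shapes and the assumption that the policy returns a two-way RIGHT/LEFT distribution are all assumptions, not taken from the code above.

import numpy as np
import torch

def clipped_surrogate(policy, old_probs, states, actions, rewards,
                      discount=0.995, epsilon=0.1):
    old_probs = torch.stack(old_probs).squeeze(-1)                  # (T, n)
    device = old_probs.device

    # discounted rewards-to-go, normalized across the parallel agents
    rewards = np.asarray(rewards, dtype=np.float32)                 # (T, n)
    discounts = discount ** np.arange(len(rewards), dtype=np.float32)
    future = np.flip(np.cumsum(np.flip(rewards * discounts[:, None], 0), 0), 0)
    future = (future - future.mean(1, keepdims=True)) / (future.std(1, keepdims=True) + 1e-10)
    future = torch.tensor(future.copy(), dtype=torch.float, device=device)

    actions = torch.tensor(np.stack(actions), dtype=torch.long, device=device)  # (T, n, 1)

    # probability of the stored actions under the current policy
    new_probs = torch.stack(
        [policy(s).gather(1, a).squeeze(-1) for s, a in zip(states, actions)])  # (T, n)

    # PPO clipped objective (to be maximized)
    ratio = new_probs / old_probs
    clipped = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)
    return torch.min(ratio * future, clipped * future).mean()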
Example #2
def act(self, policy):
    # stack the two stored frames and query the policy for action probabilities
    batch_input = pong_utils.preprocess_batch([self.f1, self.f2])
    probs = policy(batch_input)
    dist = torch.distributions.Categorical(probs=probs)
    action = dist.sample().item()
    # translate the discrete action into a velocity vector
    if action == 0:
        moving = [0, -5]
    elif action == 1:
        moving = [4, -3]
    else:
        moving = [-4, -3]
    return moving
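
These snippets all rely on pong_utils.preprocess_batch to turn two consecutive raw frames into the (n, 2, 80, 80) tensor the CNN policy expects. The helper below is only a rough stand-in written for illustration; the crop offsets, background colour and frame shapes are assumptions, not the actual pong_utils implementation.

import numpy as np
import torch

def preprocess_batch_sketch(frames, bkg_color=np.array([144, 72, 17])):
    # frames: two batches of raw Atari frames, each of shape (n, 210, 160, 3)
    batch = []
    for f in frames:
        f = np.asarray(f, dtype=np.float32)
        # crop the scoreboard, downsample by 2, remove the background colour
        # and collapse the colour channels
        img = np.mean(f[:, 34:-16:2, ::2] - bkg_color, axis=-1) / 255.0
        batch.append(img)                                   # (n, 80, 80)
    # stack the two frames along a channel axis -> (n, 2, 80, 80)
    return torch.from_numpy(np.stack(batch, axis=1)).float()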
Example #3
def test_env(vis=False):
    f1 = env.reset()
    f2, _, _, _ = env.step(0)
    if vis: env.render()
    done = False
    total_reward = 0
    while not done:
        state = preprocess_batch([f1, f2])
        pi, _ = model(state)
        dist = Categorical(pi)
        # sample a single action and repeat it for the two frames
        action = dist.sample().cpu().numpy()[0]
        f1, r1, done, _ = env.step(action)
        f2, r2, done, _ = env.step(action)
        reward = r1 + r2

        if vis: env.render()
        total_reward += reward
        # print(total_reward)
    return total_reward
def collect_trajectories(envs, agent, tmax=200, nrand=5):
    '''
    Collect trajectories of multiple agents of a parallelized environment
    '''
    n = len(envs.ps)

    state_list = []
    reward_list = []
    prob_list = []
    action_list = []

    envs.reset()
    envs.step([1] * n)  # Start all parallel agents

    # perform nrand random steps
    for _ in range(nrand):
        fr1, re1, _, _ = envs.step(np.random.choice([RIGHT, LEFT], n))
        fr2, re2, _, _ = envs.step([0] * n)

    for t in range(tmax):

        batch_input = preprocess_batch([fr1, fr2])

        # the policy outputs P(RIGHT) for each agent; sample RIGHT/LEFT from a
        # Bernoulli draw and keep the probability of the action actually taken
        probs = agent(batch_input).squeeze().cpu().detach().numpy()
        action = np.where(np.random.rand(n) < probs, RIGHT, LEFT)
        probs = np.where(action == RIGHT, probs, 1.0 - probs)

        fr1, re1, is_done, _ = envs.step(action)
        fr2, re2, is_done, _ = envs.step([0] * n)

        reward = re1 + re2

        # Collect the result
        state_list.append(batch_input)
        reward_list.append(reward)
        prob_list.append(probs)
        action_list.append(action)

        # Stop if any of the trajectories is done
        if is_done.any():
            break

    return prob_list, state_list, action_list, reward_list
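
For context, a sketch of how this variant might be driven from a training script. The parallelEnv constructor arguments, the Policy class, device and the surrogate function are all placeholders for pieces assumed to exist elsewhere in the original code; only collect_trajectories comes from the snippet above.

import numpy as np
import torch.optim as optim

envs = parallelEnv('PongDeterministic-v4', n=8, seed=1234)   # assumed wrapper
policy = Policy().to(device)                                 # assumed CNN policy
optimizer = optim.Adam(policy.parameters(), lr=1e-4)

for episode in range(500):
    old_probs, states, actions, rewards = collect_trajectories(envs, policy, tmax=320)

    total_rewards = np.sum(rewards, axis=0)     # final score of each parallel agent

    # maximize the (assumed) REINFORCE/PPO surrogate built from the rollout
    L = -surrogate(policy, old_probs, states, actions, rewards)
    optimizer.zero_grad()
    L.backward()
    optimizer.step()

    if (episode + 1) % 20 == 0:
        print(episode + 1, np.mean(total_rewards))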
def test_env(vis=False):
    # print("testtest")
    f1 = env.reset()
    f2, _, _, _ = env.step(env.action_space.sample())
    if vis: env.render()
    done = False
    total_reward = 0
    while not done:
        state = preprocess_batch([f1, f2])
        # print(state.shape)
        pi, _ = model(state)
        # print(pi)
        action = Categorical(pi).sample().cpu().numpy()
        # print(action)
        f1, r1, done, _ = env.step(action)
        f2, r2, done, _ = env.step(action)
        reward = r1 + r2

        if vis: env.render()
        total_reward += reward
        print(total_reward)
    return total_reward
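
When test_env is used for evaluation, a common pattern is to average the score over several episodes instead of trusting a single rollout. A small sketch, assuming model and test_env from the snippet above; the episode count and the eval/no_grad wrapping are additions for illustration.

import torch

model.eval()
with torch.no_grad():                     # evaluation only, no gradients needed
    scores = [test_env(vis=False) for _ in range(10)]
print('mean reward over 10 episodes:', sum(scores) / len(scores))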
Example #6
early_stop = False

while not early_stop:
    log_probs = []
    values = []
    states = []
    actions = []
    rewards = []
    masks = []
    next_states = []

    for _ in range(num_steps):
        # collect experience; this uses the GPU, but there still doesn't seem
        # to be room for DataParallel here
        # -> ah, the state is probably not a single vector: it's 16 of them,
        #    so the GPU is worth using, and DataParallel might work too

        state = preprocess_batch(state)

        dist, value = model(state)
        # print(value, value.shape , "value")
        # m = Categorical(dist)

        action = dist.sample()

        next_state, reward, done, _ = envs.step(
            action.cpu().numpy())  # done doesn't end the loop here??

        # logging.warning(f'dist[action] : {dist[action]}')
        # print(action)
        log_prob = dist.log_prob(action)  #torch.log(dist[action])
        log_probs.append(log_prob)
        values.append(value)
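
The collection loop above is cut off before the stored values, rewards and masks are used. A common continuation of this pattern (not necessarily what the author wrote) is generalized advantage estimation over the finished rollout:

def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    # walk the rollout backwards, accumulating the GAE advantage and
    # turning it into a return target by adding the value baseline back in
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns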