Example #1
import numpy as np

# MultiEnvironment and Env are assumed to be defined in the surrounding module.


def get_trajectories(batch_size=32,
                     timesteps=10,
                     policy='random',
                     random_start=False,
                     training=True):
    envs = MultiEnvironment([Env() for _ in range(batch_size)])
    t_states, t_rewards, t_dones, t_actions = [], [], [], []
    # Sample random initial actions for the first step
    actions = np.random.randint(envs.action_space.n, size=(batch_size, ))
    for t in range(timesteps):
        states, rewards, dones, _ = envs.step(actions)
        if policy == 'random':
            actions = np.random.randint(envs.action_space.n,
                                        size=(batch_size, ))
        elif policy == 'repeat':
            # Each env repeats the same fixed action (i % n) every step.
            actions = [i % envs.action_space.n for i in range(batch_size)]
        t_states.append(states)
        t_rewards.append(rewards)
        t_dones.append(dones)
        t_actions.append(actions)
    # Reshape to (batch_size, timesteps, ...)
    states = np.swapaxes(t_states, 0, 1)
    rewards = np.swapaxes(t_rewards, 0, 1)
    dones = np.swapaxes(t_dones, 0, 1)
    actions = np.swapaxes(t_actions, 0, 1)
    return states, rewards, dones, actions
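
A minimal usage sketch for this variant (a hypothetical call; Env and MultiEnvironment must be importable from the surrounding module):

states, rewards, dones, actions = get_trajectories(batch_size=4,
                                                   timesteps=5,
                                                   policy='repeat')
# Everything comes back batch-major after the swapaxes above:
assert actions.shape == (4, 5)  # (batch_size, timesteps)
assert dones.shape == (4, 5)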
Example #2
import numpy as np


def get_trajectories(batch_size=32, timesteps=10, policy=None):
    # 'policy' is accepted for API parity but unused in this variant;
    # actions are always sampled uniformly at random.
    envs = MultiEnvironment([BoxesEnv() for _ in range(batch_size)])
    t_states, t_rewards, t_dones, t_actions = [], [], [], []
    for t in range(timesteps):
        actions = np.random.randint(envs.action_space.n, size=(batch_size, ))
        states, rewards, dones, _ = envs.step(actions)
        t_states.append(states)
        t_rewards.append(rewards)
        t_dones.append(dones)
        t_actions.append(actions)
    # Reshape to (batch_size, timesteps, ...)
    states = np.swapaxes(t_states, 0, 1)
    rewards = np.swapaxes(t_rewards, 0, 1)
    dones = np.swapaxes(t_dones, 0, 1)
    actions = np.swapaxes(t_actions, 0, 1)
    return states, rewards, dones, actions
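
None of these snippets define MultiEnvironment. A minimal stand-in for experimentation, assuming a classic Gym-style per-env step() that returns (state, reward, done, info) (a sketch of the apparent interface, not the original class):

import numpy as np

class MultiEnvironment:
    """Steps a list of Gym-style envs in lockstep and stacks their outputs."""

    def __init__(self, envs):
        self.envs = envs
        self.action_space = envs[0].action_space  # assumed shared across envs

    def reset(self):
        return np.stack([env.reset() for env in self.envs])

    def step(self, actions):
        results = [env.step(a) for env, a in zip(self.envs, actions)]
        states, rewards, dones, infos = zip(*results)
        return np.stack(states), np.array(rewards), np.array(dones), list(infos)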
Example #3
import numpy as np


class HeuristicPolicy:
    def __init__(self):
        # The original snippet begins mid-__init__; this header is an assumed
        # reconstruction. num_actions is assumed to mirror the shared action
        # space of the envs created below.
        self.num_actions = envs.action_space.n
        self.prev_action = np.random.randint(self.num_actions)

    def step(self, state):
        flip = np.random.random()
        if flip > 0.90:
            action = 1  # mash the 'shoot' button
        elif flip > 0.25:
            action = self.prev_action  # usually repeat the last action
        else:
            action = np.random.randint(self.num_actions)
        self.prev_action = action  # remember for the 'repeat' branch
        return action


MAX_BATCH_SIZE = 32
envs = MultiEnvironment([GameEnv() for _ in range(MAX_BATCH_SIZE)])
states = envs.reset()
def get_trajectories(batch_size=32, timesteps=10, policy=None, random_start=False):
    global states

    PolicyFn = policy or HeuristicPolicy
    policies = [PolicyFn() for _ in range(batch_size)]

    t_states, t_rewards, t_dones, t_actions = [], [], [], []
    for t in range(timesteps):
        actions = [p.step(s) for p, s in zip(policies, states)]
        actions = actions[:batch_size]  # hack: fixed env count, variable batch size
        new_states, rewards, dones, _ = envs.step(actions)
        t_states.append(new_states)
        for i in range(batch_size):
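
The example is cut off mid-loop. A hedged sketch of how the rollout presumably finishes, modeled on the bookkeeping in Examples #1 and #2 (an assumption, not the original code):

        # Assumed remainder of the loop body and function:
        t_rewards.append(rewards)
        t_dones.append(dones)
        t_actions.append(actions)
        states = new_states  # policies act on the latest observations next step
    # Reshape to (batch_size, timesteps, ...) as in the other examples
    states_out = np.swapaxes(t_states, 0, 1)
    rewards_out = np.swapaxes(t_rewards, 0, 1)
    dones_out = np.swapaxes(t_dones, 0, 1)
    actions_out = np.swapaxes(t_actions, 0, 1)
    return states_out, rewards_out, dones_out, actions_out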