Example #1

            # calculate features
            features, next_hidden_state_h, next_hidden_state_c = agent.feature_net(map_state, depth_state, goal_state, hidden_state_h, hidden_state_c)

            # calculate action and state values
            dist, value, std = agent.ac_model(features)

            total_std.append(std[1].cpu().numpy())


            action = dist.sample()

            # action is an (x, 1) tensor that contains the sampled action for each environment
            # .cpu() moves it from a GPU tensor to a CPU tensor
            next_map_state, next_depth_state, next_goal_state, reward, done, _ = envs.step(action.cpu().numpy())

            # count reached goal and reset stacked frames
            for i in range(0, num_envs):
                if done[i]:
                    number_of_episodes += 1
                    if reward[i] >= 0.2:
                        number_reached_goal += 1
                        reach_goal.append(1)
                    else:
                        reach_goal.append(0)

                    _, stacked_map_frames = reset_single_frame(stacked_map_frames, next_map_state[i], stack_size, i)
                    _, stacked_depth_frames = reset_single_frame(stacked_depth_frames, next_depth_state[i], stack_size, i)
                    _, stacked_goal_frames = reset_single_frame(stacked_goal_frames, next_goal_state[i], stack_size, i)
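The helper reset_single_frame is not part of this excerpt. A minimal sketch of what it could look like, assuming the stacked frames are kept in a NumPy array of shape (num_envs, stack_size, ...) and that a finished environment's stack is refilled with copies of its first new observation (only the signature and return pattern are taken from the calls above, everything else is an assumption):

import numpy as np

def reset_single_frame(stacked_frames, frame, stack_size, index):
    # assumption: stacked_frames has shape (num_envs, stack_size, *frame_shape)
    # and frame is the first observation of the new episode for environment `index`
    stacked_frames[index] = np.repeat(frame[np.newaxis, ...], stack_size, axis=0)
    # returned as a pair to match the "_, stacked_map_frames = ..." call pattern above
    return stacked_frames[index], stacked_frames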
Example #2
while not early_stop:

    log_probs = []
    values = []
    states = []
    actions = []
    rewards = []
    masks = []

    for _ in range(NB_STEP):
        state = torch.FloatTensor(state)
        value = model.predict_value(state)
        action = model.get_action(state)
        action = action.squeeze(0)
        next_state, reward, done, _ = envs.step(action)
        log_prob = model.get_log_prob(state, action)

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1))
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1))

        states.append(state)
        actions.append(action)

        state = next_state
        frame_idx += 1

        if frame_idx % 1000 == 0:
            test_reward = np.mean([test_env() for _ in range(10)])
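test_env() is referenced above but not defined in this excerpt. A minimal sketch under the assumption that a single (non-vectorized) evaluation environment env and the same model are available in scope:

def test_env():
    # play one evaluation episode with the current policy and return its total reward
    state = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        state_t = torch.FloatTensor(state).unsqueeze(0)
        action = model.get_action(state_t)
        state, reward, done, _ = env.step(action.squeeze(0))
        total_reward += reward
    return total_reward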
Example #3
def main():
    mode = "regular"
    num_envs = 16

    def make_env():
        def _thunk():
            env = MiniPacman(mode, 1000)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape

    #a2c hyperparams:
    gamma = 0.99
    entropy_coef = 0.01
    value_loss_coef = 0.5
    max_grad_norm = 0.5
    num_steps = 5
    num_frames = int(10e3)

    #rmsprop hyperparams:
    lr = 7e-4
    eps = 1e-5
    alpha = 0.99

    #Init a2c and rmsprop
    actor_critic = ActorCritic(envs.observation_space.shape,
                               envs.action_space.n)
    optimizer = optim.RMSprop(actor_critic.parameters(),
                              lr,
                              eps=eps,
                              alpha=alpha)

    #if USE_CUDA:
    #    actor_critic = actor_critic.cuda()

    rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape)
    #rollout.cuda()

    all_rewards = []
    all_losses = []

    state = envs.reset()
    state = torch.FloatTensor(np.float32(state))

    rollout.states[0].copy_(state)

    episode_rewards = torch.zeros(num_envs, 1)
    final_rewards = torch.zeros(num_envs, 1)

    for i_update in tqdm(range(num_frames)):

        for step in range(num_steps):
            action = actor_critic.act(autograd.Variable(state))

            next_state, reward, done, _ = envs.step(
                action.squeeze(1).cpu().data.numpy())

            reward = torch.FloatTensor(reward).unsqueeze(1)
            episode_rewards += reward
            masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            #if USE_CUDA:
            #    masks = masks.cuda()

            state = torch.FloatTensor(np.float32(next_state))
            rollout.insert(step, state, action.data, reward, masks)

        _, next_value = actor_critic(
            autograd.Variable(rollout.states[-1], volatile=True))
        next_value = next_value.data

        returns = rollout.compute_returns(next_value, gamma)

        logit, action_log_probs, values, entropy = actor_critic.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1))

        values = values.view(num_steps, num_envs, 1)
        action_log_probs = action_log_probs.view(num_steps, num_envs, 1)
        advantages = autograd.Variable(returns) - values

        value_loss = advantages.pow(2).mean()
        action_loss = -(autograd.Variable(advantages.data) *
                        action_log_probs).mean()

        optimizer.zero_grad()
        loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
        loss.backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), max_grad_norm)
        optimizer.step()

        if i_update % num_frames == 0:
            all_rewards.append(final_rewards.mean())
            all_losses.append(loss.item())

            #clear_output(True)
            plt.figure(figsize=(20, 5))
            plt.subplot(131)
            plt.title('epoch %s. reward: %s' %
                      (i_update, np.mean(all_rewards[-10:])))
            plt.plot(all_rewards)
            plt.subplot(132)
            plt.title('loss %s' % all_losses[-1])
            plt.plot(all_losses)
            plt.show()

        rollout.after_update()

    torch.save(actor_critic.state_dict(), "actor_critic_" + mode)

    import time

    def displayImage(image, step, reward):
        #clear_output(True)
        s = "step: " + str(step) + " reward: " + str(reward)
        plt.figure(figsize=(10, 3))
        plt.title(s)
        plt.imshow(image)
        plt.show()
        time.sleep(0.1)

    env = MiniPacman(mode, 1000)

    done = False
    state = env.reset()
    total_reward = 0
    step = 1

    while not done:
        current_state = torch.FloatTensor(state).unsqueeze(0)
        #if USE_CUDA:
        #    current_state = current_state.cuda()

        action = actor_critic.act(autograd.Variable(current_state))

        next_state, reward, done, _ = env.step(action.data[0, 0])
        total_reward += reward
        state = next_state

        image = torch.FloatTensor(state).permute(1, 2, 0).cpu().numpy()
        displayImage(image, step, total_reward)
        step += 1
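RolloutStorage is not shown in this example. A minimal sketch that matches how it is used above (insert(step, ...), compute_returns(next_value, gamma), after_update(), and the states/actions buffers), assuming discrete actions stored as indices; this is an assumption, not the original class:

class RolloutStorage:
    def __init__(self, num_steps, num_envs, state_shape):
        self.num_steps = num_steps
        self.states = torch.zeros(num_steps + 1, num_envs, *state_shape)
        self.actions = torch.zeros(num_steps, num_envs, 1).long()
        self.rewards = torch.zeros(num_steps, num_envs, 1)
        self.masks = torch.ones(num_steps + 1, num_envs, 1)

    def insert(self, step, state, action, reward, mask):
        # store the transition produced at `step`
        self.states[step + 1].copy_(state)
        self.actions[step].copy_(action)
        self.rewards[step].copy_(reward)
        self.masks[step + 1].copy_(mask)

    def after_update(self):
        # carry the last state/mask over as the start of the next rollout
        self.states[0].copy_(self.states[-1])
        self.masks[0].copy_(self.masks[-1])

    def compute_returns(self, next_value, gamma):
        # n-step discounted returns, bootstrapped from the value of the last state;
        # masks zero out the bootstrap across episode boundaries
        returns = torch.zeros(self.num_steps + 1, *self.rewards.shape[1:])
        returns[-1] = next_value
        for step in reversed(range(self.num_steps)):
            returns[step] = returns[step + 1] * gamma * self.masks[step + 1] + self.rewards[step]
        return returns[:-1]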
Example #4
if __name__ == '__main__':
    envs = [make_env for i in range(N_ENVS)]
    envs = SubprocVecEnv(envs)
    obs = envs.reset()
    print("OBSERVATION ", obs[0])
    obs = obs.reshape(-1)
    obs_shape = obs.shape
    envs = VecNormalize(envs, obs_shape, ob=False, gamma=0.99)

    n_steps = 100
    bar = ProgBar(n_steps, bar_char='█')
    for i_episode in range(2):
        ## reinitialize the environment
        observation = envs.reset()
        ## run the simulation for n_steps timesteps
        for t in range(n_steps):
            ##  value, is_rate, is_producer, is_open
            actions_inje = [[randint(410, 430), False, False, True]
                            for _ in range(8)]
            actions_prod = [[randint(220, 250), False, True, True]
                            for _ in range(4)]
            ## Advance the simulation forward
            observation, reward, done, observation_full = \
                envs.step([(actions_inje + actions_prod) for _ in range(N_ENVS)])
            # print (reward)
            bar.update()
            if done.any():
                print("Episode finished after {} timesteps".format(t + 1))
                break
    envs.close()
class RolloutCollector:
    def __init__(self, num_env_workers, make_env_func, agent, batch_size,
                 rollout_length, num_recurrence_steps, state_shape,
                 action_shape, stats):
        ''' -one agent is assigned to a collector.
            -a collector runs a bunch of envs in parallel to feed to that agent
            -you could run a bunch of collectors simultaneously,
                |-  and then use weight mixing on the agents separately
        '''
        self.num_env_workers = num_env_workers
        self.envs = SubprocVecEnv(
            [make_env_func() for i in range(num_env_workers)])
        self.agent = agent
        self.batch_size = batch_size
        self.rollout_length = rollout_length
        self.num_recurrence_steps = num_recurrence_steps
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.stats = stats

        self.buffer_full = False
        self.GAE_calculated = False

        self.gamma = 0.8
        self.tau = 0.8

        self.rollout_indices = np.zeros(batch_size)
        self.buffer_width = self.rollout_length + self.num_recurrence_steps - 1
        self.states = torch.zeros(
            (batch_size, self.buffer_width + 1, *state_shape),
            dtype=torch.float32).to(self.agent.device)
        self.actions = torch.zeros(
            (batch_size, self.buffer_width + 1, *action_shape),
            dtype=torch.float32).to(self.agent.device)
        self.log_probs = torch.zeros(
            (batch_size, self.buffer_width + 1, *action_shape),
            dtype=torch.float32).to(self.agent.device)
        self.values = torch.zeros((batch_size, self.buffer_width + 1, 1),
                                  dtype=torch.float32).to(self.agent.device)
        self.rewards = torch.zeros((batch_size, self.buffer_width + 1, 1),
                                   dtype=torch.float32).to(self.agent.device)
        self.done_masks = torch.zeros(
            (batch_size, self.buffer_width + 1, 1),
            dtype=torch.float32).to(self.agent.device)
        self.advantages = torch.zeros(
            (batch_size, self.buffer_width + 1, 1),
            dtype=torch.float32).to(self.agent.device)
        self.returns = torch.zeros((batch_size, self.buffer_width + 1, 1),
                                   dtype=torch.float32).to(self.agent.device)

        self.state = self.envs.reset()
        self.hidden_state = torch.zeros(
            (1, self.num_env_workers,
             self.agent.hidden_state_size)).to(self.agent.device)
        self.cell_state = torch.zeros(
            (1, self.num_env_workers,
             self.agent.hidden_state_size)).to(self.agent.device)

    def collect_samples(self):
        if self.buffer_full:
            raise Exception(
                "tried to collect more samples when buffer already full")
        num_runs_to_full = math.ceil(self.batch_size / self.num_env_workers)
        with torch.no_grad():

            self.hidden_state = torch.zeros(
                (1, self.num_env_workers,
                 self.agent.hidden_state_size)).to(self.agent.device)
            self.cell_state = torch.zeros(
                (1, self.num_env_workers,
                 self.agent.hidden_state_size)).to(self.agent.device)

            for collection_run in range(num_runs_to_full):
                start_index = collection_run * self.num_env_workers
                end_index_exclusive = min(start_index + self.num_env_workers,
                                          self.batch_size)
                run_indices = torch.arange(start_index,
                                           end_index_exclusive,
                                           dtype=torch.long)
                worker_indices = run_indices % self.num_env_workers

                for rollout_idx in range(self.buffer_width + 1):
                    state = torch.Tensor(self.state).float().to(
                        self.agent.device)

                    #   for recurrences
                    lstm_input = state.view(-1, 1, *self.state_shape)
                    output, (hidden, cell) = self.agent.lstm(
                        lstm_input, (self.hidden_state, self.cell_state))
                    output = output.reshape(self.num_env_workers,
                                            self.agent.hidden_state_size)

                    policy_dist = self.agent.actor(output)
                    action = policy_dist.sample()
                    action = action.clamp(-1, 1)  #   depends on env
                    state_, reward, done, info = self.envs.step(
                        action.cpu().numpy())

                    value = self.agent.critic(output)
                    log_prob = policy_dist.log_prob(action)

                    reward = torch.Tensor(reward).float().unsqueeze(1).to(
                        self.agent.device)
                    done_masks = torch.Tensor(1.0 -
                                              done).float().unsqueeze(1).to(
                                                  self.agent.device)

                    self.states[run_indices,
                                rollout_idx] = state[worker_indices]
                    self.actions[run_indices,
                                 rollout_idx] = action[worker_indices]
                    self.log_probs[run_indices,
                                   rollout_idx] = log_prob[worker_indices]
                    self.values[run_indices,
                                rollout_idx] = value[worker_indices]
                    self.rewards[run_indices,
                                 rollout_idx] = reward[worker_indices]
                    self.done_masks[run_indices,
                                    rollout_idx] = done_masks[worker_indices]

                    self.hidden_state[0, worker_indices] *= self.done_masks[
                        run_indices,
                        rollout_idx].expand(-1, self.agent.hidden_state_size)
                    self.cell_state[0, worker_indices] *= self.done_masks[
                        run_indices,
                        rollout_idx].expand(-1, self.agent.hidden_state_size)
                    self.state = state_

        self.buffer_full = True
        self.stats.update_collection_stats(
            num_samples_collected_inc=self.batch_size * self.rollout_length)

    def compute_gae(self):
        if not self.buffer_full:
            raise Exception(
                "buffer is not full of new samples yet (so not ready for GAE)")

        gae = torch.zeros((self.batch_size, 1)).to(self.agent.device)
        for i in reversed(range(self.buffer_width)):
            delta = self.rewards[:, i] + self.gamma * self.values[:, i + 1] * self.done_masks[:, i] - self.values[:, i]
            gae = delta + self.gamma * self.tau * self.done_masks[:, i] * gae
            self.returns[:, i] = gae + self.values[:, i]
            self.advantages[:, i] = gae

        self.GAE_calculated = True

    def get_leading_states(self, index):
        indices_with_leading_states = torch.arange(
            self.num_recurrence_steps) - self.num_recurrence_steps + 1 + index
        leading_states = self.states[:, indices_with_leading_states]

        #   some of the leading states might be from previous episodes,
        #   in which case we don't want to consider those at all.
        leading_state_indices = indices_with_leading_states[:-1]
        leading_dones = 1 - self.done_masks[:, leading_state_indices]
        last_leading_dones = leading_dones.nonzero()[:, :2]
        for batch_index, last_done in last_leading_dones:
            previous_episode_indices = torch.arange(last_done + 1)
            leading_states[batch_index, previous_episode_indices] = 0

        return leading_states

    def random_batch_iter(self):
        if not (self.buffer_full and self.GAE_calculated):
            raise Exception(
                "buffer is not ready for sampling yet. (not full/no GAE)")
        '''-there's no way all the workers are aligned, especially after an episode or so,
            so we might just be able to use a vertical index'''
        batch_indices = torch.randperm(self.rollout_length)

        #   recurrence stuff
        if self.num_recurrence_steps > 0:
            batch_indices = torch.randperm(
                self.rollout_length) + self.num_recurrence_steps - 1
            self.hidden_state = torch.zeros(
                (1, self.batch_size,
                 self.agent.hidden_state_size)).to(self.agent.device)
            self.cell_state = torch.zeros(
                (1, self.batch_size,
                 self.agent.hidden_state_size)).to(self.agent.device)

        for i in range(self.rollout_length):
            index = batch_indices[i]
            leading_states = self.get_leading_states(index)
            output, (hidden, cell) = self.agent.lstm(
                leading_states, (self.hidden_state, self.cell_state))
            state = output[:, -1, :]

            action = self.actions[:, index]
            log_prob = self.log_probs[:, index]
            advantage = self.advantages[:, index]
            return_ = self.returns[:, index]
            yield state, action, log_prob, advantage, return_

    def reset(self):
        self.buffer_full = False
        self.GAE_calculated = False
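A possible wiring of the collector above into a training loop; agent, make_env_func, stats, num_updates and the call agent.ppo_update(...) are placeholders assumed to exist elsewhere:

collector = RolloutCollector(num_env_workers=8, make_env_func=make_env_func,
                             agent=agent, batch_size=64, rollout_length=128,
                             num_recurrence_steps=4, state_shape=(24,),
                             action_shape=(4,), stats=stats)

for update in range(num_updates):
    collector.collect_samples()   # fill the buffer with on-policy rollouts
    collector.compute_gae()       # compute advantages and returns over the buffer
    for state, action, log_prob, advantage, return_ in collector.random_batch_iter():
        agent.ppo_update(state, action, log_prob, advantage, return_)  # hypothetical agent method
    collector.reset()             # mark the buffer as consumed before the next collection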
        print("=> loaded checkpoint '{}' (global_t {})"
            .format(best_path, checkpoint['global_t']))
    else:
        global_t = 0
        print("=> no checkpoint found at '{}'".format(best_path))
    count = 0
    running_corrects = 0
    running_corrects1 = 0
    writer = SummaryWriter()
    for i_update in range(num_frames):
        optimizer.zero_grad()
        for step in range(num_steps):
            action = actor_critic.act(Variable(pstate1), Variable(pstate2), Variable(pstate3), Variable(state), Variable(gstate), Variable(pre_action))
            #print("act:",action)
            pim1, pim2, pim3, next_state, g_state, reward, done, gt_action, gt_state, shortest, pre_action = envs.step(action.cpu().data.numpy())
            for i in range(num_envs):
                my_path[i][-1] += 1
                if reward[i] > 5:
                    episode_success[i][-1] += 1
                if done[i]:
                    shortest_path[i].append(shortest[i])
                    episode_success[i].append(0)
                    my_path[i].append(1)
            # =====================================
            reward = torch.FloatTensor(reward).unsqueeze(1)
            episode_rewards += reward
            masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1)
            count += (1 - masks).sum()
            #final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
Example #7
                              action_bound=None,
                              rollout_steps=ROLLOUT_STEPS,
                              memory_capacity=4096,
                              summary_writer=None,
                              mode=0)

    states = envs.reset()
    states = [utils.combine_env_states(*state) for state in states]
    padded_states = np.array(
        [utils.pad_data(state, MAX_NUM_NODES, [1]) for state in states])

    for step in range(PPO_STEPS):
        if (step + 1) % 10 == 0:
            print('Step', step)

        padded_actions = swarmnet_agent.act_batch(
            [padded_states, padded_edge_types], masks)[0]
        next_states, rewards, dones, infos = envs.step([
            padded_action[-num_boid:, :]
            for padded_action, num_boid in zip(padded_actions, env_num_boids)
        ])

        padded_states = np.array([
            utils.pad_data(utils.combine_env_states(*state), MAX_NUM_NODES,
                           [1]) for state in next_states
        ])

    end_t = time.time()

    print("Time spent", end_t - start_t)
Example #8
class PPO(object):
    """Main PPO class"""
    def __init__(self, args):
        """"Constructor which allows the PPO class to initialize the attributes of the class"""
        self.args = args
        self.random_seed()
        # Check if GPU is available via CUDA driver
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        # Initialize the actor critic class
        self.actor_critic = ActorCritic(
            self.args.nb_states, self.args.nb_actions,
            self.args.hidden_layer_size).to(self.device)
        # Define the optimizer used for the optimization of the surrogate loss
        self.optimizer = self.args.optimizer(self.actor_critic.parameters(),
                                             self.args.lr)

        # For training multiple instances of the env are needed (Shoulder model)
        self.envs = [self.make_env() for i in range(self.args.num_envs)]
        self.envs = SubprocVecEnv(self.envs)
        # To validate the intermediate learning process one test env is needed
        self.env_test = self.args.env
        self.env_test.seed(self.args.seed)
        self.env_test.set_scaling(self.args.output_scaling)

        #  Lists for Tensorboard to visualize learning process during learning
        self.test_rewards = []
        self.loss = []
        self.lr = []
        self.actor_grad_weight = []
        self.action_bang_bang = []

        self.lr.append(self.args.lr)

        # Dump bin files
        if self.args.play is False:
            self.output_path = "trained_models" + '/PPO_{}'.format(
                datetime.now().strftime('%Y%b%d_%H%M%S')) + "/"
            os.mkdir(self.output_path)
            self.writer = SummaryWriter(self.output_path)

        #self.delta = (self.args.lr-self.args.lr_end)/1e6

    def train(self):
        """Main training function"""
        frame_idx = 0
        state = self.envs.reset()
        mean_100_reward = -np.inf
        self.info()

        while frame_idx < self.args.max_frames:
            log_probs = []
            values = []
            states = []
            actions = []
            rewards = []
            masks = []
            entropy = self.args.entropy

            for _ in range(self.args.nb_steps):
                state = torch.FloatTensor(state).to(self.device)
                dist, value = self.actor_critic(state)
                action = dist.sample()
                # Make sure action is loaded to CPU (not GPU)
                next_state, reward, done, _ = self.envs.step(
                    action.cpu().numpy())

                log_prob = dist.log_prob(action)
                entropy += dist.entropy().mean()

                log_probs.append(log_prob)
                values.append(value)
                rewards.append(
                    torch.FloatTensor(reward).unsqueeze(1).to(self.device))
                masks.append(
                    torch.FloatTensor(1 - done).unsqueeze(1).to(self.device))

                states.append(state)
                actions.append(action)
                state = next_state
                frame_idx += 1
                #self.scheduler()

                # Evaluate training process and write data to tensorboard
                if frame_idx % 1000 == 0:
                    test_reward = np.mean(
                        [self.test_env(self.args.vis) for _ in range(10)])
                    self.test_rewards.append(test_reward)

                    if self.args.play is False:
                        print("Mean reward: ",
                              np.round(np.mean(self.test_rewards[-101:-1]), 0))
                        if mean_100_reward < np.round(
                                np.mean(self.test_rewards[-101:-1]), 0):
                            mean_100_reward = np.round(
                                np.mean(self.test_rewards[-101:-1]), 0)
                            self.save_network(mean_100_reward)
                        if len(self.test_rewards) >= 10:
                            self.writer.add_scalar(
                                'data/reward',
                                np.mean(self.test_rewards[-11:-1]),
                                frame_idx * self.args.num_envs)
                            self.writer.add_scalar(
                                'data/ppo_loss', np.mean(self.loss[-11:-1]),
                                frame_idx * self.args.num_envs)
                            self.writer.add_scalar(
                                'data/nb_actions_outside_range',
                                np.mean(self.action_bang_bang[-11:-1]),
                                frame_idx * self.args.num_envs)

                    # if test_reward > threshold_reward: early_stop = True

            next_state = torch.FloatTensor(next_state).to(self.device)
            _, next_value = self.actor_critic(next_state)
            returns = self.calc_gae(next_value, rewards, masks, values,
                                    self.args.gamma, self.args.tau)

            # detach() takes these tensors out of the graph, i.e. these operations are ignored in gradient calculations
            returns = torch.cat(returns).detach()
            log_probs = torch.cat(log_probs).detach()
            values = torch.cat(values).detach()
            states = torch.cat(states)
            actions = torch.cat(actions)
            advantage = returns - values
            self.ppo_update(self.args.ppo_epochs, self.args.mini_batch_size,
                            states, actions, log_probs, returns, advantage,
                            self.args.clip)

    def make_env(self):
        # Private thunk factory passed to the SubprocVecEnv class
        def _trunk():
            env = self.args.env  # in this simple case the class TestEnv() is called (see openAI for more envs)
            env.seed(self.args.seed)
            env.set_scaling(self.args.output_scaling)
            return env

        return _trunk

    def test_env(self, vis=False):
        state = self.env_test.reset()
        if vis:
            self.env_test.render()
        done = False
        total_reward = 0
        action_bang_bang = 0
        step = 0
        while not done:
            step += 1
            state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            dist, _ = self.actor_critic(state)
            action = dist.sample().cpu().numpy()[0]
            force = action * self.args.output_scaling
            next_state, reward, done, _ = self.env_test.step(action)
            if force > 0.5 or force < -0.5:
                action_bang_bang += 1
            state = next_state
            if vis:
                self.env_test.render()
            total_reward += reward
        self.action_bang_bang.append(action_bang_bang / step)
        return total_reward

    # Plain functions, except that they can be called from an instance or the class
    @staticmethod
    def calc_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
        values = values + [next_value]
        gae = 0
        returns = []
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * values[
                step + 1] * masks[step] - values[step]
            gae = delta + gamma * tau * masks[step] * gae
            returns.insert(0, gae + values[step])
        return returns

    @staticmethod
    def ppo_iter(mini_batch_size, states, actions, log_probs, returns,
                 advantage):
        batch_size = states.size(0)
        for _ in range(batch_size // mini_batch_size):
            rand_ids = np.random.randint(0, batch_size, mini_batch_size)
            yield states[rand_ids, :], actions[rand_ids, :], log_probs[
                rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]

    def ppo_update(self,
                   ppo_epochs,
                   mini_batch_size,
                   states,
                   actions,
                   log_probs,
                   returns,
                   advantages,
                   clip_param=0.2):
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, return_, advantage in self.ppo_iter(
                    mini_batch_size, states, actions, log_probs, returns,
                    advantages):
                dist, value = self.actor_critic(state)
                entropy = dist.entropy().mean()
                new_log_probs = dist.log_prob(action)

                ratio = (new_log_probs - old_log_probs).exp()
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - clip_param,
                                    1.0 + clip_param) * advantage

                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = (return_ - value).pow(2).mean()

                loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy
                self.loss.append(loss.item())
                # Important step:
                self.optimizer.zero_grad()
                #pdb.set_trace()
                loss.backward()
                if self.args.grad_norm is not None:
                    nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                             self.args.grad_norm)
                self.optimizer.step()

    def save_network(self, reward):
        network_path = self.output_path + "/network" + str(reward)
        pickle.dump(self.actor_critic.state_dict(), open(network_path, "wb"))

    def load_network(self, path):
        network_new = pickle.load(open(path, "rb"))
        self.actor_critic.load_state_dict(network_new)

    def random_seed(self):
        torch.manual_seed(self.args.seed)
        random.seed(self.args.seed)
        np.random.seed(self.args.seed)

    def scheduler(self):
        for g in self.optimizer.param_groups:
            lr = g["lr"]
            if self.args.lr_end > lr:
                lr = self.args.lr_end
            else:
                lr -= self.delta
            self.lr.append(lr)
            g["lr"] = lr

    def info(self):
        fhandler = logging.FileHandler(filename=self.output_path +
                                       '/mylog.log',
                                       mode='a')
        logger.addHandler(fhandler)
        logger.info("--- INFO ---")
        logger.info("args: {}".format(self.args))
sess.run(tf.global_variables_initializer())

while frame_idx < max_frames and not early_stop:

    log_probs = []
    values = []
    obs = []
    acs = []
    rewards = []
    masks = []
    entropy = 0

    for _ in range(num_steps):

        ac = ppo.get_action(ob)
        next_ob, reward, done, _ = envs.step(ac)

        value = ppo.get_value(ob)
        values.append(value)
        rewards.append(reward[:, np.newaxis])
        masks.append((1 - done)[:, np.newaxis])

        obs.append(ob)
        acs.append(ac)

        ob = next_ob
        frame_idx += 1

        if frame_idx % 1000 == 0:
            test_reward = np.mean([test_env(ppo) for _ in range(10)])
            test_rewards.append(test_reward)
def main():
    num_envs = 16
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)
    env = gym.make("CartPole-v0")

    num_inputs = envs.observation_space.shape[0]
    num_outputs = envs.action_space.n
    # Hyper params:
    hidden_size = 256
    lr = 3e-4
    num_steps = 5

    model = ActorCritic(num_inputs,num_outputs,hidden_size).to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr)  # use the lr hyperparameter defined above

    max_frames = 20000
    frame_idx = 0
    test_rewards = []
    state = envs.reset()

    while frame_idx < max_frames:

        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0

        # each worker environment runs num_steps steps (n-step sampling)
        for _ in range(num_steps):
            state = torch.FloatTensor(state).to(device)
            dist, value = model(state)
            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())
            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()

            # record the per-worker quantities for these num_steps steps
            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

            state = next_state
            frame_idx += 1

            if frame_idx % 100 == 0:
                test_rewards.append(np.mean([test_env(model, env) for _ in range(10)]))
                plot(frame_idx, test_rewards)

        # pass the workers' results to the main network and perform the parameter update
        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = model(next_state)
        returns = compute_returns(next_value, rewards, masks)

        # concatenate the values of the 5 steps
        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)

        advantage = returns - values
        # compute the mean losses
        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()

        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
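compute_returns is used above but not defined in this excerpt. A minimal sketch of the usual masked n-step return, assuming the signature compute_returns(next_value, rewards, masks, gamma=0.99):

def compute_returns(next_value, rewards, masks, gamma=0.99):
    # discounted n-step returns, bootstrapped from the value of the last state;
    # masks cut the bootstrap at episode boundaries (mask = 0 when done)
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns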
Example #11
def train(env, agent, flags):
    """"""

    # set random seeds (for reproducibility)
    torch.manual_seed(flags['seed'])
    torch.cuda.manual_seed_all(flags['seed'])
    envs = [make_env(flags['env'], flags['seed'], i) for i in range(flags['num_envs'])]
    envs = SubprocVecEnv(envs)

    # instantiate the policy and optimiser
    num_inputs  = envs.observation_space.shape[0]
    num_outputs = envs.action_space.n
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    current_step_number = 0
    test_rewards = []
    state = envs.reset()

    
    while current_step_number < flags['max_steps']:
        
        log_probs = []
        values    = []
        rewards   = []
        masks     = []
        entropy = 0

        for _ in range(flags['num_step_td_update']):

            # run the policy to get an action distribution and state value
            dist, value = model(torch.FloatTensor(state).to(device))
            # sample an action from the distribution
            action = dist.sample()
            # take a step in the environment
            next_state, reward, done, _ = envs.step(action.cpu().numpy())
                
            # compute the log probability
            log_prob = dist.log_prob(action)
            # compute the entropy
            entropy += dist.entropy().mean()
            
            # save the log probability, value and reward 
            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))



            # if done, save episode rewards

            state = next_state
            current_step_number += 1
            
            if current_step_number % 1000 == 0 and flags['plot_test']:
                test_rewards.append(np.mean([test_env(model) for _ in range(10)]))
                plot(current_step_number, test_rewards)

        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = model(next_state)
   
        # calculate the discounted return of the episode
        returns = compute_returns(next_value, rewards, masks)

        log_probs = torch.cat(log_probs)
        returns   = torch.cat(returns).detach()
        values    = torch.cat(values)

        advantage = returns - values

        actor_loss  = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()

        # loss function
        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    return rewards
Example #12
def dqn_algorithm(ENV_NAME,
                  NUM_ENV=8,
                  SEED=1,
                  TOTAL_TIMESTEPS=100000,
                  GAMMA=0.95,
                  MEMORY_SIZE=1000,
                  BATCH_SIZE=32,
                  EXPLORATION_MAX=1.0,
                  EXPLORATION_MIN=0.02,
                  EXPLORATION_FRACTION=0.7,
                  TRAINING_FREQUENCY=1000,
                  FILE_PATH='results/',
                  SAVE_MODEL=False,
                  MODEL_FILE_NAME='model',
                  LOG_FILE_NAME='log',
                  TIME_FILE_NAME='time',
                  PRINT_FREQ=100,
                  N_EP_AVG=100,
                  VERBOSE='False',
                  MLP_LAYERS=[64, 64],
                  MLP_ACTIVATIONS=['relu', 'relu'],
                  LEARNING_RATE=1e-3,
                  EPOCHS=1,
                  GRAD_CLIP=False,
                  DOUBLE_DQN=False,
                  USE_TARGET_NETWORK=True,
                  TARGET_UPDATE_FREQUENCY=5000,
                  LOAD_WEIGHTS=False,
                  LOAD_WEIGHTS_MODEL_PATH='results/model0.h5'):
    '''
    DQN Algorithm execution

    env_name : string for a gym environment
    num_env : no. for environment vectorization (multiprocessing env)
    total_timesteps : Total number of timesteps
    training_frequency : frequency of training (experience replay)
    gamma : discount factor
    buffer_size : Replay buffer size 
    batch_size : batch size for experience replay 
    exploration_max : maximum exploration at the beginning
    exploration_min : minimum exploration at the end 
    exploration_fraction : fraction of total timesteps on which the exploration decay takes place 
    output_folder : output filepath 
    save_model : boolean to specify whether the model is to be saved 
    model_file_name : name of file to save the model at the end learning 
    log_file_name : name of file to store DQN results 
    time_file_name : name of file to store computation time 
    print_frequency : results printing episodic frequency 
    n_ep_avg : no. of episodes to be considered while computing average reward 
    verbose : print episodic results 
    mlp_layers : list of neurons in each hidden layer of the DQN network
    mlp_activations : list of activation functions in each hidden layer of the DQN network
    learning_rate : learning rate for the neural network 
    epochs : no. of epochs in every experience replay 
    grad_clip : boolean to specify whether to use gradient clipping in the optimizer (grad-clip value 10.0)
    double_dqn : boolean to specify whether to employ double DQN 
    use_target_network : boolean to use target neural network in DQN 
    target_update_frequency : timesteps frequency to do weight update from online network to target network 
    load_weights : boolean to specify whether to use a prespecified model to initialize the weights of the neural network
    load_weights_model_path : path for the model to use for weight initialization 
    '''

    before = time.time()
    num_envs = NUM_ENV
    env_name = ENV_NAME

    if TOTAL_TIMESTEPS % NUM_ENV:
        print('Error: total timesteps is not divisible by no. of envs')
        return

    def make_env():
        def _thunk():
            env = gym.make(env_name)
            env.seed(SEED)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    # for reproducibility
    set_seed(SEED)

    observation_space = envs.observation_space.shape[0]
    action_space = envs.action_space.n

    dqn_solver = DQNSolver(observation_space, action_space, MLP_LAYERS,
                           MLP_ACTIVATIONS, LEARNING_RATE, EPOCHS,
                           USE_TARGET_NETWORK, GRAD_CLIP, DOUBLE_DQN,
                           LOAD_WEIGHTS, LOAD_WEIGHTS_MODEL_PATH,
                           TOTAL_TIMESTEPS, MEMORY_SIZE, BATCH_SIZE, GAMMA,
                           EXPLORATION_MAX, EXPLORATION_MIN,
                           EXPLORATION_FRACTION)

    envs = ParallelEnvWrapper(envs)
    t = 0
    episode_rewards = [0.0] * num_envs
    explore_percent, episodes, mean100_rew, steps, NN_tr_loss = [],[],[],[],[]
    while True:
        state = envs.reset()
        # state = np.reshape(state, [1, observation_space])
        while True:
            t += num_envs
            dqn_solver.eps_timestep_decay(t)
            action = dqn_solver.act(state)
            state_next, reward, terminal, _ = envs.step(action)
            # print(terminal)
            # reward = reward if not terminal else -reward
            # state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            if t % TRAINING_FREQUENCY == 0:
                dqn_solver.experience_replay()
            state = state_next
            episode_rewards[-num_envs:] = [
                i + j for (i, j) in zip(episode_rewards[-num_envs:], reward)
            ]
            # num_episodes = len(episode_rewards)
            # print(terminal)
            if (t % PRINT_FREQ == 0):
                explore_percent.append(dqn_solver.exploration_rate * 100)
                episodes.append(len(episode_rewards))
                mean100_rew.append(
                    round(np.mean(episode_rewards[(-1 - N_EP_AVG):-1]), 1))
                steps.append(t)
                NN_tr_loss.append(dqn_solver.loss)
                if VERBOSE:
                    print('Exploration %: ' + str(int(explore_percent[-1])) +
                          ' ,Episodes: ' + str(episodes[-1]) +
                          ' ,Mean_reward: ' + str(mean100_rew[-1]) +
                          ' ,timestep: ' + str(t) + ' , tr_loss: ' +
                          str(round(NN_tr_loss[-1], 4)))

            if t > TOTAL_TIMESTEPS:
                output_table = np.stack((steps, mean100_rew, episodes,
                                         explore_percent, NN_tr_loss))
                if not os.path.exists(FILE_PATH):
                    os.makedirs(FILE_PATH)
                file_name = str(FILE_PATH) + LOG_FILE_NAME + '.csv'
                np.savetxt(
                    file_name,
                    np.transpose(output_table),
                    delimiter=',',
                    header=
                    'Timestep,Rewards,Episodes,Exploration %,Training Score')
                after = time.time()
                time_taken = after - before
                np.save(str(FILE_PATH) + TIME_FILE_NAME, time_taken)
                if SAVE_MODEL:
                    file_name = str(FILE_PATH) + MODEL_FILE_NAME + '.h5'
                    dqn_solver.model.save(file_name)
                return dqn_solver.model
            if USE_TARGET_NETWORK and t % TARGET_UPDATE_FREQUENCY == 0:
                dqn_solver.update_target_network()
            # print(t)
            if terminal.all():
                episode_rewards += [0.0] * num_envs
                break
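A hypothetical call that exercises the parameters documented in the docstring above (the environment name and output folder are placeholders; any gym environment with a discrete action space should fit):

model = dqn_algorithm('CartPole-v0',
                      NUM_ENV=8,
                      TOTAL_TIMESTEPS=100000,
                      GAMMA=0.95,
                      MLP_LAYERS=[64, 64],
                      MLP_ACTIVATIONS=['relu', 'relu'],
                      DOUBLE_DQN=True,
                      FILE_PATH='results/cartpole/',
                      SAVE_MODEL=True,
                      VERBOSE=True)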
Example #13
n_updates = int(N_FRAMES // N_STEPS // N_ENVS)
for update_i in tqdm(range(n_updates)):
    # Generate samples
    for step in range(N_STEPS):
        # Generate and take an action
        with torch.no_grad():
            value, action, action_log_prob = policy.act(
                rollouts.observations[step])

        take_actions = action.squeeze(1).cpu().numpy()

        if len(take_actions.shape) == 1:
            take_actions = np.expand_dims(take_actions, axis=-1)

        obs, reward, done, info = envs.step(take_actions)

        # convert to pytorch tensor
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
        masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done])

        # update reward info for logging
        episode_rewards += reward
        final_rewards *= masks
        final_rewards += (1 - masks) * episode_rewards
        episode_rewards *= masks

        # Update our current observation tensor
        current_obs *= masks
        update_current_obs(obs)
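update_current_obs is not shown in this example. A minimal sketch of the usual frame-stacking helper, assuming current_obs holds num_stack consecutive observations per environment along the channel dimension (num_stack, current_obs and envs are assumed to be defined in the surrounding scope):

def update_current_obs(obs):
    # shift the stack by one frame and append the newest observation
    shape_dim0 = envs.observation_space.shape[0]   # channels of a single frame
    obs = torch.from_numpy(obs).float()
    if num_stack > 1:
        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
    current_obs[:, -shape_dim0:] = obs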
Example #14
    while frame_idx < max_frames:

        log_probs = []
        values = []
        states = []
        actions = []
        rewards = []
        masks = []
        entropy = 0

        for _ in range(num_steps):
            state = torch.FloatTensor(state).to(device)
            dist, value = model(state)

            action = dist.sample()
            next_state, reward, done, _ = envs.step(
                np.clip(action.cpu().numpy(), 0, 1))

            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

            states.append(state)
            actions.append(action)

            state = next_state
            frame_idx += 1
early_stop = False

while frame_idx < max_frames and not early_stop:
    i_update += 1
    
    values    = []
    obs    = []
    acs   = []
    rewards   = []
    masks     = []
    entropy = 0

    for _ in range(num_steps):

        ac = ppo.get_action(ob)
        next_ob, _, done, _ = envs.step(ac)
        reward = discriminator.get_reward(np.concatenate([ob, ac], axis=1))
        
        value = ppo.get_value(ob)
        values.append(value)
        rewards.append(reward[:, np.newaxis])
        masks.append((1-done)[:, np.newaxis])

        obs.append(ob)
        acs.append(ac)

        ob = next_ob
        frame_idx += 1

        if frame_idx % 1000 == 0:
            test_reward = np.mean([test_env(ppo) for _ in range(10)])
Example #16
	while frame_idx < max_frames:

		log_probs = []
		values    = []
		states    = []
		actions   = []
		rewards   = []
		masks     = []
		entropy = 0

		for _ in range(num_steps):
			state = torch.FloatTensor(state).to(device)
			dist, value = model(state)

			action = dist.sample()
			next_state, reward, done, _ = envs.step(np.clip(action.cpu().numpy(), 0, 1))

			log_prob = dist.log_prob(action)
			entropy += dist.entropy().mean()
			
			log_probs.append(log_prob)
			values.append(value)
			rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
			masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
			
			states.append(state)
			actions.append(action)
			
			state = next_state
			frame_idx += 1
			
Example #17
    while not early_stop:

        log_probs = []
        values    = []
        states    = []
        actions   = []
        rewards   = []
        masks     = []

        for _ in tqdm(range(PPO_STEPS), ascii=True):
            state = torch.FloatTensor(state).permute(0, 3, 1, 2).to(device)
            dist, value = model(state)
            
            action = dist.sample()
            # each state, reward, done is a list of results from each parallel environment
            next_state, reward, done, _ = envs.step(torch.argmax(action, dim=1, keepdim=True).cpu().numpy())
            log_prob = dist.log_prob(action)
            
            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
            
            states.append(state)
            actions.append(action)
            
            state = next_state
            frame_idx += 1
                
        next_state = torch.FloatTensor(next_state).permute(0, 3, 1, 2).to(device)
        _, next_value = model(next_state)
Example #18
def main():

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n
    num_rewards = len(task_rewards[mode])

    full_rollout = True

    env_model = EnvModel(envs.observation_space.shape, num_pixels, num_rewards)
    env_model.load_state_dict(torch.load("env_model_" + mode))

    distil_policy = ActorCritic(envs.observation_space.shape,
                                envs.action_space.n)
    distil_optimizer = optim.Adam(distil_policy.parameters())

    imagination = ImaginationCore(1,
                                  state_shape,
                                  num_actions,
                                  num_rewards,
                                  env_model,
                                  distil_policy,
                                  full_rollout=full_rollout)

    actor_critic = I2A(state_shape,
                       num_actions,
                       num_rewards,
                       256,
                       imagination,
                       full_rollout=full_rollout)
    #rmsprop hyperparams:
    lr = 7e-4
    eps = 1e-5
    alpha = 0.99
    optimizer = optim.RMSprop(actor_critic.parameters(),
                              lr,
                              eps=eps,
                              alpha=alpha)

    #if USE_CUDA:
    #    env_model     = env_model.cuda()
    #    distil_policy = distil_policy.cuda()
    #    actor_critic  = actor_critic.cuda()

    gamma = 0.99
    entropy_coef = 0.01
    value_loss_coef = 0.5
    max_grad_norm = 0.5
    num_steps = 5
    num_frames = int(10e5)

    rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape)
    #rollout.cuda()

    all_rewards = []
    all_losses = []

    state = envs.reset()
    current_state = torch.FloatTensor(np.float32(state))

    rollout.states[0].copy_(current_state)

    episode_rewards = torch.zeros(num_envs, 1)
    final_rewards = torch.zeros(num_envs, 1)

    for i_update in tqdm(range(num_frames)):

        for step in range(num_steps):
            #if USE_CUDA:
            #    current_state = current_state.cuda()
            action = actor_critic.act(autograd.Variable(current_state))

            next_state, reward, done, _ = envs.step(
                action.squeeze(1).cpu().data.numpy())

            reward = torch.FloatTensor(reward).unsqueeze(1)
            episode_rewards += reward
            masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            #if USE_CUDA:
            #    masks = masks.cuda()

            current_state = torch.FloatTensor(np.float32(next_state))
            rollout.insert(step, current_state, action.data, reward, masks)

        _, next_value = actor_critic(
            autograd.Variable(rollout.states[-1], volatile=True))
        next_value = next_value.data

        returns = rollout.compute_returns(next_value, gamma)

        logit, action_log_probs, values, entropy = actor_critic.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1))

        distil_logit, _, _, _ = distil_policy.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1))

        distil_loss = 0.01 * (F.softmax(logit).detach() *
                              F.log_softmax(distil_logit)).sum(1).mean()

        values = values.view(num_steps, num_envs, 1)
        action_log_probs = action_log_probs.view(num_steps, num_envs, 1)
        advantages = autograd.Variable(returns) - values

        value_loss = advantages.pow(2).mean()
        action_loss = -(autograd.Variable(advantages.data) *
                        action_log_probs).mean()

        optimizer.zero_grad()
        loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
        loss.backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), max_grad_norm)
        optimizer.step()

        distil_optimizer.zero_grad()
        distil_loss.backward()
        distil_optimizer.step()

        if i_update % 100 == 0:
            all_rewards.append(final_rewards.mean())
            all_losses.append(loss.item())

            #clear_output(True)
            plt.figure(figsize=(20, 5))
            plt.subplot(131)
            plt.title('epoch %s. reward: %s' %
                      (i_update, np.mean(all_rewards[-10:])))
            plt.plot(all_rewards)
            plt.subplot(132)
            plt.title('loss %s' % all_losses[-1])
            plt.plot(all_losses)
            plt.show()

        rollout.after_update()

    torch.save(actor_critic.state_dict(), "i2a_" + mode)
Example #19
    log_probs = []
    values = []
    rewards = []
    masks = []
    entropy = 0

    # rollout trajectory
    for _ in range(num_steps):
        state = torch.FloatTensor(state).to(device)  # get a state from env
        # run the state through the network to get an action distribution and the value of the state
        dist, value = model(state)

        # pick an action from the action distribution output by the model
        action = dist.sample()
        # take the action, and get a new state and reward
        next_state, reward, done, _ = envs.step(action.cpu().numpy())

        log_prob = dist.log_prob(action)  # log prob of the action
        entropy += dist.entropy().mean()  # entropy

        log_probs.append(log_prob)  # add the log prob to a list
        values.append(value)  # add to the list of predicted values
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))  # add to the list of rewards
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

        state = next_state
        frame_idx += 1

        if frame_idx % 100 == 0:
            test_rewards.append(np.mean([test_env() for _ in range(10)]))
        final_rewards = torch.zeros(num_envs, 1)

    actor_critic.to(DEVICE)
    rollout.to(DEVICE)

    state = envs.reset()
    state = torch.FloatTensor(np.float32(state)).to(DEVICE)

    rollout.states[0].copy_(state)

    for i_update in range(last_num_frames, num_frames):

        for step in range(num_steps):
            action = actor_critic.act(state)

            next_state, reward, done, _ = envs.step(
                action.squeeze(1).cpu().data.numpy())

            reward = torch.FloatTensor(reward).unsqueeze(1)
            episode_rewards += reward
            masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            masks = masks.to(DEVICE)

            state = torch.FloatTensor(np.float32(next_state)).to(DEVICE)
            rollout.insert(step, state, action.data, reward, masks)

        with torch.no_grad():
            _, next_value = actor_critic(rollout.states[-1])
Example #21
class RolloutCollector:
    def __init__(self, num_env_workers, make_env_func, agent, batch_size,
                 rollout_length, state_shape, action_shape, stats):
        ''' -one agent is assigned to a collector.
            -a collector runs a bunch of envs in parallel to feed to that agent
            -you could run a bunch of collectors simultaneously,
                |-  and then use weight mixing on the agents separately
        '''
        #self.storage_device = torch.device("cpu")

        self.num_env_workers = num_env_workers
        self.envs = SubprocVecEnv(
            [make_env_func() for i in range(num_env_workers)])
        self.agent = agent
        self.batch_size = batch_size
        self.rollout_length = rollout_length
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.stats = stats

        self.buffer_full = False
        self.GAE_calculated = False

        self.gamma = 0.8
        self.tau = 0.8

        self.rollout_indices = np.zeros(batch_size)
        self.states = torch.zeros(
            (batch_size, rollout_length + 1, *state_shape),
            dtype=torch.float32).to(self.agent.device)
        self.actions = torch.zeros(
            (batch_size, rollout_length + 1, *action_shape),
            dtype=torch.float32).to(self.agent.device)
        self.log_probs = torch.zeros(
            (batch_size, rollout_length + 1, *action_shape),
            dtype=torch.float32).to(self.agent.device)
        self.values = torch.zeros((batch_size, rollout_length + 1, 1),
                                  dtype=torch.float32).to(self.agent.device)
        self.rewards = torch.zeros((batch_size, rollout_length + 1, 1),
                                   dtype=torch.float32).to(self.agent.device)
        self.done_masks = torch.zeros(
            (batch_size, rollout_length + 1, 1),
            dtype=torch.float32).to(self.agent.device)
        self.advantages = torch.zeros(
            (batch_size, rollout_length + 1, 1),
            dtype=torch.float32).to(self.agent.device)
        self.returns = torch.zeros((batch_size, rollout_length + 1, 1),
                                   dtype=torch.float32).to(self.agent.device)

        self.state = self.envs.reset()

    def collect_samples(self):
        if self.buffer_full:
            raise Exception(
                "tried to collect more samples when buffer already full")

        num_runs_to_full = math.ceil(self.batch_size / self.num_env_workers)
        with torch.no_grad():
            for collection_run in range(num_runs_to_full):
                start_index = collection_run * self.num_env_workers
                end_index_exclusive = min(start_index + self.num_env_workers,
                                          self.batch_size)
                run_indices = torch.arange(start_index,
                                           end_index_exclusive,
                                           dtype=torch.long)
                worker_indices = run_indices % self.num_env_workers

                for rollout_idx in range(self.rollout_length + 1):
                    state = torch.Tensor(self.state).float().to(
                        self.agent.device)
                    policy_dist = self.agent.actor(state)
                    action = policy_dist.sample()
                    if self.agent.tanh_action_clamping:
                        action = torch.tanh(action)
                    else:
                        action = action.clamp(-1, 1)  #   depends on env
                    cpu_actions = action.cpu().numpy()
                    state_, reward, done, info = self.envs.step(cpu_actions)

                    value = self.agent.critic(state)
                    log_prob = policy_dist.log_prob(action)

                    reward = torch.Tensor(reward).float().unsqueeze(1).to(
                        self.agent.device)
                    done_masks = torch.Tensor(1.0 -
                                              done).float().unsqueeze(1).to(
                                                  self.agent.device)

                    self.states[run_indices,
                                rollout_idx] = state[worker_indices]
                    self.actions[run_indices,
                                 rollout_idx] = action[worker_indices]
                    self.log_probs[run_indices,
                                   rollout_idx] = log_prob[worker_indices]
                    self.values[run_indices,
                                rollout_idx] = value[worker_indices]
                    self.rewards[run_indices,
                                 rollout_idx] = reward[worker_indices]
                    self.done_masks[run_indices,
                                    rollout_idx] = done_masks[worker_indices]

                    self.state = state_

        self.buffer_full = True
        self.stats.update_collection_stats(
            num_samples_collected_inc=self.batch_size * self.rollout_length)

    def compute_gae(self):
        if not self.buffer_full:
            raise Exception(
                "buffer is not full of new samples yet (so not ready for GAE)")

        gae = torch.zeros((self.batch_size, 1)).to(self.agent.device)
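        # GAE recursion implemented by the loop below:
        #   delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
        #   A_t     = delta_t + gamma * tau * mask_t * A_{t+1}
        # The stored returns are the advantages plus the value baseline.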
        for i in reversed(range(self.rollout_length)):
            delta = (self.rewards[:, i]
                     + self.gamma * self.values[:, i + 1] * self.done_masks[:, i]
                     - self.values[:, i])
            gae = delta + self.gamma * self.tau * self.done_masks[:, i] * gae
            self.returns[:, i] = gae + self.values[:, i]
            self.advantages[:, i] = gae

        self.GAE_calculated = True

    def random_batch_iter(self):
        if not self.buffer_full or not self.GAE_calculated:
            raise Exception(
                "buffer is not ready for sampling yet. (not full/no GAE)")
        '''there's no way all the workers are aligned, especially after an episode or so,
            so we might just be able to use a vertical index'''
        batch_indices = torch.randperm(self.rollout_length)
        for i in range(self.rollout_length):
            index = batch_indices[i]
            state = self.states[:, index]
            action = self.actions[:, index]
            log_prob = self.log_probs[:, index]
            advantage = self.advantages[:, index]
            return_ = self.returns[:, index]
            yield state, action, log_prob, advantage, return_

    def reset(self):
        self.buffer_full = False
        self.GAE_calculated = False
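A hedged usage sketch of the collector lifecycle defined above; make_env, agent, the shape arguments, stats, and agent.update are placeholders for whatever the surrounding code base provides.

collector = RolloutCollector(num_env_workers=8, make_env_func=make_env,
                             agent=agent, batch_size=64, rollout_length=128,
                             state_shape=(24,), action_shape=(4,), stats=stats)
collector.collect_samples()   # fill the buffer with on-policy rollouts
collector.compute_gae()       # turn rewards/values into advantages and returns
for state, action, log_prob, advantage, return_ in collector.random_batch_iter():
    agent.update(state, action, log_prob, advantage, return_)  # assumed update hook
collector.reset()             # mark the buffer stale before the next collection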
Example #22
while not completed:
    # reset data for new epoch, i.e. on-policy training
    log_probs = []
    values = []
    states = []
    actions = []
    rewards = []
    masks = []

    for i in range(num_steps_for_batch):
        state = torch.tensor(state, dtype=torch.float32, device=device)

        normal_dist, critic_value = model.forward(state)
        action = normal_dist.sample()

        next_state, reward, done, _ = envs.step(action.detach().cpu().numpy())

        states.append(state)

        actions.append(action)
        rewards.append(reward)
        masks.append((1 - done))
        log_probs.append(normal_dist.log_prob(action))
        values.append(critic_value)

        state = next_state
        total_steps += 1

        # validation every 4000 steps
        if total_steps % 4000 == 0:
            test_reward, max_distance = test_model(number_of_test_runs)
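    # The example stops at the periodic validation. Unlike the other examples, rewards
    # and masks are collected above as raw numpy values, so they typically have to be
    # stacked into tensors before a loss is computed. A hedged sketch of that conversion
    # (the *_t names are assumptions, and numpy is assumed imported as np):
    rewards_t = torch.tensor(np.array(rewards), dtype=torch.float32, device=device)
    masks_t = torch.tensor(np.array(masks), dtype=torch.float32, device=device)
    values_t = torch.stack(values)
    log_probs_t = torch.stack(log_probs)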
Example #23
class env_cover():
    def __init__(self, config, dev):

        self.dev = dev
        self.num_env = config['num_envs']
        self.get_img_from_render = config['get_img_from_render']

        self.obs_shape = (self.num_env, ) + config['obs_space'][1:]
        #        print(self.obs_shape)
        self.reward_shape = (self.num_env, ) + config['reward_space'][1:]
        self.gamma_shape = (self.num_env, ) + config['gamma_space'][1:]

        if self.num_env == 1:
            self.env = gym.make(config['game_name'])
        else:

            def make_env():
                def _thunk():
                    env = gym.make(config['game_name'])
                    return env

                return _thunk

            envs = [make_env() for i in range(self.num_env)]
            self.env = SubprocVecEnv(envs)

#
#def obs_preproc(x):
#    if IMG_GET_RENDER ==False:
#        return torch.from_numpy(np.resize(x, feature_state)).float().unsqueeze(0)
#    x = np.dot(x, np.array([[0.299, 0.587, 0.114]]).T)
#    x = np.reshape(x, (1,x.shape[1], x.shape[0]))
#    return torch.from_numpy(np.resize(x, feature_state)).float().unsqueeze(0)/255
#

    def reset(self):
        st = self.env.reset()
        if self.get_img_from_render:
            st = self.env.render(mode='rgb_array')
            st = np.resize(st, self.obs_shape) / 255.

        return torch.FloatTensor(st).reshape(self.obs_shape).to(
            self.dev), torch.zeros(self.reward_shape).to(
                self.dev), torch.zeros(self.gamma_shape).to(self.dev)
        #return st, 0,False

#    def get_obs(self,obs):
#        return torch.from_numpy(obs).detach().float().view(1,config['obs_space'])

    def step(self, action):

        st, rt, dt, _ = self.env.step(action)

        if self.get_img_from_render:
            st = self.env.render(mode='rgb_array')
            st = np.resize(st, self.obs_shape) / 255.


#        print(st)
        st = torch.FloatTensor(st).reshape(self.obs_shape).to(self.dev)
        rt = torch.FloatTensor([rt]).reshape(self.reward_shape).to(self.dev)
        if self.num_env == 1:
            dt = torch.FloatTensor([dt]).reshape(self.gamma_shape).to(self.dev)
        else:
            dt = torch.FloatTensor(dt.astype(int)).reshape(
                self.gamma_shape).to(self.dev)

        return st, rt, dt

    def end_dummy(self):
        return torch.zeros(self.obs_shape).to(self.dev), torch.zeros(
            self.reward_shape).to(self.dev), torch.zeros(self.gamma_shape).to(
                self.dev)

    def render(self):
        self.env.render()

    def close(self):
        self.env.close()
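A minimal usage sketch for the wrapper above; 'CartPole-v1' and the shape entries are placeholder config values, only methods defined in env_cover are used, and the classic gym step/reset API (as in the class itself) is assumed.

config = {'num_envs': 1, 'game_name': 'CartPole-v1', 'get_img_from_render': False,
          'obs_space': (1, 4), 'reward_space': (1, 1), 'gamma_space': (1, 1)}
env = env_cover(config, torch.device('cpu'))
state, reward, done = env.reset()
for _ in range(200):
    # sample a random action from the wrapped gym env and step the wrapper
    state, reward, done = env.step(env.env.action_space.sample())
    if done.item():
        state, reward, done = env.reset()
env.close()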
Example #24
while frame_idx < max_frames:

    log_probs = []
    values = []
    rewards = []
    masks = []
    entropy = 0

    # rollout trajectory
    for _ in range(num_steps):
        state = torch.FloatTensor(state).to(device)
        dist, value = model(state)

        action = dist.sample()
        next_state, reward, done, _ = envs.step(action.cpu().numpy())

        log_prob = dist.log_prob(action)
        entropy += dist.entropy().mean()

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

        state = next_state
        frame_idx += 1

        if frame_idx % 100 == 0:
            test_rewards.append(np.mean([test_env() for _ in range(10)]))
            plot(frame_idx, test_rewards)
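This example is cut off after the periodic evaluation. In this rollout pattern the loop is normally followed by bootstrapping the last state and an actor-critic update, roughly like the hedged sketch below; the compute_gae helper, the optimizer, and the loss coefficients are assumptions not shown in the snippet.

    # bootstrap the value of the state the rollout ended in
    next_state = torch.FloatTensor(next_state).to(device)
    _, next_value = model(next_state)
    returns = compute_gae(next_value, rewards, masks, values)  # assumed helper

    # flatten the per-step lists and form advantages
    log_probs = torch.cat(log_probs)
    returns   = torch.cat(returns).detach()
    values    = torch.cat(values)
    advantage = returns - values

    actor_loss  = -(log_probs * advantage.detach()).mean()
    critic_loss = advantage.pow(2).mean()
    loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy  # assumed coefficients

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()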
Example #25
def train(args):
	# hyper-params:
	frame_idx  		 = 0
	hidden_size      = args.hidden_size
	lr               = args.lr
	num_steps        = args.num_steps
	mini_batch_size  = args.mini_batch_size
	ppo_epochs       = args.ppo_epochs
	threshold_reward = args.threshold_reward
	max_frames 		 = args.max_frames
	# test_rewards 	 = []
	num_envs 		 = args.num_envs
	test_epochs		 = args.test_epochs
	resume_training	 = args.resume_training
	best_test_reward = 0.0
	urdf_path		 = os.path.join(BASE_DIR, os.pardir, "snake/snake.urdf")
	log_dir 		 = args.log_dir

	now = datetime.now()
	log_dir = log_dir + '_' + now.strftime('%d_%m_%Y_%H_%M_%S')

	# Check cuda availability.
	use_cuda = torch.cuda.is_available()
	device = torch.device("cuda" if use_cuda else "cpu")


	p.connect(p.DIRECT)
	writer = SummaryWriter(log_dir)

	# Create training log.
	textio = utils.IOStream(os.path.join(log_dir, 'train.log'), args=args)
	# textio.log_params(device, num_envs, lr, threshold_reward)	
	utils.logFiles(log_dir)

	# create multiple environments.
	envs = [utils.make_env(p, urdf_path, args=args) for i in range(num_envs)]
	envs = SubprocVecEnv(envs)

	# pdb.set_trace()	# Debug
	num_inputs = envs.observation_space.shape[0]
	num_outputs = envs.action_space.shape[0]

	# Create Policy/Network
	net = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
	optimizer = optim.Adam(net.parameters(), lr=lr)

	# If use pretrained policy.
	if resume_training:
		if os.path.exists(resume_training):
			checkpoint = torch.load(resume_training)
			frame_idx = checkpoint['frame_idx']
			net.load_state_dict(checkpoint['model'])
			best_test_reward = checkpoint['best_test_reward']

	# Initial Reset for Environment.
	state = envs.reset()
	early_stop = False

	# Create env for policy testing.
	robot = snake.Snake(p, urdf_path, args=args)
	env = SnakeGymEnv(robot, args=args)

	print_('\nTraining Begins ...', color='r', style='bold')
	textio.log('Training Begins ...')
	while frame_idx < max_frames and not early_stop:
		print_('\nTraining Policy!', color='r', style='bold')
		textio.log('\n############## Epoch: %0.5d ##############'%(int(frame_idx/20)))

		# Memory buffers
		log_probs = []
		values    = []
		states    = []
		actions   = []
		rewards   = []
		masks     = []
		entropy   = 0
		total_reward = 0.0

		for i in range(num_steps):
			print('Steps taken: {} & Epoch: {}\r'.format(i, int(frame_idx/20)), end="")
			state = torch.FloatTensor(state).to(device)

			# Find action using policy.
			dist, value = net(state)
			action = dist.sample()
			action = action #HACK

			# Take actions and find MDP.
			next_state, reward, done, _ = envs.step(action.cpu().numpy())
			total_reward += sum(reward)
			textio.log('Steps: {} and Reward: {}'.format(int(frame_idx%20), total_reward))

			# Calculate log(policy)
			log_prob = dist.log_prob(action)
			entropy += dist.entropy().mean()

			# Create Experiences
			log_probs.append(log_prob)
			values.append(value)
			rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
			masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
			states.append(state)
			actions.append(action)
			
			# Update state.
			state = next_state
			frame_idx += 1

			# Test Trained Policy.
			if frame_idx % 40 == 0:
				print_('\n\nEvaluate Policy!', color='bl', style='bold')
				test_reward = np.mean([utils.test_env(env, net, test_idx) for test_idx in range(test_epochs)])

				# test_rewards.append(test_reward)
				# utils.plot(frame_idx, test_rewards)	# not required due to tensorboardX.
				writer.add_scalar('test_reward', test_reward, frame_idx)
				
				print_('\nTest Reward: {}\n'.format(test_reward), color='bl', style='bold')
				textio.log('Test Reward: {}'.format(test_reward))

				# Save various factors of training.
				snap = {'frame_idx': frame_idx,
						'model': net.state_dict(),
						'best_test_reward': best_test_reward,
						'optimizer' : optimizer.state_dict()}

				if best_test_reward < test_reward:
					save_checkpoint(snap, os.path.join(log_dir, 'weights_bestPolicy.pth'))
					best_test_reward = test_reward
				save_checkpoint(snap, os.path.join(log_dir,'weights.pth'))
				if test_reward > threshold_reward: early_stop = True
			if frame_idx % 1000 == 0:
				if not os.path.exists(os.path.join(log_dir, 'models')): os.mkdir(os.path.join(log_dir, 'models'))
				save_checkpoint(snap, os.path.join(log_dir, 'models', 'weights_%0.5d.pth'%frame_idx))

				
		# Calculate Returns
		next_state = torch.FloatTensor(next_state).to(device)
		_, next_value = net(next_state)
		returns = compute_gae(next_value, rewards, masks, values)

		# Concatenate experiences for multiple environments.
		returns   = torch.cat(returns).detach()
		log_probs = torch.cat(log_probs).detach()
		values    = torch.cat(values).detach()
		states    = torch.cat(states)
		actions   = torch.cat(actions)
		advantage = returns - values
		
		writer.add_scalar('reward/episode', total_reward, frame_idx)
		textio.log('Total Training Reward: {}'.format(total_reward))

		# Update the Policy.
		ppo_update(net, optimizer, ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage, writer, frame_idx)
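compute_gae and ppo_update are imported from elsewhere in this code base and are not shown here. Below is a minimal sketch of a GAE helper matching the call compute_gae(next_value, rewards, masks, values) above, with gamma and tau as assumed defaults; it is an illustration, not the authors' implementation.

def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    # Walk the rollout backwards, bootstrapping from the value of the last state.
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns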