Example #1
    def initialise_policy(self):

        # initialise policy network
        policy_net = Policy(
            args=self.args,
            #
            pass_state_to_policy=self.args.pass_state_to_policy,
            pass_latent_to_policy=self.args.pass_latent_to_policy,
            pass_belief_to_policy=self.args.pass_belief_to_policy,
            pass_task_to_policy=self.args.pass_task_to_policy,
            dim_state=self.args.state_dim,
            dim_latent=self.args.latent_dim * 2,
            dim_belief=self.args.belief_dim,
            dim_task=self.args.task_dim,
            #
            hidden_layers=self.args.policy_layers,
            activation_function=self.args.policy_activation_function,
            policy_initialisation=self.args.policy_initialisation,
            #
            action_space=self.envs.action_space,
            init_std=self.args.policy_init_std,
        ).to(device)

        # initialise policy trainer
        if self.args.policy == 'a2c':
            policy = A2C(
                self.args,
                policy_net,
                self.args.policy_value_loss_coef,
                self.args.policy_entropy_coef,
                policy_optimiser=self.args.policy_optimiser,
                policy_anneal_lr=self.args.policy_anneal_lr,
                train_steps=self.num_updates,
                optimiser_vae=self.vae.optimiser_vae,
                lr=self.args.lr_policy,
                eps=self.args.policy_eps,
            )
        elif self.args.policy == 'ppo':
            policy = PPO(
                self.args,
                policy_net,
                self.args.policy_value_loss_coef,
                self.args.policy_entropy_coef,
                policy_optimiser=self.args.policy_optimiser,
                policy_anneal_lr=self.args.policy_anneal_lr,
                train_steps=self.num_updates,
                lr=self.args.lr_policy,
                eps=self.args.policy_eps,
                ppo_epoch=self.args.ppo_num_epochs,
                num_mini_batch=self.args.ppo_num_minibatch,
                use_huber_loss=self.args.ppo_use_huberloss,
                use_clipped_value_loss=self.args.ppo_use_clipped_value_loss,
                clip_param=self.args.ppo_clip_param,
                optimiser_vae=self.vae.optimiser_vae,
            )
        else:
            raise NotImplementedError

        return policy
Example #2
    def initialise_policy(self):

        if hasattr(self.envs.action_space, 'low'):
            action_low = self.envs.action_space.low
            action_high = self.envs.action_space.high
        else:
            action_low = action_high = None

        # initialise policy network
        policy_net = Policy(
            args=self.args,
            #
            pass_state_to_policy=self.args.pass_state_to_policy,
            pass_latent_to_policy=False,  # use metalearner.py if you want to use the VAE
            pass_belief_to_policy=self.args.pass_belief_to_policy,
            pass_task_to_policy=self.args.pass_task_to_policy,
            dim_state=self.args.state_dim,
            dim_latent=0,
            dim_belief=self.args.belief_dim,
            dim_task=self.args.task_dim,
            #
            hidden_layers=self.args.policy_layers,
            activation_function=self.args.policy_activation_function,
            policy_initialisation=self.args.policy_initialisation,
            #
            action_space=self.envs.action_space,
            init_std=self.args.policy_init_std,
            norm_actions_of_policy=self.args.norm_actions_of_policy,
            action_low=action_low,
            action_high=action_high,
        ).to(device)

        # initialise policy trainer
        if self.args.policy == 'a2c':
            policy = A2C(
                self.args,
                policy_net,
                self.args.policy_value_loss_coef,
                self.args.policy_entropy_coef,
                policy_optimiser=self.args.policy_optimiser,
                policy_anneal_lr=self.args.policy_anneal_lr,
                train_steps=self.num_updates,
                lr=self.args.lr_policy,
                eps=self.args.policy_eps,
            )
        elif self.args.policy == 'ppo':
            policy = PPO(
                self.args,
                policy_net,
                self.args.policy_value_loss_coef,
                self.args.policy_entropy_coef,
                policy_optimiser=self.args.policy_optimiser,
                policy_anneal_lr=self.args.policy_anneal_lr,
                train_steps=self.num_updates,
                lr=self.args.lr_policy,
                eps=self.args.policy_eps,
                ppo_epoch=self.args.ppo_num_epochs,
                num_mini_batch=self.args.ppo_num_minibatch,
                use_huber_loss=self.args.ppo_use_huberloss,
                use_clipped_value_loss=self.args.ppo_use_clipped_value_loss,
                clip_param=self.args.ppo_clip_param,
            )
        else:
            raise NotImplementedError

        return policy
Example #3
env = gym.make(env_name)
seed = 123
env.seed(seed)
torch.manual_seed(seed)
log_interval = 10
lr = 1e-3

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

print('Env Name: %s | Seed: %d | State_dim: %d | Action_dim: %d | Algo: %s ' %
      (env_name, seed, state_dim, action_dim, algorithm_name))

if algorithm_name == 'a2c':
    model = A2C(state_dim, action_dim, lr=lr)
elif algorithm_name == 'trajcv':
    model = TrajCVPolicy(state_dim, action_dim, lr=lr)
else:
    raise NotImplementedError('No such algorithm.')

plotter = Plotter("%s_%s_plot" % (algorithm_name, env_name), log_interval)


def main():
    running_reward = 0

    for i_episode in range(501):

        state = env.reset()
        ep_reward = 0
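
Example #3 breaks off inside main(); the rest of the episode loop is not shown. A plausible continuation, modeled on the full loop in Example #5 (a sketch, not the original code; the exact model methods and the Plotter API are assumed):

        for t in range(1, 10000):
            # sample an action, step the environment, and record the reward
            action = model.select_action(state)
            state, reward, done, _ = env.step(action)
            model.save_reward(reward)
            ep_reward += reward
            if done:
                break

        # exponential moving average of episode returns
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        model.finish_episode()

        if i_episode % log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                i_episode, ep_reward, running_reward))
            # the Plotter created above would log running_reward here (its API is not shown)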
Example #4
def main():

    # make the environments
    if args.num_envs == 1:
        env = [gym.make(args.env_name)]
    else:
        env = [gym.make(args.env_name) for i in range(args.num_envs)]

    env = MultiGym(env, render=args.render)

    n_states = env.observation_space.shape
    n_actions = env.action_space.n
    print('state shape:', n_states, 'actions:', n_actions)

    policy = ConvPolicy(n_actions).to(device)
    optimizer = optim.RMSprop(policy.parameters(), lr=args.lr)

    if args.algo == 'ppo':
        sys.path.append('../')
        from algorithms.ppo import PPO
        update_algo = PPO(policy=policy,
                          optimizer=optimizer,
                          num_steps=args.num_steps,
                          num_envs=args.num_envs,
                          state_size=(4, 105, 80),
                          entropy_coef=args.entropy,
                          gamma=args.gamma,
                          device=device,
                          epochs=args.ppo_epochs)
    else:
        sys.path.append('../')
        from algorithms.a2c import A2C
        update_algo = A2C(policy=policy,
                          optimizer=optimizer,
                          num_steps=args.num_steps,
                          num_envs=args.num_envs,
                          state_size=(4, 105, 80),
                          entropy_coef=args.entropy,
                          gamma=args.gamma,
                          device=device)

    end_rewards = []

    try:
        print('starting episodes')
        idx = 0
        d = False
        reward_sum = np.zeros((args.num_envs))
        restart = True
        frame = env.reset()
        mask = torch.ones(args.num_envs)
        all_start = time.time()

        for update_idx in range(args.num_updates):
            update_algo.policy.train()

            # stack the frames
            s = train_state_proc.proc_state(frame, mask=mask)

            # insert state before getting actions
            update_algo.states[0].copy_(s)

            start = time.time()
            for step in range(args.num_steps):

                with torch.no_grad():
                    # get probability dist and values
                    p, v = update_algo.policy(update_algo.states[step])
                    a = Categorical(p).sample()

                # take action get response
                frame, r, d = env.step(
                    a.cpu().numpy() if args.num_envs > 1 else [a.item()])
                s = train_state_proc.proc_state(frame, mask)

                update_algo.insert_experience(step=step,
                                              s=s,
                                              a=a,
                                              v=v,
                                              r=r,
                                              d=d)

                mask = torch.tensor(1. - d).float()
                reward_sum = (reward_sum + r)

                # if any episode finished append episode reward to list
                if d.any():
                    end_rewards.extend(reward_sum[d])

                # reset any rewards that finished
                reward_sum = reward_sum * mask.numpy()

                idx += 1

            with torch.no_grad():
                _, next_val = update_algo.policy(update_algo.states[-1])

            update_algo.update(next_val.view(1, args.num_envs).to(device),
                               next_mask=mask.to(device))

            if args.lr_decay:
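                # cosine decay of the learning rate towards lr_min (defined outside this snippet)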
                for params in update_algo.optimizer.param_groups:
                    params['lr'] = (
                        lr_min + 0.5 * (args.lr - lr_min) *
                        (1 + np.cos(np.pi * idx / args.num_updates)))

            # update every so often by displaying results in term
            if (update_idx % args.log_interval
                    == 0) and (len(end_rewards) > 0):
                total_steps = (idx + 1) * args.num_envs * args.num_steps
                end = time.time()
                print(end_rewards[-10:])
                print('Updates {}\t  Time: {:.4f} \t FPS: {}'.format(
                    update_idx, end - start,
                    int(total_steps / (end - all_start))))
                print(
                    'Mean Episode Rewards: {:.2f} \t Min/Max Current Rewards: {}/{}'
                    .format(np.mean(end_rewards[-10:]), reward_sum.min(),
                            reward_sum.max()))

    except KeyboardInterrupt:
        pass

    torch.save(
        update_algo.policy.state_dict(),
        '../model_weights/{}_{}_conv.pth'.format(args.env_name, args.algo))

    import pandas as pd

    out_dict = {'avg_end_rewards': end_rewards}
    out_log = pd.DataFrame(out_dict)
    out_log.to_csv('../logs/{}_{}_rewards.csv'.format(args.env_name,
                                                      args.algo),
                   index=False)

    out_dict = {
        'actor losses': update_algo.actor_losses,
        'critic losses': update_algo.critic_losses,
        'entropy': update_algo.entropy_logs
    }
    out_log = pd.DataFrame(out_dict)
    out_log.to_csv('../logs/{}_{}_training_behavior.csv'.format(
        args.env_name, args.algo),
                   index=False)

    plt.plot(end_rewards)
    plt.show()
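
Example #4 uses a train_state_proc object that is created outside the snippet. A minimal sketch of what such a frame-stacking processor could look like, assuming 210x160 RGB Atari frames downsampled and stacked 4 deep into the (4, 105, 80) state size used above (class and argument names are hypothetical):

import numpy as np
import torch


class FrameStackProcessor:
    """Keeps a rolling 4-frame grayscale stack of shape (num_envs, 4, 105, 80)."""

    def __init__(self, num_envs, num_stack=4, height=105, width=80):
        self.stack = torch.zeros(num_envs, num_stack, height, width)

    def proc_state(self, frame, mask):
        # frame: (num_envs, 210, 160, 3) uint8 observations; a single env may be unbatched
        f = torch.from_numpy(np.asarray(frame, dtype=np.float32))
        if f.dim() == 3:
            f = f.unsqueeze(0)
        # grayscale, downsample by 2 to 105x80, scale to [0, 1]
        gray = f.mean(dim=-1)[:, ::2, ::2] / 255.0
        # zero the stack wherever the previous step ended an episode (mask == 0)
        self.stack = self.stack * mask.view(-1, 1, 1, 1)
        # drop the oldest frame and append the newest
        self.stack = torch.cat([self.stack[:, 1:], gray.unsqueeze(1)], dim=1)
        return self.stack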
Example #5
def worker(worker_id, algorithm_name, seed, return_dict):
    print('Worker %d (pid: %d) has started: algorithm_name <%s> seed <%d>.' %
          (worker_id, os.getpid(), algorithm_name, seed))
    env = gym.make(env_name)
    env.seed(seed)
    torch.manual_seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n

    if algorithm_name == 'a2c':
        model = A2C(state_dim,
                    action_dim,
                    lr=lr,
                    gamma=gamma,
                    v_update_epochs=v_update_epochs,
                    v_update_anneal=v_update_anneal,
                    epsilon_greedy_threshold=epsilon_greedy_threshold,
                    epsilon_anneal=epsilon_anneal,
                    re_sample_batch_size=re_sample_batch_size)
    elif algorithm_name == 'trajcv':
        model = TrajCVPolicy(state_dim,
                             action_dim,
                             lr=lr,
                             gamma=gamma,
                             v_update_epochs=v_update_epochs,
                             v_update_anneal=v_update_anneal,
                             epsilon_greedy_threshold=epsilon_greedy_threshold,
                             epsilon_anneal=epsilon_anneal,
                             re_sample_batch_size=re_sample_batch_size)
    else:
        raise NotImplementedError('No such algorithm.')

    reward_records = []

    running_reward = 0

    for i_episode in range(400):

        state = env.reset()
        ep_reward = 0

        for t in range(1, 10000):

            action = model.select_action(state)

            state, reward, done, _ = env.step(action)

            model.save_reward(reward)
            model.save_state_action(state, action, done)

            ep_reward += reward

            if done:
                break

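        # exponential moving average (weight 0.05 on the newest episode return)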
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        reward_records.append(running_reward)

        model.finish_episode()

        if i_episode % log_interval == 0:
            print(
                '{:>10}(seed:{:>4}-worker_id:{:>2})|Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'
                .format(algorithm_name, seed, worker_id, i_episode, ep_reward,
                        running_reward))

    env.close()
    return_dict[worker_id] = reward_records
    print('Worker %d has ended.' % worker_id)
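
A minimal sketch of how worker() might be driven, assuming it lives in a module where env_name, lr, gamma and the other globals it reads are defined; the seeds and the 'a2c' choice are illustrative:

import multiprocessing as mp

if __name__ == '__main__':
    manager = mp.Manager()
    return_dict = manager.dict()  # shared dict: worker_id -> reward records

    processes = []
    for worker_id, seed in enumerate([123, 231, 312]):
        p = mp.Process(target=worker,
                       args=(worker_id, 'a2c', seed, return_dict))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    # per-worker running-reward curves, e.g. for averaging or plotting
    all_records = dict(return_dict)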
Example #6
    def initialise_policy(self):

        # variables for task encoder (used for oracle)
        state_dim = self.envs.observation_space.shape[0]

        # TODO: this isn't ideal, find a nicer way to get the task dimension!
        if 'BeliefOracle' in self.args.env_name:
            task_dim = gym.make(self.args.env_name).observation_space.shape[0] - \
                       gym.make(self.args.env_name.replace('BeliefOracle', '')).observation_space.shape[0]
            latent_dim = self.args.latent_dim
            state_embedding_size = self.args.state_embedding_size
            use_task_encoder = True
        elif 'Oracle' in self.args.env_name:
            task_dim = gym.make(self.args.env_name).observation_space.shape[0] - \
                       gym.make(self.args.env_name.replace('Oracle', '')).observation_space.shape[0]
            latent_dim = self.args.latent_dim
            state_embedding_size = self.args.state_embedding_size
            use_task_encoder = True
        else:
            task_dim = latent_dim = state_embedding_size = 0
            use_task_encoder = False

        # initialise rollout storage for the policy
        self.policy_storage = OnlineStorage(
            self.args,
            self.args.policy_num_steps,
            self.args.num_processes,
            self.args.obs_dim,
            self.args.act_space,
            hidden_size=0,
            latent_dim=self.args.latent_dim,
            normalise_observations=self.args.norm_obs_for_policy,
            normalise_rewards=self.args.norm_rew_for_policy,
        )

        if hasattr(self.envs.action_space, 'low'):
            action_low = self.envs.action_space.low
            action_high = self.envs.action_space.high
        else:
            action_low = action_high = None

        # initialise policy network
        policy_net = Policy(
            # general
            state_dim=int(self.args.condition_policy_on_state) * state_dim,
            action_space=self.envs.action_space,
            init_std=self.args.policy_init_std,
            hidden_layers=self.args.policy_layers,
            activation_function=self.args.policy_activation_function,
            use_task_encoder=use_task_encoder,
            # task encoding things (for oracle)
            task_dim=task_dim,
            latent_dim=latent_dim,
            state_embed_dim=state_embedding_size,
            #
            normalise_actions=self.args.normalise_actions,
            action_low=action_low,
            action_high=action_high,
        ).to(device)

        # initialise policy
        if self.args.policy == 'a2c':
            # initialise policy trainer (A2C)
            self.policy = A2C(
                policy_net,
                self.args.policy_value_loss_coef,
                self.args.policy_entropy_coef,
                lr=self.args.lr_policy,
                eps=self.args.policy_eps,
                alpha=self.args.a2c_alpha,
            )
        elif self.args.policy == 'ppo':
            # initialise policy trainer (PPO)
            self.policy = PPO(
                policy_net,
                self.args.policy_value_loss_coef,
                self.args.policy_entropy_coef,
                lr=self.args.lr_policy,
                eps=self.args.policy_eps,
                ppo_epoch=self.args.ppo_num_epochs,
                num_mini_batch=self.args.ppo_num_minibatch,
                use_huber_loss=self.args.ppo_use_huberloss,
                use_clipped_value_loss=self.args.ppo_use_clipped_value_loss,
                clip_param=self.args.ppo_clip_param,
            )
        else:
            raise NotImplementedError
Example #7
if args.algo == 'ppo':
    sys.path.append('../')
    from algorithms.ppo import PPO
    update_algo = PPO(policy=policy,
                      optimizer=optimizer,
                      num_steps=args.num_steps,
                      num_envs=args.num_envs,
                      state_size=n_states,
                      entropy_coef=args.entropy,
                      gamma=args.gamma,
                      device=device,
                      recurrent=True,
                      rnn_size=args.hid_size,
                      epochs=args.ppo_epochs,
                      batch_size=args.batch_size)
else:
    sys.path.append('../')
    from algorithms.a2c import A2C
    update_algo = A2C(policy=policy,
                      optimizer=optimizer,
                      num_steps=args.num_steps,
                      num_envs=args.num_envs,
                      state_size=n_states,
                      entropy_coef=args.entropy,
                      gamma=args.gamma,
                      device=device,
                      recurrent=True,
                      rnn_size=args.hid_size)

end_rewards = []
gt = 0


def main():
    try:
        print('starting episodes')
        d = False
        idx = 0
        episodes = 0