def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    # envs = [make_env(args.env_name, args.seed, i, args.log_dir)
    #             for i in range(args.num_processes)]
    envs = [make_env_test()]

    # TODO: restore the multi-process env creation above once testing is done.
    args.num_processes = len(envs)
    # REMEMBER: num_processes is overridden here to match the single test env.

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0], *obs_shape[1:])
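    # Frame stacking is effectively disabled in this variant, so the observation shape passes through unchanged.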

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        # if args.num_stack > 1:
        #     current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    # Load model
    actor_critic = torch.load("./checkpoint.pt")
    print(actor_critic)
    actor_critic = actor_critic[0]
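    # checkpoint.pt presumably holds a [model, ob_rms] pair (mirroring the save block below); index 0 is the policy network.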

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
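            # volatile=True is the legacy (pre-0.4) PyTorch way of running inference without building a graph.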
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            #envs.envs[0].render()
            #time.sleep(0.1)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
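            # final_rewards holds the return of each process's last finished episode,
            # while episode_rewards keeps accumulating the episode still in progress.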

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic.get_value(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True)).data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
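        # compute_returns fills the rollout storage with bootstrapped (optionally GAE) return targets used by agent.update below.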

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                print("printed visdom plot")
                # Sometimes monitor doesn't properly flush the outputs
                #win = visdom_plot(viz, win, args.log_dir, args.env_name,
                #                  args.algo, args.num_frames)
            except IOError:
                pass
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    train_envs = make_vec_envs(args.env_name,
                               args.seed,
                               args.num_processes,
                               args.gamma,
                               args.no_norm,
                               args.num_stack,
                               args.log_dir,
                               args.add_timestep,
                               device,
                               allow_early_resets=False)

    if args.eval_interval:
        eval_envs = make_vec_envs(args.env_name,
                                  args.seed + args.num_processes,
                                  args.num_processes,
                                  args.gamma,
                                  args.no_norm,
                                  args.num_stack,
                                  eval_log_dir,
                                  args.add_timestep,
                                  device,
                                  allow_early_resets=True,
                                  eval=True)

        if eval_envs.venv.__class__.__name__ == "VecNormalize":
            eval_envs.venv.ob_rms = train_envs.venv.ob_rms
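            # Evaluation reuses the training envs' running observation statistics so both apply the same normalization.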
    else:
        eval_envs = None

    # FIXME this is very specific to Pommerman env right now
    actor_critic = create_policy(train_envs.observation_space,
                                 train_envs.action_space,
                                 name='pomm',
                                 nn_kwargs={
                                     'batch_norm': args.algo != 'acktr',
                                     'recurrent': args.recurrent_policy,
                                     'hidden_size': 512,
                                 },
                                 train=True)

    actor_critic.to(device)

    if args.algo.startswith('a2c'):
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               lr_schedule=lr_update_schedule,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo.startswith('ppo'):
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         lr_schedule=lr_update_schedule,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.algo.endswith('sil'):
        agent = algo.SIL(agent,
                         update_ratio=args.sil_update_ratio,
                         epochs=args.sil_epochs,
                         batch_size=args.sil_batch_size,
                         value_loss_coef=args.sil_value_loss_coef
                         or args.value_loss_coef,
                         entropy_coef=args.sil_entropy_coef
                         or args.entropy_coef)
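        # SIL additionally keeps an off-policy replay buffer that is passed to agent.update each iteration.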
        replay = ReplayStorage(5e5,
                               args.num_processes,
                               args.gamma,
                               0.1,
                               train_envs.observation_space.shape,
                               train_envs.action_space,
                               actor_critic.recurrent_hidden_state_size,
                               device=device)
    else:
        replay = None

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              train_envs.observation_space.shape,
                              train_envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = train_envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = train_envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.tensor([[0.0] if done_ else [1.0] for done_ in done],
                                 device=device)
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)
            if replay is not None:
                replay.insert(rollouts.obs[step],
                              rollouts.recurrent_hidden_states[step], action,
                              reward, done)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy, other_metrics = agent.update(
            rollouts, j, replay)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model.state_dict(),
                hasattr(train_envs.venv, 'ob_rms') and train_envs.venv.ob_rms
                or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * update_factor

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {}, last {} mean/median reward {:.1f}/{:.1f}, "
                "min / max reward {:.1f}/{:.1f}, value/action loss {:.5f}/{:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss),
                end=', ' if other_metrics else '\n')
            if 'sil_value_loss' in other_metrics:
                print("SIL value/action loss {:.1f}/{:.1f}.".format(
                    other_metrics['sil_value_loss'],
                    other_metrics['sil_action_loss']))

        if args.eval_interval and len(
                episode_rewards) > 1 and j > 0 and j % args.eval_interval == 0:
            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 50:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                           for done_ in done],
                                          device=device)
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
def main():
    import copy
    import glob
    import os
    import time
    import matplotlib.pyplot as plt

    import gym
    import numpy as np
    import torch
    torch.multiprocessing.set_start_method('spawn')
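    # The 'spawn' start method must be set before any worker processes are created; it is required when subprocesses use CUDA.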

    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    from gym.spaces import Discrete

    from arguments import get_args
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from envs import make_env
    from img_env_corner import ImgEnv, IMG_ENVS
    from model import Policy
    from storage import RolloutStorage
    from utils import update_current_obs, eval_episode
    from torchvision import transforms
    from visdom import Visdom

    import algo

    viz = Visdom(port=8097)

    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    plot_rewards = []
    plot_policy_loss = []
    plot_value_loss = []
    # x = np.array([0])
    # y = np.array([0])
    # counter = 0
    # win = viz.line(
    #     X=x,
    #     Y=y,
    #     win="test1",
    #     name='Line1',
    #     opts=dict(
    #         title='Reward',
    #     )
    #     )
    # win2 = viz.line(
    #     X=x,
    #     Y=y,
    #     win="test2",
    #     name='Line2',
    #     opts=dict(
    #         title='Policy Loss',
    #     )
    #     )
    # win3 = viz.line(
    #     X=x,
    #     Y=y,
    #     win="test3",
    #     name='Line3',
    #     opts=dict(
    #         title='Value Loss',
    #     )
    #     )

    args = get_args()
    if args.no_cuda:
        args.cuda = False
    print(args)
    assert args.algo in ['a2c', 'ppo', 'acktr']
    if args.recurrent_policy:
        assert args.algo in ['a2c', 'ppo'], \
            'Recurrent policy is not implemented for ACKTR'

    num_updates = int(args.num_frames) // args.num_steps // args.num_processes

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    toprint = ['seed', 'lr', 'nat', 'resnet']
    if args.env_name in IMG_ENVS:
        toprint += ['window', 'max_steps']
    toprint.sort()
    name = args.tag
    args_param = vars(args)
    os.makedirs(os.path.join(args.out_dir, args.env_name), exist_ok=True)
    for arg in toprint:
        if arg in args_param and (args_param[arg] or arg in ['gamma', 'seed']):
            if args_param[arg] is True:
                name += '{}_'.format(arg)
            else:
                name += '{}{}_'.format(arg, args_param[arg])
    model_dir = os.path.join(args.out_dir, args.env_name, args.algo)
    os.makedirs(model_dir, exist_ok=True)

    results_dict = {
        'episodes': [],
        'rewards': [],
        'args': args
    }
    torch.set_num_threads(1)
    eval_env = make_env(args, 'cifar10', args.seed, 1, None,
            args.add_timestep, natural=args.nat, train=False)
    envs = make_env(args, 'cifar10', args.seed, 1, None,
            args.add_timestep, natural=args.nat, train=True)

    # print(envs)
    # envs = envs[0]

    # if args.num_processes > 1:
    #     envs = SubprocVecEnv(envs)
    # else:
    #     envs = DummyVecEnv(envs)
    # eval_env = DummyVecEnv(eval_env)
    # if len(envs.observation_space.shape) == 1:
    #     envs = VecNormalize(envs, gamma=args.gamma)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
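    # Stacked frames are concatenated along the channel dimension.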

    actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy,
                          dataset=args.env_name, resnet=args.resnet,
                          pretrained=args.pretrained)
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    action_space = envs.action_space
    if args.env_name in IMG_ENVS:
        action_space = np.zeros(2)
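        # For the image envs a 2-dim dummy array stands in for the real action space when sizing the rollout storage.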
    # obs_shape = envs.observation_space.shape
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    obs = envs.reset()
    update_current_obs(obs, current_obs, obs_shape, args.num_stack)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        # envs.display_original(j)
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                        rollouts.observations[step],
                        rollouts.states[step],
                        rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)

            # envs.display_step(step, j)

            # print("OBS", obs)

            # print("REWARD", reward)
            # print("DONE", done)
            # print("INFO", info)


            reward = torch.from_numpy(np.expand_dims(np.stack([reward]), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in [done]])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs, current_obs, obs_shape, args.num_stack)
            rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks)

            # print("envs.curr_img SHAPE: ", envs.curr_img.shape)
            #display_state = envs.curr_img
            # display_state[:, envs.pos[0]:envs.pos[0]+envs.window, envs.pos[1]:envs.pos[1]+envs.window] = 5
            # display_state = custom_replace(display_state, 1, 0)
            # display_state[:, envs.pos[0]:envs.pos[0]+envs.window, envs.pos[1]:envs.pos[1]+envs.window] = \
            #     envs.curr_img[:, envs.pos[0]:envs.pos[0]+envs.window, envs.pos[1]:envs.pos[1]+envs.window]
            # img = transforms.ToPILImage()(display_state)
            # img.save("state_cifar/"+"state"+str(j)+"_"+str(step)+".png")

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        

        if j % args.save_interval == 0:
            torch.save((actor_critic.state_dict(), results_dict), os.path.join(
                model_dir, name + 'cifar_model_ppo_ex1_corner.pt'))

        if j % args.log_interval == 0:
            end = time.time()
            total_reward = eval_episode(eval_env, actor_critic, args)

            

            results_dict['rewards'].append(total_reward)
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, reward {:.1f} entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       np.mean(results_dict['rewards'][-10:]), dist_entropy,
                       value_loss, action_loss))


            plot_rewards.append(np.mean(results_dict['rewards'][-10:]))
            plot_policy_loss.append(action_loss)
            plot_value_loss.append(value_loss)


    plt.plot(range(len(plot_rewards)), plot_rewards)
    plt.savefig("rewards_corner.png")
    plt.close()

    
    plt.plot(range(len(plot_policy_loss)), plot_policy_loss)
    plt.savefig("policyloss_corner.png")
    plt.close()

    
    plt.plot(range(len(plot_value_loss)), plot_value_loss)
    plt.savefig("valueloss_corner.png")
    plt.close()
def main():
    torch.set_num_threads(1)
    device = torch.device("cpu")

    # if args.vis:
    #     from visdom import Visdom
    #     viz = Visdom(port=args.port)
    #     win = None

    # envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
    #                     args.gamma, args.log_dir, args.add_timestep, device, False)

    observation_space = Box(low=0, high=10000, shape=(26,), dtype=np.float32)  # Box(84,84,4)
    action_space = Discrete(7)  # Discrete(4)

    actor_critic = Policy(observation_space.shape, action_space, base_kwargs={'recurrent': None})
    actor_critic.to(device)

    # if args.algo == 'a2c':
    #     agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
    #                            args.entropy_coef, lr=args.lr,
    #                            eps=args.eps, alpha=args.alpha,
    #                            max_grad_norm=args.max_grad_norm)
    # elif args.algo == 'ppo':
    #     agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
    #                      args.value_loss_coef, args.entropy_coef, lr=args.lr,
    #                            eps=args.eps,
    #                            max_grad_norm=args.max_grad_norm)
    # elif args.algo == 'acktr':
    agent = algo.A2C_ACKTR(actor_critic, value_loss_coef=0.1,
                           entropy_coef=0.01, acktr=True)

    rollouts = RolloutStorage(8000, 1, observation_space.shape, action_space, actor_critic.recurrent_hidden_state_size)

    obs = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
    rollouts.obs[0].copy_(torch.Tensor(obs))
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    f = open('poktr_rtmdp_20_2.txt', 'w')
    f.write("\noriginal loss(schedule 6 packets):")
    start = time.time()
    for j in range(num_updates):  # num_updates
        net = Net()
        node_list, path_list = net.read_graph(net.node_list, net.path_list)
        startnode = node_list[0]  # start node
        net.get_data(startnode)
        count = 0
        remove_count = 0  # counts the number of dropped packets
        end_time = startnode.messages[0].end_time
        pre_action_item = random.randint(0, 6)
        pre_action_item_oh = convert_one_hot(pre_action_item, 7)
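        # The previous action is carried along as a 7-way one-hot vector and appended to each observation.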
        s = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, end_time, pre_action_item_oh]
        states = [[0], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]  # stores the state of every node
        ep_r = 0
        ep_acc_r = 0
        obs[:] = s
        reward_ten = torch.Tensor(1, 1)

        pre_value = torch.FloatTensor([[0.1]])
        pre_action = torch.Tensor([[random.randint(0, 6)]])
        pre_action_log_prob = torch.FloatTensor([[-1.]])
        pre_recurrent_hidden_states = torch.FloatTensor([[0.]])
        pre_masks = torch.FloatTensor([[0.]])
        for step in range(8000):
            # Sample actions
            count += 1
            old_action_log_prob = torch.Tensor([[0]])
            # print(rollouts, rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step])
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])
                action_item = action.item()  # convert the tensor action to a Python int
                action_item_oh = convert_one_hot(action_item, 7)

            # Observe reward and next obs
            obs, reward, done, states, remove_count, acc_r, su_packets = net.schedule(pre_action_item, count, states, node_list, path_list,
                                                                            remove_count)

            ep_r += reward
            ep_acc_r += acc_r
            reward_ten[[0]] = reward
            # for info in infos:
            #     if 'episode' in info.keys():
            #         episode_rewards.append(info['episode']['r'])
            obs.extend(pre_action_item_oh)
            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done else [1.0]])
            # print((obs), recurrent_hidden_states, torch.Tensor(action), type(action_log_prob), type(value), type(reward), type(masks))
            rollouts.insert(torch.Tensor(obs), recurrent_hidden_states, action, action_log_prob, value, reward_ten, masks)
            # rollouts.insert(torch.Tensor(obs), pre_recurrent_hidden_states, pre_action, pre_action_log_prob, pre_value, reward_ten, pre_masks)

            pre_action = action
            pre_action_item = action_item
            pre_action_log_prob = action_log_prob
            pre_recurrent_hidden_states = recurrent_hidden_states
            pre_value = value
            pre_action_item_oh = convert_one_hot(pre_action_item, 7)

        f.write("\ntime:"+str(time.strftime('%H:%M:%S', time.localtime(time.time())))+"|"+str(j)+"|ep_r:"+str(ep_r)+"|pakcets:"+str(su_packets)+"|remove:"+str(remove_count)+"|ep_acc_r:"+str(ep_acc_r / 8000))
        f.flush()
        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, False, 0.99, 0.95)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        print("time:", time.strftime('%H:%M:%S', time.localtime(time.time())), "|", j, "|ep_r:", ep_r, "|pakcets:",
              su_packets, "|remove:", remove_count, "|ep_acc_r:", ep_acc_r / 8000, "|value_loss:", value_loss,
              "|action_loss:", action_loss, "|entropy:", dist_entropy)
        rollouts.after_update()
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                        args.gamma, args.log_dir, args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                        envs.observation_space.shape, envs.action_space,
                        actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                          getattr(get_vec_normalize(envs), 'ob_rms', None)]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       len(episode_rewards),
                       np.mean(episode_rewards),
                       np.median(episode_rewards),
                       np.min(episode_rewards),
                       np.max(episode_rewards), dist_entropy,
                       value_loss, action_loss))

        if (args.eval_interval is not None
                and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(
                args.env_name, args.seed + args.num_processes, args.num_processes,
                args.gamma, eval_log_dir, args.add_timestep, device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(args.num_processes,
                            actor_critic.recurrent_hidden_state_size, device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)
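                # Greedy (deterministic) actions are used during evaluation.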

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".
                format(len(eval_episode_rewards),
                       np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
def main():
	print("#######")
	print(
			"WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
	print("#######")

	torch.set_num_threads(1)

	if args.vis:
		from visdom import Visdom
		viz = Visdom(port=args.port)
		win = None
		win_options = None
		win_term = None

	envs = [make_env(args.env_name, args.seed, i, log_dir, args.add_timestep)
	        for i in range(args.num_processes)]

	if args.num_processes > 1:
		envs = SubprocVecEnv(envs)
	else:
		envs = DummyVecEnv(envs)

	if len(envs.observation_space.shape) == 1:
		envs = VecNormalize(envs)

	obs_shape = envs.observation_space.shape
	obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

	if args.algo == 'a2c':
		agent = algo.A2C_ACKTR(obs_shape, envs.action_space, args.recurrent_policy, args.value_loss_coef,
		                       args.entropy_coef, envs, lr=args.lr,
		                       eps=args.eps, alpha=args.alpha,
		                       max_grad_norm=args.max_grad_norm, cuda=args.cuda)

	elif args.algo == 'a2oc':
		agent = algo.A2OC(envs, args, log_dir)

	elif args.algo == 'ppo':
		agent = algo.PPO(obs_shape, envs.action_space, args.recurrent_policy, args.clip_param, args.ppo_epoch, args.num_mini_batch,
		                 args.value_loss_coef, args.entropy_coef, envs, lr=args.lr,
		                 eps=args.eps,
		                 max_grad_norm=args.max_grad_norm, cuda=args.cuda)

	elif args.algo == 'acktr':
		agent = algo.A2C_ACKTR(obs_shape, envs.action_space, args.recurrent_policy, args.value_loss_coef,
		                       args.entropy_coef, acktr=True)

	else:
		raise ValueError('args.algo does not match any expected algorithm')

	rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, agent.envs.action_space, agent.state_size)
	current_obs = torch.zeros(args.num_processes, *obs_shape)

	def update_current_obs(obs):
		shape_dim0 = agent.envs.observation_space.shape[0]
		obs = torch.from_numpy(obs).float()
		if args.num_stack > 1:
			current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
		current_obs[:, -shape_dim0:] = obs

	obs = agent.envs.reset()
	update_current_obs(obs)

	rollouts.observations[0].copy_(current_obs)

	# These variables are used to compute average rewards for all processes.
	episode_rewards = torch.zeros([args.num_processes, 1])
	final_rewards = torch.zeros([args.num_processes, 1])

	if args.cuda:
		current_obs = current_obs.cuda()
		rollouts.cuda()

	title_name = args.env_name + " | " + args.algo
	if args.algo == "a2oc":
		title_name += " | "+ str(args.num_options) + " | " + str(args.delib)

	start = time.time()
	for j in range(num_updates):
		for step in range(args.num_steps):
			# Sample actions
			with torch.no_grad():
				value, action, action_log_prob, states, obs, reward, done, _ = agent.act(
						rollouts.observations[step],
						rollouts.states[step],
						rollouts.masks[step])
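			# In this variant agent.act also steps the environments itself and returns obs, reward and done along with the policy outputs.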

			episode_rewards += reward

			# If done then clean the history of observations.
			masks = torch.tensor([[0.0] if done_ else [1.0] for done_ in done])
			final_rewards *= masks
			final_rewards += (1 - masks) * episode_rewards
			episode_rewards *= masks

			if args.cuda:
				masks = masks.cuda()

			if current_obs.dim() == 4:
				current_obs *= masks.unsqueeze(2).unsqueeze(2)
			else:
				current_obs *= masks

			update_current_obs(obs)
			rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks)

		with torch.no_grad():
			next_value = agent.get_value(rollouts.observations[-1],
			                             rollouts.states[-1],
			                             rollouts.masks[-1]).detach()

		rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

		value_loss, action_loss, termination_loss, dist_entropy = agent.update(rollouts)

		rollouts.after_update()

		if j % args.save_interval == 0 and args.save_dir != "":
			save_path = os.path.join(args.save_dir)
			try:
				os.makedirs(save_path)
			except OSError:
				pass

			# A really ugly way to save a model to CPU
			save_model = agent.actor_critic
			if args.cuda:
				save_model = copy.deepcopy(agent.actor_critic).cpu()

			save_model = [save_model,
			              hasattr(agent.envs, 'ob_rms') and agent.envs.ob_rms or None]

			torch.save(save_model, os.path.join(save_path, exp_config_str, exp_config_str + ".pt"))

		if j % args.log_interval == 0:
			end = time.time()
			total_num_steps = (j + 1) * args.num_processes * args.num_steps
			agent.log(total_num_steps)
			print(
				"Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}, termination_loss: {:.5f}".
					format(j, total_num_steps,
				           int(total_num_steps / (end - start)),
				           final_rewards.mean(),
				           final_rewards.median(),
				           final_rewards.min(),
				           final_rewards.max(), dist_entropy,
				           value_loss, action_loss, termination_loss))
		if args.vis and j % args.vis_interval == 0:
			try:
				# Sometimes monitor doesn't properly flush the outputs
				win = visdom_plot(viz, win, log_dir, title_name,
				                  args.algo, args.num_frames)
				if args.algo == 'a2oc':
					win_options = options_plot(viz, win_options, args.num_frames, title_name, agent.log_options_file)
					win_term = term_prob_plot(viz, win_term, args.num_frames, title_name, agent.log_term_prob)
			except IOError:
				pass