Example #1
def main():
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    model_dir = Path('./models') / args.env_name / args.log_dir
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(str(log_dir))
    args.log_dir = str(log_dir)
    print('saving to', args.log_dir)

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
                for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    obs_shape = (envs.observation_space.n,)

    actor_critic = Policy(obs_shape, envs.action_space, args.dual_type, args.dual_rank, args.dual_emb_dim)

    if args.cuda:
        actor_critic.cuda()

    agent = algo.A2C_ACKTR(actor_critic=actor_critic, value_loss_coef=args.value_loss_coef,
                               entropy_coef=args.entropy_coef, dual_act_coef=args.dual_act_coef,
                               dual_state_coef=args.dual_state_coef, dual_sup_coef=args.dual_sup_coef,
                               policy_coef=args.policy_coef, emb_coef=args.dual_emb_coef,
                               demo_eta=args.demo_eta, demo_eps=args.demo_eps,
                               lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.base.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)
    def update_current_obs(obs):
        obs = torch.from_numpy(obs).float()
        current_obs[:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                        rollouts.observations[step],
                        rollouts.states[step],
                        rollouts.masks[step])
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)

            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
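            # masks is 0 for processes whose episode just ended: their accumulated
            # episode_rewards value is moved into final_rewards and the accumulator is
            # reset, so final_rewards holds each process's most recently completed return.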

            if args.cuda:
                masks = masks.cuda()


            current_obs *= masks.squeeze(1)

            update_current_obs(obs)
            rollouts.insert(current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
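        # compute_returns bootstraps from next_value; with args.use_gae it builds
        # GAE(lambda) returns (args.tau acting as lambda), otherwise plain discounted returns.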

        value_loss, action_loss, dist_entropy, \
        dual_act_loss, dual_state_loss, dual_sup, emb_loss, \
        state_acc, action_acc, sup_acc, miss_rate = agent.update(rollouts)
        
        rollouts.after_update()

        if j % args.save_interval == 0:
            save_path = run_dir / 'incremental'
            if not save_path.exists():
                os.makedirs(str(save_path))

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                            hasattr(envs, 'ob_rms') and envs.ob_rms or None]
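            # A less fragile alternative (a sketch, not part of the original script) would be
            # to save just a CPU copy of the state_dict together with the normalization stats:
            #   torch.save({'state_dict': {k: v.cpu() for k, v in actor_critic.state_dict().items()},
            #               'ob_rms': getattr(envs, 'ob_rms', None)},
            #              str(save_path / ('model_ep_%i.pt' % j)))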

            torch.save(save_model, str(save_path / ("model_ep_%i.pt" % j)))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f},"
                  "\t entropy {:.3f}, v {:.3f}, p {:.3f}, d-act {:.3f}/{:.3f}, d-state {:.3f}/{:.3f}, d-sup {:.3f}/{:.3f}/{:.3f}, emb {:.3f}".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), dist_entropy,
                       value_loss, action_loss, dual_act_loss, action_acc, dual_state_loss, state_acc, dual_sup, sup_acc, miss_rate, emb_loss))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames)
            except IOError:
                pass
Example #2
def main():
    config = None
    args = get_args()
    config, checkpoint = get_config_and_checkpoint(args)

    set_random_seeds(args, config)
    eval_log_dir = args.save_dir + "_eval"
    try:
        os.makedirs(args.save_dir)
        os.makedirs(eval_log_dir)
    except OSError:
        pass

    now = datetime.datetime.now()
    experiment_name = args.experiment_name + '_' + now.strftime("%Y-%m-%d_%H-%M-%S")

    # Create checkpoint file
    save_dir_model = os.path.join(args.save_dir, 'model', experiment_name)
    save_dir_config = os.path.join(args.save_dir, 'config', experiment_name)
    try:
        os.makedirs(save_dir_model)
        os.makedirs(save_dir_config)
    except OSError as e:
        logger.error(e)
        exit()

    if args.config:
        shutil.copy2(args.config, save_dir_config)

    curriculum = args.follow_curriculum
    if args.follow_curriculum:
        print('Using preset curriculum')

    # Tensorboard Logging
    writer = SummaryWriter(os.path.join(args.save_dir, 'tensorboard', experiment_name))

    # Logger that writes to STDOUT and a file in the save_dir
    logger = setup_carla_logger(args.save_dir, experiment_name)

    device = torch.device("cuda:0" if args.cuda else "cpu")
    norm_reward = not config.no_reward_norm
    norm_obs = not config.no_obs_norm

    assert not (config.num_virtual_goals > 0) or (config.reward_class == 'SparseReward'), 'Cannot use HER with dense reward'
    obs_converter = CarlaObservationConverter(h=84, w=84, rel_coord_system=config.rel_coord_system)
    action_converter = CarlaActionsConverter(config.action_type)
    envs = make_vec_envs(obs_converter, action_converter, args.starting_port, config.seed, config.num_processes,
                                config.gamma, device, config.reward_class, num_frame_stack=1, subset=config.experiments_subset,
                                norm_reward=norm_reward, norm_obs=norm_obs, apply_her=config.num_virtual_goals > 0,
                                video_every=args.video_interval,
                                video_dir=os.path.join(args.save_dir, 'video',
                                    experiment_name),
                                curriculum=curriculum)

    if config.agent == 'forward':
        agent = agents.ForwardCarla()

    elif config.agent == 'vpg':
        agent = agents.VPGCarla(obs_converter,
                                action_converter,
                                config.value_loss_coef,
                                config.entropy_coef,
                                lr=config.lr,
                                eps=config.eps, alpha=config.alpha,
                                gamma=config.gamma,
                                max_grad_norm=config.max_grad_norm)

    elif config.agent == 'a2c':
        agent = agents.A2CCarla(obs_converter,
                                action_converter,
                                config.value_loss_coef,
                                config.entropy_coef,
                                lr=config.lr,
                                eps=config.eps, alpha=config.alpha,
                                max_grad_norm=config.max_grad_norm)

    elif config.agent == 'acktr':
        agent = agents.A2CCarla(obs_converter,
                                action_converter,
                                config.value_loss_coef,
                                config.entropy_coef,
                                lr=config.lr,
                                eps=config.eps, alpha=config.alpha,
                                max_grad_norm=config.max_grad_norm,
                                acktr=True)

    elif config.agent == 'ppo':
        agent = agents.PPOCarla(obs_converter,
                                action_converter,
                                config.clip_param,
                                config.ppo_epoch,
                                config.num_mini_batch,
                                config.value_loss_coef,
                                config.entropy_coef,
                                lr=config.lr,
                                eps=config.eps,
                                max_grad_norm=config.max_grad_norm)

    if checkpoint is not None:
        load_modules(agent.optimizer, agent.model, checkpoint)

    rollouts = RolloutStorage(config.num_steps, config.num_processes,
                        envs.observation_space, envs.action_space, 20,
                        config.num_virtual_goals, config.rel_coord_system, obs_converter)

    obs = envs.reset()
    # Save the first observation
    obs = obs_to_dict(obs)
    rollouts.obs = obs_to_dict(rollouts.obs)
    for k in rollouts.obs:
        rollouts.obs[k][rollouts.step + 1].copy_(obs[k])
    rollouts.obs = dict_to_obs(rollouts.obs)
    rollouts.to(device)

    start = time.time()


    total_steps = 0
    total_episodes = 0
    total_reward = 0

    episode_reward = torch.zeros(config.num_processes)


    for j in range(config.num_updates):

        for step in range(config.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = agent.act(
                        rollouts.get_obs(step),
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, info = envs.step(action)

            # For logging purposes
            carla_rewards = torch.tensor([i['carla-reward'] for i in info], dtype=torch.float)
            episode_reward += carla_rewards
            total_reward += carla_rewards.sum().item()
            total_steps += config.num_processes  # one environment step across all processes

            if done.any():
                total_episodes += done.sum()
                torch_done = torch.tensor(done.astype(int)).byte()
                mean_episode_reward = episode_reward[torch_done].mean().item()
                logger.info('{} episode(s) finished with reward {}'.format(done.sum(), mean_episode_reward))
                writer.add_scalar('train/mean_ep_reward_vs_steps', mean_episode_reward, total_steps)
                writer.add_scalar('train/mean_ep_reward_vs_episodes', mean_episode_reward, total_episodes)
                episode_reward[torch_done] = 0

            # If done then clean the history of observations.
            masks = torch.FloatTensor(1-done)

            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks.unsqueeze(-1))

        if config.num_virtual_goals > 0:
            rollouts.apply_her(config.num_virtual_goals, device, beta=config.beta)
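            # apply_her (above) relabels the stored rollout with virtual hindsight goals
            # before returns are computed, which is why the sparse reward class is
            # asserted earlier.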

        with torch.no_grad():
            next_value = agent.get_value(rollouts.get_obs(-1), # Get last observation
                                         rollouts.recurrent_hidden_states[-1],
                                         rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, config.use_gae, config.gamma, config.tau)


        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "" and config.agent != 'forward':
            save_path = os.path.join(save_dir_model, str(j) + '.pth.tar')
            save_modules(agent.optimizer, agent.model, args, config, save_path)

        total_num_steps = (j + 1) * config.num_processes * config.num_steps

        if j % args.log_interval == 0:

            # Logging to the stdout/our logs
            end = time.time()
            logger.info('------------------------------------')
            logger.info('Episodes {}, Updates {}, num timesteps {}, FPS {}'\
                .format(total_episodes, j + 1, total_num_steps, total_num_steps / (end - start)))
            logger.info('------------------------------------')


            # Logging to tensorboard
            writer.add_scalar('train/cum_reward_vs_steps', total_reward, total_steps)
            writer.add_scalar('train/cum_reward_vs_updates', total_reward, j+1)

            if config.agent in ['a2c', 'acktr', 'ppo']:
                writer.add_scalar('debug/value_loss_vs_steps', value_loss, total_steps)
                writer.add_scalar('debug/value_loss_vs_updates', value_loss, j+1)
                writer.add_scalar('debug/action_loss_vs_steps', action_loss, total_steps)
                writer.add_scalar('debug/action_loss_vs_updates', action_loss, j+1)
                writer.add_scalar('debug/dist_entropy_vs_steps', dist_entropy, total_steps)
                writer.add_scalar('debug/dist_entropy_vs_updates', dist_entropy, j+1)

            # Sample the last reward
            writer.add_scalar('debug/sampled_normalized_reward_vs_steps', reward.mean(), total_steps)
            writer.add_scalar('debug/sampled_normalized_reward_vs_updates', reward.mean(), j+1)
            writer.add_scalar('debug/sampled_carla_reward_vs_steps', carla_rewards.mean(), total_steps)
            writer.add_scalar('debug/sampled_carla_reward_vs_updates', carla_rewards.mean(), j+1)

        if (args.eval_interval is not None and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(
                args.env_name, args.starting_port, obs_converter, args.x + config.num_processes, config.num_processes,
                config.gamma, eval_log_dir, config.add_timestep, device, True,
                curriculum)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(config.num_processes,
                            20, device=device)
            eval_masks = torch.zeros(config.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = agent.act(
                        obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)

                # Observe reward and next obs
                carla_obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            logger.info(" Evaluation using {} episodes: mean reward {:.5f}\n".
                format(len(eval_episode_rewards),
                       np.mean(eval_episode_rewards)))
Example #3
def main():
    torch.set_num_threads(1)
    envs = make_vec_envs(
        args_env_name,
        args_seed,
        args_num_processes)

    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space)

    agent = PPO(
        actor_critic,
        args_clip_param,
        args_ppo_epoch,
        args_num_mini_batch,
        args_value_loss_coef,
        args_entropy_coef,
        lr=args_lr,
        eps=args_eps,
        max_grad_norm=args_max_grad_norm)
    rollouts = RolloutStorage(
        args_num_steps,
        args_num_processes,
        envs.observation_space.shape)


    obs = envs.reset()
    np.copyto(rollouts.obs[0], obs)

    num_updates = int(
        args_num_env_steps) // args_num_steps // args_num_processes

    episode_rewards = deque(maxlen=10)
    start = time.time()
    sum_re = np.zeros(shape=(args_num_processes, 1))

    for j in range(num_updates):

        for step in range(args_num_steps):
            with torch.no_grad():
                value, action, action_log_prob\
                    = actor_critic.act(rollouts.obs[step])
            obs, reward, done, infos = envs.step(action)
            sum_re += reward

            if any(done):

                for i in range(len(done)):
                    if done[i]:
                        episode_rewards.append(sum_re[i].item())
                        sum_re[i] *= 0
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
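            # bad_masks distinguishes time-limit truncations ('bad_transition' in info)
            # from true terminations so the return computation can treat them differently.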
            rollouts.insert(obs, action,
                            action_log_prob,
                            value, reward,
                            masks, bad_masks)
        with torch.no_grad():

            next_value = actor_critic.get_value(
                rollouts.obs[-1])

        rollouts.compute_returns(
            next_value,
            args_gamma)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        if j % args_log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            end = time.time()
            print(
                "E {}, N_steps {}, FPS {}"
                " mean/median {:.1f}/{:.1f}, min/max {:.1f}/{:.1f} Ent {:.4f},V {:.4f},A {:.4f}"
                    .format(j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            np.mean(episode_rewards),
                            np.median(episode_rewards), np.min(episode_rewards),
                            np.max(episode_rewards),
                            dist_entropy, value_loss,
                            action_loss))
Example #4
def main():
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
                for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs, gamma=args.gamma)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
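    # Frame stacking: num_stack consecutive observations are concatenated along the channel dimension.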

    actor_critic = Policy(obs_shape, envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
        envs.action_space, actor_critic.recurrent_hidden_state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    obs = envs.reset()
    update_current_obs(obs, current_obs, obs_shape, args.num_stack)
    rollouts.obs[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    current_obs = current_obs.to(device)
    rollouts.to(device)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            masks = masks.to(device)

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs, current_obs, obs_shape, args.num_stack)
            rollouts.insert(current_obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                            hasattr(envs, 'ob_rms') and envs.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), dist_entropy,
                       value_loss, action_loss))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
Example #5
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:1" if args.cuda else "cpu")

    ##
    UID = 'exp_{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    step_log = []
    reward_log = []

    ## To be used to select the environment
    mode = 'normal'

    # encoder type
    encoder = 'sym_VAE'
    if encoder == 'symbolic':
        embedding_size = (18, )
    elif encoder == 'AE':
        embedding_size = (200, )
    elif encoder == 'VAE':
        embedding_size = (100, )
    elif encoder == 'sym_VAE':
        embedding_size = (118, )
    else:
        raise NotImplementedError('Unknown encoder type: {}'.format(encoder))

    # load pre-trained AE
    #AE = VAEU([128,128])
    #model_path = '/hdd_c/data/miniWorld/trained_models/VAE/dataset_4/VAEU.pth'
    #AE = torch.load(model_path)
    #AE.eval()

    # load pre-trained VAE
    VAE = VAER([128, 128])
    model_path = '/hdd_c/data/miniWorld/trained_models/VAE/dataset_5/VAER.pth'
    VAE = torch.load(model_path).to(device)
    VAE.eval()

    # load pre-trained detector
    Detector_model = Detector
    model_path = '/hdd_c/data/miniWorld/trained_models/Detector/dataset_5/Detector_resnet18_e14.pth'
    Detector_model = torch.load(model_path).to(device)

    # load pre-trained RNN
    RNN_model = RNN(200, 128)
    model_path = '/hdd_c/data/miniWorld/trained_models/RNN/RNN1.pth'
    RNN_model = torch.load(model_path).to(device)
    RNN_model.eval()
    """
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
    """

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    print(envs.observation_space.shape)

    #actor_critic = Policy(envs.observation_space.shape, envs.action_space,
    #    base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic = Policy(embedding_size,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    #rollouts = RolloutStorage(args.num_steps, args.num_processes,
    #                    envs.observation_space.shape, envs.action_space,
    #                    actor_critic.recurrent_hidden_state_size)
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              embedding_size, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    #print(obs.size())
    #obs = make_var(obs)
    print(obs.size())
    with torch.no_grad():
        if encoder == 'symbolic':

            z = Detector_model(obs)
            print(z.size())
            z = Detector_to_symbolic(z)
            rollouts.obs[0].copy_(z)
        elif encoder == 'AE':
            z = AE.encode(obs)
            rollouts.obs[0].copy_(z)
        elif encoder == 'VAE':
            z = VAE.encode(obs)[0]
            rollouts.obs[0].copy_(z)
        elif encoder == 'sym_VAE':
            z_vae = VAE.encode(obs)[0]
            z_sym = Detector_model(obs)
            z_sym = Detector_to_symbolic(z_sym)
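            # The 100-dim VAE latent concatenated with the 18-dim symbolic vector gives
            # the 118-dim sym_VAE embedding declared above.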
            z = torch.cat((z_vae, z_sym), dim=1)
            rollouts.obs[0].copy_(z)
        else:
            raise NotImplementedError('Unknown encoder type: {}'.format(encoder))

    #rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=100)

    start = time.time()
    for j in range(num_updates):
        #print(j)
        for step in range(args.num_steps):
            # Sample actions
            #print(step)
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            #print(action)
            with torch.no_grad():
                obs, reward, done, infos = envs.step(action)
                if encoder == 'symbolic':
                    #print(obs.size())
                    np.save(
                        '/hdd_c/data/miniWorld/training_obs_{}.npy'.format(
                            step),
                        obs.detach().cpu().numpy())
                    z = Detector_model(obs / 255.0)
                    z = Detector_to_symbolic(z)
                    #print(z)
                    np.save(
                        '/hdd_c/data/miniWorld/training_z_{}.npy'.format(step),
                        z.detach().cpu().numpy())
                elif encoder == 'AE':
                    z = AE.encode(obs)
                elif encoder == 'VAE':
                    z = VAE.encode(obs)[0]
                elif encoder == 'sym_VAE':
                    z_vae = VAE.encode(obs)[0]
                    z_sym = Detector_model(obs)
                    z_sym = Detector_to_symbolic(z_sym)
                    z = torch.cat((z_vae, z_sym), dim=1)
                else:
                    raise NotImplementedError('Unknown encoder type: {}'.format(encoder))
                #obs = make_var(obs)
            """
            for info in infos:
                if 'episode' in info.keys():
                    print(reward)
                    episode_rewards.append(info['episode']['r'])
            """

            #             # FIXME: works only for environments with sparse rewards
            #             for idx, eps_done in enumerate(done):
            #                 if eps_done:
            #                     episode_rewards.append(reward[idx])

            # FIXME: works only for environments with sparse rewards
            for idx, eps_done in enumerate(done):
                if eps_done:
                    #print('done')
                    episode_rewards.append(infos[idx]['accumulated_reward'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            #rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks)
            rollouts.insert(z, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            print('Saving model')
            print()

            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        #print(len(episode_rewards))

        step_log.append(total_num_steps)
        reward_log.append(np.mean(episode_rewards))
        step_log_np = np.asarray(step_log)
        reward_log_np = np.asarray(reward_log)
        np.savez_compressed('/hdd_c/data/miniWorld/log/{}.npz'.format(UID),
                            step=step_log_np,
                            reward=reward_log_np)

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n"
                .format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards), np.min(episode_rewards),
                    np.max(episode_rewards),
                    np.count_nonzero(np.greater(episode_rewards, 0)) /
                    len(episode_rewards)))

        if args.eval_interval is not None and len(
                episode_rewards) > 1 and j % args.eval_interval == 0:
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            if eval_envs.venv.__class__.__name__ == "VecNormalize":
                eval_envs.venv.ob_rms = envs.venv.ob_rms

                # An ugly hack to remove updates
                def _obfilt(self, obs):
                    if self.ob_rms:
                        obs = np.clip((obs - self.ob_rms.mean) /
                                      np.sqrt(self.ob_rms.var + self.epsilon),
                                      -self.clipob, self.clipob)
                        return obs
                    else:
                        return obs

                eval_envs.venv._obfilt = types.MethodType(_obfilt, envs.venv)

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))
        """
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
        """
    envs.close()
Example #6
def main():
    import matplotlib.pyplot as plt

    # You probably won't need this if you're embedding things in a tkinter plot...
    plt.ion()
    x = np.linspace(0, 6 * np.pi, 100)
    y = np.sin(x)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    import time
    line1, = ax.plot([0, 1, 2], [0, 1, 1],
                     'r-')  # Returns a tuple of line objects, thus the comma
    time.sleep(0.01)

    torch.set_num_threads(1)
    args.num_processes = 1
    # device = torch.device("cuda:0" if args.cuda else "cpu")
    device = torch.device("cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = TorchRunner(acc=0.005)
    ob_shape = envs.reset().shape
    # envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
    #                     args.gamma, args.log_dir, args.add_timestep, device, False)
    #
    actor_critic = Policy(ob_shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})

    # # try to load the previous policy
    # data = torch.load(
    #     r"C:\Users\clvco\URA_F18\pytorch-a2c-ppo-acktr\trained_models\ppo\weight_positiverev_test.pt")
    # # # print(data)
    # actor_critic.load_state_dict(data[0].state_dict())
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    obs = envs.reset()
    ob_shape = obs.shape
    rollouts = RolloutStorage(args.num_steps, args.num_processes, ob_shape,
                              envs.action_space,
                              (agent.actor_critic.base.output_size), (1),
                              actor_critic.recurrent_hidden_state_size)
    print(args.num_processes)
    print(envs.observation_space.shape)
    print(obs.shape)
    print(rollouts.obs[0].shape)
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = list()
    ep_reward = 0
    import tqdm
    start = time.time()
    print(args)
    print(int(args.num_frames) // args.num_steps // args.num_processes)
    print('NUM', num_updates)
    timestep = 0
    ep_ends = []
    for j in range(num_updates):
        if j == 0:
            print("UPDATING SYNERGY")
            actor_critic.adjust_synergy(0.0)
        for step in tqdm.tqdm(range(args.num_steps)):
            # Sample actions
            timestep += 1
            with torch.no_grad():
                value, action, synergy, q, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            ep_reward += reward[0]
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            if done[0]:
                obs = envs.reset()
                episode_rewards.append(ep_reward)
                ep_ends.append(timestep)
                ep_reward = 0
            # print(action)
            rollouts.insert(obs, recurrent_hidden_states, action, synergy, q,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model]
            print("Saving model")
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))
            print("Saved model to: ",
                  os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        print("update time", print(len(episode_rewards)))
        if True:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.5f}/{:.5f}, min/max reward {:.5f}/{:.5f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards[-10:]),
                        np.median(episode_rewards[-10:]),
                        np.min(episode_rewards[-10:]),
                        np.max(episode_rewards[-10:]), dist_entropy,
                        value_loss, action_loss))

            import time
            ydata = np.convolve(episode_rewards,
                                np.ones(10) / 10,
                                mode='valid')
            line1.set_xdata(np.arange(0, len(ydata)))
            line1.set_ydata(ydata)
            ax.set_xlim(0, len(ydata))
            ax.set_ylim(min(ydata), max(ydata))
            fig.canvas.draw()
            fig.canvas.flush_events()
            time.sleep(0.01)
            # save the returns
            xdata = np.array(ep_ends)
            ret_dir = 'returns_weight_experiments'
            os.makedirs(ret_dir, exist_ok=True)
            ret_path = ret_dir + '/' + args.env_name + '_' + str(
                args.seed) + '.npy'
            ep_path = ret_dir + '/' + "x_data-" + args.env_name + '_' + str(
                args.seed) + '.npy'
            np.save(ret_path, np.array(episode_rewards))
            np.save(ep_path, ep_ends)
Example #7
def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir,
                 args.start_container) for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    obs_numel = reduce(operator.mul, obs_shape, 1)

    if len(obs_shape) == 3 and obs_numel > 1024:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_numel, envs.action_space)

    modelSize = 0
    for p in actor_critic.parameters():
        pSize = reduce(operator.mul, p.size(), 1)
        modelSize += pSize
    print(str(actor_critic))
    print('Total model size: %d' % modelSize)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
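        # Rolling frame stack: shift the existing frames toward the front and write the
        # newest observation into the last shape_dim0 channels.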
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    reward_avg = 0

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step]),
                Variable(rollouts.states[step]),
                Variable(rollouts.masks[step]))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observation, reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)

            # Maxime: clip the reward within [0,1] for more reliable training
            # This code deals poorly with large reward values
            reward = np.clip(reward, a_min=0, a_max=None) / 400

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(Variable(rollouts.observations[-1]),
                                  Variable(rollouts.states[-1]),
                                  Variable(rollouts.masks[-1]))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False
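                # The backward pass above accumulates curvature statistics from a sampled
                # log-likelihood (policy log-probs plus a noisy value term), which KFAC uses
                # to approximate the Fisher matrix for its natural-gradient update.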

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()

        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)
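            # Normalizing advantages to zero mean / unit variance stabilizes the PPO update;
            # the 1e-5 term guards against division by zero.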

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)
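                    # ratio = pi_new / pi_old on the sampled actions; clamping it to
                    # [1 - clip_param, 1 + clip_param] and taking the elementwise min yields
                    # a lower bound that discourages destructively large policy updates.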

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            reward_avg = 0.99 * reward_avg + 0.01 * final_rewards.mean()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps

            print(
                "Updates {}, num timesteps {}, FPS {}, running avg reward {:.3f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)), reward_avg,
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
            """
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(
                    j,
                    total_num_steps,
                    int(total_num_steps / (end - start)),
                    final_rewards.mean(),
                    final_rewards.median(),
                    final_rewards.min(),
                    final_rewards.max(), dist_entropy.data[0],
                    value_loss.data[0], action_loss.data[0])
                )
            """

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
Example #8
def main():
    print("config:\n")
    print("activation:", args.activation)
    print("evaluation:", args.evaluation)
    print("evaluation mode:", args.evaluation_mode)
    print("evaluation layer:", args.evaluation_layer)
    writer = SummaryWriter()
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                        args.gamma, args.log_dir, args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy},
                          activation=args.activation, modulation=args.evaluation)
    # load trained model
    if args.load_model_path is not None:
        state_dicts = torch.load(args.load_model_path)
        actor_critic.load_nets(state_dicts)

    actor_critic.to(device)


    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    # elif args.algo == 'ppo':
    #     agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
    #                      args.value_loss_coef, args.entropy_coef, lr=args.lr,
    #                            eps=args.eps,
    #                            max_grad_norm=args.max_grad_norm)
    # elif args.algo == 'acktr':
    #     agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
    #                            args.entropy_coef, acktr=True)


    tonic_g = 1
    phasic_g = 1
    if args.evaluation and args.evaluation_layer == 1:  # f1 modulation
        tonic_g = args.f1_tonic_g
        phasic_g = args.f1_phasic_g
    if args.evaluation and args.evaluation_layer == 0:  # input activation
        tonic_g = args.input_tonic_g
        phasic_g = args.input_phasic_g

    g = torch.ones(args.num_processes,1)*tonic_g
    g_device = (torch.ones(args.num_processes,1)*tonic_g).to(device)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                        envs.observation_space.shape, envs.action_space,
                        actor_critic.recurrent_hidden_state_size, tonic_g)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    pre_value = [None for i in range(args.num_processes)]
    evaluations = [0 for i in range(args.num_processes)]
    ## to calculate next_value and update g
    next_recurrent_hidden_states = torch.zeros(args.num_processes, actor_critic.recurrent_hidden_state_size).to(device)
    next_g = torch.zeros(args.num_processes,1).to(device)
    next_masks = torch.zeros(args.num_processes,1).to(device)
    next_obs = torch.zeros(args.num_processes, *envs.observation_space.shape).to(device)

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.g[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # calculate next value with old g and decide new g
            if args.evaluation:
                if args.evaluation_layer == 0:
                    next_obs.copy_(neural_activity(obs,g_device))
                else:
                    next_obs.copy_(obs/255)
                next_recurrent_hidden_states.copy_(recurrent_hidden_states)
                next_g.copy_(g)
                next_masks.copy_(masks)
                with torch.no_grad():
                    next_value = actor_critic.get_value(next_obs,
                                                next_g,
                                                next_recurrent_hidden_states,
                                                next_masks).detach()
                evaluations, g, pre_value = calc_modes(reward, next_value, pre_value, evaluations, args.evaluation_mode, tonic_g, phasic_g, masks)
                g_device.copy_(g)

            # observation processing with new g
            if args.evaluation and args.evaluation_layer == 0:
                obs = neural_activity(obs, g_device)
            else:
                obs = obs/255.0

            for idx in range(len(infos)):
                info = infos[idx]
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    steps_done = j*args.num_steps*args.num_processes + step*args.num_processes + idx
                    writer.add_scalar('data/reward', info['episode']['r'], steps_done )
            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, g)

            # record evaluation value to help decide parameters to switch modes
            if args.evaluation_log:
                writer.add_scalar('data/evaluations', evaluations[0], j*args.num_steps*args.num_processes + step*args.num_processes)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.g[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()
        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            state_dicts = actor_critic.save_nets()
            torch.save(state_dicts, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
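
The call to rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) above bootstraps the rollout with the critic's value of the last observation. Below is a minimal sketch of the GAE variant of that computation, assuming tensors of shape [num_steps, num_processes, 1] and that masks[t] is 0.0 when the episode ends at step t; the RolloutStorage used in these snippets may index its masks slightly differently.

import torch

def compute_gae_returns(rewards, values, masks, next_value, gamma=0.99, tau=0.95):
    # append the bootstrap value so values has num_steps + 1 entries
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for t in reversed(range(rewards.size(0))):
        # TD error; the mask zeroes the bootstrap across episode boundaries
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * tau * masks[t] * gae
        returns[t] = gae + values[t]
    return returns
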
Exemple #9
0
def main():
    envs = [make_env(env_name, seed, rank, log_dir) for rank in range(num_processes)]
    envs = SubprocVecEnv(envs)
    obs_shape = envs.observation_space.shape
    obs_shape = [obs_shape[0]*num_stack, *obs_shape[1:]]
    actor_critic = CNNPolicy(obs_shape[0], envs.action_space, False)
    if cuda:
        actor_critic.cuda()
    optimizer = optim.RMSprop(actor_critic.parameters(), lr, eps=eps, alpha=alpha)

    rollouts = RolloutStorage(num_steps, num_processes, obs_shape, envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(num_processes, *obs_shape)
    
    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)
    episode_rewards = torch.zeros([num_processes,1])
    final_rewards = torch.zeros([num_processes,1])
    if cuda:
        rollouts.cuda()
        current_obs = current_obs.cuda()
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
        
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(Variable(rollouts.observations[step], volatile=True),
                                                                  Variable(rollouts.states[step], volatile=True),
                                                                  Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze().cpu().numpy()
            #print(cpu_action)

            # observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            # make sure reward is a numpy array (convert the list to an ndarray)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            if cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            # update obs and rollouts
            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks)

        # compute current update's return
        next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True),
                                  Variable(rollouts.states[-1], volatile=True),
                                  Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, False, gamma, tau)

        # in A2C the values are evaluated a second time here
        # the rollout tensors must be flattened: rollouts stores [num_steps, num_processes, x],
        # while actor_critic.evaluate_actions expects [num_steps * num_processes, x]
        values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
            Variable(rollouts.masks[:-1].view(-1, 1)),
            Variable(rollouts.actions.view(-1, action_shape)))

        # compute the loss
        values = values.view(num_steps, num_processes, 1)
        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        # update model
        optimizer.zero_grad()
        loss = value_loss * value_loss_coef + action_loss - dist_entropy * entropy_coef 
        loss.backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), max_grad_norm)
        optimizer.step()

        rollouts.after_update()
        if j % log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * num_processes * num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
            format(j, total_num_steps,
                    int(total_num_steps / (end - start)),
                    final_rewards.mean(),
                    final_rewards.median(),
                    final_rewards.min(),
                    final_rewards.max(), dist_entropy.data[0],
                    value_loss.data[0], action_loss.data[0]))
    # TODO: test save_url
    torch.save(actor_critic, save_url)
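
In this example compute_returns is called with use_gae=False, i.e. plain discounted n-step returns bootstrapped from next_value. A minimal sketch of that computation under the same mask convention (masks[t] is 0.0 when the episode ends at step t); names are illustrative.

import torch

def compute_discounted_returns(rewards, masks, next_value, gamma=0.99):
    # one extra slot holds the bootstrap value for the state after the last step
    returns = torch.zeros(rewards.size(0) + 1, *rewards.size()[1:])
    returns[-1] = next_value
    for t in reversed(range(rewards.size(0))):
        returns[t] = rewards[t] + gamma * returns[t + 1] * masks[t]
    return returns[:-1]
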
def main():

    train_log = Log(log_name+'_train_log')
    evl_log = Log(log_name+'_evaluation_log')
    torch.set_num_threads(1)
    envs = make_vec_envs(
        args_env_name,
        args_seed,
        args_num_processes)
    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space)
    agent = PPO(
        actor_critic,
        args_clip_param,
        args_ppo_epoch,
        args_num_mini_batch,
        args_value_loss_coef,
        args_entropy_coef,
        lr=args_lr,
        eps=args_eps,
        max_grad_norm=args_max_grad_norm)
    rollouts = RolloutStorage(
        args_num_steps,
        args_num_processes,
        envs.observation_space.shape,
        envs.action_space)


    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    # print(obs)
    # ss('i am over it')
    num_updates = int(
        args_num_env_steps) // args_num_steps // args_num_processes

    episode_rewards = deque(maxlen=10)
    start = time.time()
    sum_re = torch.zeros(args_num_processes, 1)

    for j in range(num_updates):

        for step in range(args_num_steps):
            with torch.no_grad():
                value, action, action_log_prob\
                    = actor_critic.act(rollouts.obs[step])

            obs, reward, done, infos = envs.step(action)
            sum_re += reward

            if any(done):

                for i in range(len(done)):
                    if done[i]:
                        episode_rewards.append(sum_re[i].item())
                        sum_re[i] *= 0
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, action,
                            action_log_prob,
                            value, reward,
                            masks, bad_masks)
        with torch.no_grad():

            next_value = actor_critic.get_value(
                rollouts.obs[-1])

        rollouts.compute_returns(
            next_value,
            args_gamma)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        if j % args_log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            end = time.time()
            logstring = "E {}, N_steps {}, FPS {} mean/median" \
                        " {:.1f}/{:.1f}, min/max {:.1f}/{:.1f}" \
                        " Entropy {:.5f},V {:.5f},Action {:.5f}".format(
                j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            np.mean(episode_rewards),
                            np.median(episode_rewards), np.min(episode_rewards),
                            np.max(episode_rewards),
                            dist_entropy, value_loss,
                            action_loss)
            # print(logstring)
            train_log.log(logstring)
        # if True:
        if (args_eval_interval is not None and len(episode_rewards) > 1
                and j % args_eval_interval == 0):
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            ob_rms = get_vec_normalize(envs).ob_rms
            ev_result = evaluate(actor_critic, ob_rms, args_env_name, args_seed,
                     args_num_processes)
            ev_log_string = 'steps:'+str(total_num_steps)+'. '+ev_result
            evl_log.log(ev_log_string)
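
The evaluate() helper called above is not shown in this snippet. The sketch below is one plausible shape for it, mirroring the bookkeeping of the training loop; it assumes the same make_vec_envs and get_vec_normalize helpers, that the evaluation wrapper exposes eval() to freeze its running statistics, and that evaluate() returns a string (its result is concatenated into ev_log_string).

import numpy as np
import torch

def evaluate(actor_critic, ob_rms, env_name, seed, num_processes, num_episodes=10):
    eval_envs = make_vec_envs(env_name, seed + num_processes, num_processes)
    vec_norm = get_vec_normalize(eval_envs)
    if vec_norm is not None:
        vec_norm.eval()           # assumed: stop updating the running mean/std
        vec_norm.ob_rms = ob_rms  # reuse the training-time observation statistics
    episode_rewards = []
    sum_re = torch.zeros(num_processes, 1)
    obs = eval_envs.reset()
    while len(episode_rewards) < num_episodes:
        with torch.no_grad():
            _, action, _ = actor_critic.act(obs)
        obs, reward, done, infos = eval_envs.step(action)
        sum_re += reward
        for i in range(len(done)):
            if done[i]:
                episode_rewards.append(sum_re[i].item())
                sum_re[i] *= 0
    eval_envs.close()
    return 'eval mean reward {:.2f} over {} episodes'.format(
        np.mean(episode_rewards), len(episode_rewards))
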
Exemple #11
0
class Runner():
    def __init__(self, **args):
        cuda = not args['no_cuda'] and torch.cuda.is_available()
        self.device = torch.device("cuda:0" if cuda else "cpu")
        print("Model running on device: {}".format(self.device))
        torch.set_num_threads(1)

        self.env_name = args['env_name']
        self.epochs = args['epochs']
        self.num_processes = args['num_processes']
        self.num_steps = args['num_steps']
        self.num_test_episodes = args['num_test_episodes']
        self.test_every_n_epochs = args['test_every_n_epochs']
        self.use_deterministic_policy_while_testing = args['use_deterministic_policy_while_testing']

        self.grayscale = args['grayscale']
        self.skip_frame = args['skip_frame']
        self.num_frame_stack = args['num_frame_stack']

        self.num_updates_per_epoch = args['num_updates_per_epoch']
        self.num_steps = args['num_steps']

        self.use_gae = args['use_gae']
        self.gamma = args['gamma']
        self.tau = args['tau']

        self.reward_scaling = args['reward_scaling']

        self.seed = args['seed']
        self.log_dir = args['log_dir']
        self.save_dir = args['save_dir']

        try:
            os.makedirs(args['log_dir'])
            files = glob.glob(os.path.join(args['log_dir'], '*.manifest.json'))
            for f in files:
                os.remove(f)
        except OSError:
            files = glob.glob(os.path.join(args['log_dir'], '*.monitor.csv'))
            for f in files:
                os.remove(f)

        self.eval_log_dir = args['log_dir'] + "_eval"

        try:
            os.makedirs(self.eval_log_dir)
        except OSError:
            files = glob.glob(os.path.join(self.eval_log_dir, '*.monitor.csv'))
            for f in files:
                os.remove(f)

        self.envs = make_vec_envs(self.env_name, self.seed, self.num_processes,
                                  self.gamma, self.log_dir, self.device, False, self.grayscale, self.skip_frame, self.reward_scaling, num_frame_stack=self.num_frame_stack)

        self.algorithm = args['algorithm']
        # Decreasing LR scheduler
        self.scheduler = None

        if self.algorithm == 'A2C':
            actor_critic = ActorCriticNetwork(self.envs.observation_space.shape, self.envs.action_space,
                                              base_kwargs=args['policy_parameters'])
            actor_critic.to(self.device)
            self.policy = actor_critic
            self.agent = A2C(actor_critic, **args['algorithm_parameters'])

        elif self.algorithm == 'PPO':
            if args['decreasing_lr']:
                def lambdalr(epoch):
                    return (float(self.epochs - epoch) / float(self.epochs)
                            * args['algorithm_parameters']['lr'])
                actor_critic = ActorCriticNetwork(self.envs.observation_space.shape, self.envs.action_space,
                                                  base_kwargs=args['policy_parameters'])
                actor_critic.to(self.device)
                self.policy = actor_critic
                self.agent = PPO(actor_critic, lambdalr,
                                 **args['algorithm_parameters'])
                self.scheduler = self.agent.scheduler
            else:
                actor_critic = ActorCriticNetwork(self.envs.observation_space.shape, self.envs.action_space,
                                                  base_kwargs=args['policy_parameters'])
                actor_critic.to(self.device)
                self.policy = actor_critic
                self.agent = PPO(actor_critic, None,
                                 **args['algorithm_parameters'])

        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.envs.observation_space.shape, self.envs.action_space,
                                       actor_critic.recurrent_hidden_state_size)
        obs = self.envs.reset()
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)
        self.episode_rewards = deque(maxlen=50)
        self.writer = SummaryWriter(
            comment="{}-{}".format(self.env_name, self.algorithm))

    def run(self):
        start = time.time()
        for epoch in range(self.epochs):
            value_losses, action_losses, dist_entropies = [], [], []
            print("\nEpoch %d\n-------" % (epoch + 1))
            for j in trange(self.num_updates_per_epoch, leave=False):
                for step in range(self.num_steps):
                    # Sample actions
                    with torch.no_grad():
                        value, action, action_log_prob, recurrent_hidden_states = self.policy.act(
                            self.rollouts.obs[step],
                            self.rollouts.recurrent_hidden_states[step],
                            self.rollouts.masks[step])

                    # Observe reward and next obs
                    obs, reward, done, infos = self.envs.step(action)
                    for info in infos:
                        if 'episode' in info.keys():
                            print("New episode")
                            self.episode_rewards.append(info['episode']['r'])

                    # If done then clean the history of observations.
                    masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                               for done_ in done])
                    self.rollouts.insert(obs, recurrent_hidden_states,
                                         action, action_log_prob, value, reward, masks)

                with torch.no_grad():
                    next_value = self.policy.get_value(self.rollouts.obs[-1],
                                                       self.rollouts.recurrent_hidden_states[-1],
                                                       self.rollouts.masks[-1]).detach()

                self.rollouts.compute_returns(
                    next_value, self.use_gae, self.gamma, self.tau)
                value_loss, action_loss, dist_entropy = self.agent.update(
                    self.rollouts)
                value_losses.append(value_loss)
                action_losses.append(action_loss)
                dist_entropies.append(dist_entropy)

                self.rollouts.after_update()

                total_num_steps = (epoch * self.num_updates_per_epoch + j + 1) * \
                    self.num_processes * self.num_steps

            end = time.time()
            print("Total timesteps: {}, FPS: {}".format(
                total_num_steps, int(total_num_steps / (end - start))))
            print("Statistic of the last %d episodes played" %
                  len(self.episode_rewards))
            if len(self.episode_rewards) < 1:
                self.episode_rewards.append(0)
            episode_rewards_np = np.array(self.episode_rewards)
            value_losses = np.array(value_losses)
            action_losses = np.array(action_losses)
            dist_entropies = np.array(dist_entropies)
            print("Mean value loss: {}, Mean action loss: {}, Mean entropy: {}".format(
                value_losses.mean(), action_losses.mean(), dist_entropies.mean()))
            print(episode_rewards_np)
            print("Results: mean: {} +/- {}".format(np.mean(episode_rewards_np), np.std(episode_rewards_np)))
            print("Min: {}, Max: {}, Median: {}".format(np.min(episode_rewards_np), np.max(episode_rewards_np), np.median(episode_rewards_np)))

            self.writer.add_scalar(
                'value_loss/mean', value_losses.mean(), epoch)
            self.writer.add_scalar(
                'action_loss/mean', action_losses.mean(), epoch)
            self.writer.add_scalar(
                'dist_entropy/mean', dist_entropies.mean(), epoch)
            self.writer.add_scalar(
                'reward/mean', episode_rewards_np.mean(), epoch)
            self.writer.add_scalar(
                'reward/max', episode_rewards_np.max(), epoch)
            self.writer.add_scalar(
                'reward/min', episode_rewards_np.min(), epoch)

            if (epoch + 1) % self.test_every_n_epochs == 0:
                print("\nTesting...")
                bar = tqdm(total=self.num_test_episodes, leave=False)
                eval_envs = make_vec_envs(self.env_name, self.seed + self.num_processes,
                                          self.num_processes, self.gamma, self.eval_log_dir,
                                          self.device,
                                          True,
                                          self.grayscale, self.skip_frame, self.reward_scaling, num_frame_stack=self.num_frame_stack)
                vec_norm = get_vec_normalize(eval_envs)
                if vec_norm is not None:
                    vec_norm.eval()
                    vec_norm.ob_rms = get_vec_normalize(self.envs).ob_rms
                eval_episode_rewards = []
                obs = eval_envs.reset()
                eval_recurrent_hidden_states = torch.zeros(self.num_processes,
                                                           self.policy.recurrent_hidden_state_size, device=self.device)
                eval_masks = torch.zeros(
                    self.num_processes, 1, device=self.device)

                while len(eval_episode_rewards) < self.num_test_episodes:
                    with torch.no_grad():
                        _, action, _, eval_recurrent_hidden_states = self.policy.act(
                            obs, eval_recurrent_hidden_states, eval_masks, deterministic=self.use_deterministic_policy_while_testing)
                    # Observe reward and next obs
                    obs, reward, done, infos = eval_envs.step(action)
                    eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                    for done_ in done])

                    for info in infos:
                        if 'episode' in info.keys():
                            bar.update(1)
                            eval_episode_rewards.append(
                                info['episode']['r'])
                eval_envs.close()
                bar.close()
                print(eval_episode_rewards)
                print(" Evaluation using {} episodes: mean reward {:.5f}, min/max {}/{}\n".
                      format(len(eval_episode_rewards),
                             np.mean(eval_episode_rewards), np.min(eval_episode_rewards), np.max(eval_episode_rewards)))

            print("Total elapsed time: %.2f minutes" %
                  ((time.time() - start) / 60.0))
            if self.scheduler is not None:
                print("Decreasing the learning rate...")
                self.scheduler.step()

            print("Saving the model...")
            save_path = os.path.join(self.save_dir, self.algorithm)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            save_model = self.policy
            if self.device == "cuda:0":
                save_model = copy.deepcopy(self.policy).cpu()
            save_model = [save_model,
                          getattr(get_vec_normalize(self.envs), 'ob_rms', None)]
            torch.save(save_model, os.path.join(
                save_path, self.env_name + ".pt"))
Exemple #12
0
class Runner(object):
    def __init__(self,
                 net,
                 env,
                 params,
                 is_cuda=True,
                 seed=42,
                 log_dir=abspath("/data/patrik")):
        super().__init__()

        # constants
        self.timestamp = strftime("%Y-%m-%d %H_%M_%S", gmtime())
        self.seed = seed
        self.is_cuda = torch.cuda.is_available() and is_cuda

        # parameters
        self.params = params
        """Logger"""
        self.logger = TemporalLogger(self.params.env_name, self.timestamp,
                                     log_dir, *["rewards", "features"])
        self.checkpointer = AgentCheckpointer(self.params.env_name,
                                              self.params.num_updates,
                                              self.timestamp)
        """Environment"""
        self.env = env

        self.storage = RolloutStorage(self.params.rollout_size,
                                      self.params.num_envs,
                                      self.env.observation_space.shape[0:-1],
                                      self.params.n_stack,
                                      is_cuda=self.is_cuda)
        """Network"""
        self.net = net

        if self.is_cuda:
            self.net = self.net.cuda()

    def train(self):
        """Environment reset"""
        obs = self.env.reset()
        self.storage.states[0].copy_(self.storage.obs2tensor(obs))

        for num_update in range(self.params.num_updates):

            final_value, entropy = self.episode_rollout()

            self.net.optimizer.zero_grad()
            """ICM prediction """
            # tensors for the curiosity-based loss
            # feature, feature_pred: fwd_loss
            # a_t_pred: inv_loss
            icm_loss = self.net.icm(
                self.params.num_envs,
                self.storage.states.view(-1, self.params.n_stack,
                                         *self.storage.frame_shape),
                self.storage.actions.view(-1))
            """Assemble loss"""
            a2c_loss, rewards = self.storage.a2c_loss(
                final_value, entropy, self.params.value_coeff,
                self.params.entropy_coeff)

            loss = a2c_loss + icm_loss

            loss.backward(retain_graph=False)

            # gradient clipping
            nn.utils.clip_grad_norm_(self.net.parameters(),
                                     self.params.max_grad_norm)
            """Log rewards & features"""
            if len(self.storage.episode_rewards) > 1:
                self.logger.log(
                    **{
                        "rewards":
                        np.array(self.storage.episode_rewards),
                        "features":
                        self.storage.features[-1].detach().cpu().numpy()
                    })

            self.net.optimizer.step()

            # the storage keeps a lot of data, which lets the graph
            # grow out of memory, so it is crucial to reset it here
            self.storage.after_update()

            if len(self.storage.episode_rewards) > 1:
                self.checkpointer.checkpoint(loss,
                                             self.storage.episode_rewards,
                                             self.net)

            if num_update % 1000 == 0:
                print("current loss: ", loss.item(), " at update #",
                      num_update)
                self.storage.print_reward_stats()
                # torch.save(self.net.state_dict(), "a2c_time_log_no_norm")

        self.env.close()

        self.logger.save(*["rewards", "features"])
        self.params.save(self.logger.data_dir, self.timestamp)

    def episode_rollout(self):
        episode_entropy = 0
        for step in range(self.params.rollout_size):
            """Interact with the environments """
            # call A2C
            a_t, log_p_a_t, entropy, value, a2c_features = self.net.a2c.get_action(
                self.storage.get_state(step))
            # accumulate episode entropy
            episode_entropy += entropy

            # interact
            obs, rewards, dones, infos = self.env.step(a_t.cpu().numpy())

            # save episode reward
            self.storage.log_episode_rewards(infos)

            self.storage.insert(step, rewards, obs, a_t, log_p_a_t, value,
                                dones, a2c_features)
            self.net.a2c.reset_recurrent_buffers(reset_indices=dones)

        # Note:
        # get the critic's estimate of the final state's value so the rollout
        # can be bootstrapped; computed under no_grad, since the final value is
        # only used as a return target and no gradient should flow through it
        with torch.no_grad():
            _, _, _, final_value, final_features = self.net.a2c.get_action(
                self.storage.get_state(step + 1))

        self.storage.features[step + 1].copy_(final_features)

        return final_value, episode_entropy
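
The icm_loss used in train() combines a forward-model loss (predict the next feature vector from the current features and the action) with an inverse-model loss (predict the action from consecutive feature vectors). The sketch below shows that pairing in a minimal form; the module names, sizes and weighting are illustrative, not the architecture of self.net.icm.

import torch
import torch.nn as nn
import torch.nn.functional as F

class MiniICM(nn.Module):
    def __init__(self, feature_dim, num_actions, hidden=256):
        super().__init__()
        self.num_actions = num_actions
        self.forward_model = nn.Sequential(
            nn.Linear(feature_dim + num_actions, hidden), nn.ReLU(),
            nn.Linear(hidden, feature_dim))
        self.inverse_model = nn.Sequential(
            nn.Linear(2 * feature_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, num_actions))

    def forward(self, phi_t, phi_tp1, actions):
        # actions: LongTensor of shape [batch] with discrete action indices
        a_onehot = F.one_hot(actions, self.num_actions).float()
        phi_pred = self.forward_model(torch.cat([phi_t, a_onehot], dim=1))
        fwd_loss = 0.5 * F.mse_loss(phi_pred, phi_tp1.detach())
        a_logits = self.inverse_model(torch.cat([phi_t, phi_tp1], dim=1))
        inv_loss = F.cross_entropy(a_logits, actions)
        return fwd_loss + inv_loss
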
Exemple #13
0
def train_a_gym_model(env, config):
    """We train gym-type RL problem using ppo given environment and configuration"""
    torch.set_num_threads(1)

    seed = config.get('seed', None)
    log_dir = config.get('log_dir', '/tmp/gym')
    log_interval = config.get('log_interval', 10)
    save_interval = config.get('save_interval', 100)
    save_dir = config.get('save_dir', 'trained_models/ppo')
    add_timestep = config.get('add_timestep', False)
    num_processes = config.get('num_processes', 4)
    gamma = config.get('gamma', 0.99)
    num_stack = config.get('num_stack', 1)
    recurrent_policy = config.get('recurrent_policy', False)
    cuda = config.get('cuda', True)
    vis = config.get('vis', True)
    vis_interval = config.get('vis_interval', 100)
    env_name = config['env_name']
    save_step = config.get('save_step', None)
    warm_model = config.get('warm_model', None)
    if save_step is not None:
        next_save_step = save_step

    # clean the log folder, if necessary
    try:
        os.makedirs(log_dir)
    except OSError:
        files = glob.glob(os.path.join(log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed(seed)

    if vis:
        from visdom import Visdom
        port = config.get('port', 8097)
        viz = Visdom(port=port)
        win = None

    envs = [
        make_env(env, seed, i, log_dir, add_timestep)
        for i in range(num_processes)
    ]

    if num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs, gamma=gamma)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])

    if warm_model is None:
        actor_critic = Policy(obs_shape, envs.action_space, recurrent_policy)
    else:
        actor_critic, ob_rms, ret_rms = torch.load(warm_model)
        envs.ob_rms = ob_rms  # also use previous existing observation rms
        envs.ret_rms = ret_rms

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if cuda:
        actor_critic.cuda()

    clip_param = config.get('clip_param', 0.2)
    ppo_epoch = config.get('ppo_epoch', 4)
    num_mini_batch = config.get('num_mini_batch', 32)
    value_loss_coef = config.get('value_loss_coef', 0.5)
    entropy_coef = config.get('entropy_coef', 0.01)
    lr = config.get('lr', 1e-3)
    eps = config.get('eps', 1e-5)
    max_grad_norm = config.get('max_grad_norm', 0.5)
    use_gae = config.get('use_gae', False)
    tau = config.get('tau', 0.95)
    num_steps = config.get('num_steps', 100)
    num_frames = config.get('num_frames', 1e6)

    num_updates = int(num_frames) // num_steps // num_processes

    agent = algo.PPO(actor_critic,
                     clip_param,
                     ppo_epoch,
                     num_mini_batch,
                     value_loss_coef,
                     entropy_coef,
                     lr=lr,
                     eps=eps,
                     max_grad_norm=max_grad_norm)

    rollouts = RolloutStorage(num_steps, num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(num_processes, *obs_shape)

    obs = envs.reset()
    update_current_obs(obs, current_obs, obs_shape, num_stack)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])
    final_rewards = torch.zeros([num_processes, 1])

    if cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    def save_the_model(num=None):
        """num is additional information"""
        # save it after training
        save_path = save_dir
        try:
            os.makedirs(save_path)
        except OSError:
            pass
        # A really ugly way to save a model to CPU
        save_model = actor_critic
        if cuda:
            save_model = copy.deepcopy(actor_critic).cpu()
        save_model = [
            save_model,
            hasattr(envs, 'ob_rms') and envs.ob_rms or None,
            hasattr(envs, 'ret_rms') and envs.ret_rms or None
        ]
        if num is None:
            save_name = '%s.pt' % env_name
        else:
            save_name = '%s_at_%d.pt' % (env_name, int(num))
        torch.save(save_model, os.path.join(save_path, save_name))

    start = time.time()
    for j in range(1, 1 + num_updates):
        for step in range(num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs, current_obs, obs_shape, num_stack)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, use_gae, gamma, tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % save_interval == 0 and save_dir != "":
            save_the_model()
            if save_step is not None:
                total_num_steps = j * num_processes * num_steps
                if total_num_steps > next_save_step:
                    save_the_model(total_num_steps)
                    next_save_step += save_step

        if j % log_interval == 0:
            end = time.time()
            total_num_steps = j * num_processes * num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(), dist_entropy,
                        value_loss, action_loss))
        if vis and j % vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, log_dir, env_name, 'ppo',
                                  num_frames)
            except IOError:
                pass
    # finally save model again
    save_the_model()
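
update_current_obs is called here but defined elsewhere; based on obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]) and the in-place helper from the earlier examples, a plausible implementation is sketched below (the signature matches the calls above, the body is an assumption).

import torch

def update_current_obs(obs, current_obs, obs_shape, num_stack):
    # shift the stacked frames left by one frame and write the newest observation
    shape_dim0 = obs_shape[0] // num_stack
    obs = torch.from_numpy(obs).float()
    if num_stack > 1:
        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
    current_obs[:, -shape_dim0:] = obs
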
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    tbwriter = SummaryWriter(log_dir=args.save_dir)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    args.obs_mean, args.obs_std = get_env_mean_std(args.env_name, args.seed)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'obs_mean': args.obs_mean,
                              'obs_std': args.obs_std
                          })

    if args.use_curiosity:
        # Works only for discrete actions currently
        fwd_model = ForwardModel(envs.action_space.n,
                                 state_size=512,
                                 hidden_size=256)
        inv_model = InverseModel(envs.action_space.n,
                                 state_size=512,
                                 hidden_size=256)
        fwd_model.to(device)
        inv_model.to(device)
    else:
        fwd_model = None
        inv_model = None

    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm,
                               acktr=False,
                               norm_adv=args.norm_adv,
                               use_curiosity=args.use_curiosity,
                               fwd_model=fwd_model,
                               inv_model=inv_model,
                               curiosity_beta=args.curiosity_beta,
                               curiosity_lambda=args.curiosity_lambda)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm,
                         use_curiosity=args.use_curiosity,
                         fwd_model=fwd_model,
                         inv_model=inv_model,
                         curiosity_beta=args.curiosity_beta,
                         curiosity_lambda=args.curiosity_lambda)
    elif args.algo == 'acktr':
        if args.use_curiosity:
            raise NotImplementedError
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps,
                              args.num_processes,
                              envs.observation_space.shape,
                              envs.action_space,
                              actor_critic.recurrent_hidden_state_size,
                              norm_rew=args.norm_rew)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states, actor_features = actor_critic.act_curiosity(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            reward = reward.to(device)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done]).to(device)

            if args.use_curiosity:
                with torch.no_grad():
                    next_actor_features = actor_critic.get_features(
                        obs, recurrent_hidden_states, masks).detach()
                # Augment reward with curiosity rewards
                action_onehot = torch.zeros(args.num_processes,
                                            envs.action_space.n,
                                            device=device)
                action_onehot.scatter_(1, action.view(-1, 1).long(), 1)
                with torch.no_grad():
                    pred_actor_features = fwd_model(actor_features,
                                                    action_onehot).detach()
                    curiosity_rewards = 0.5 * torch.mean(F.mse_loss(
                        pred_actor_features, next_actor_features,
                        reduce=False),
                                                         dim=1).view(-1, 1)
                reward = reward + args.curiosity_eta * curiosity_rewards

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if not args.use_curiosity:
            value_loss, action_loss, dist_entropy = agent.update(rollouts,
                                                                 device=device)
        else:
            value_loss, action_loss, dist_entropy, fwd_loss, inv_loss = agent.update(
                rollouts, device=device)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

            tbwriter.add_scalar('mean reward', np.mean(episode_rewards),
                                total_num_steps)
            tbwriter.add_scalar('median reward', np.median(episode_rewards),
                                total_num_steps)
            tbwriter.add_scalar('dist_entropy', dist_entropy, total_num_steps)
            tbwriter.add_scalar('value_loss', value_loss, total_num_steps)
            tbwriter.add_scalar('action_loss', action_loss, total_num_steps)

            if args.use_curiosity:
                print("fwd loss: {:.5f}, inv loss: {:.5f}".format(
                    fwd_loss, inv_loss))
                tbwriter.add_scalar('fwd_loss', fwd_loss, total_num_steps)
                tbwriter.add_scalar('inv_loss', inv_loss, total_num_steps)

        if args.eval_interval is not None and len(
                episode_rewards) > 1 and j % args.eval_interval == 0:
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            if eval_envs.venv.__class__.__name__ == "VecNormalize":
                eval_envs.venv.ob_rms = envs.venv.ob_rms

                # An ugly hack to stop the eval wrapper from updating its normalization statistics
                def _obfilt(self, obs):
                    if self.ob_rms:
                        obs = np.clip((obs - self.ob_rms.mean) /
                                      np.sqrt(self.ob_rms.var + self.epsilon),
                                      -self.clipob, self.clipob)
                        return obs
                    else:
                        return obs

                eval_envs.venv._obfilt = types.MethodType(_obfilt, envs.venv)

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
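
The curiosity bonus added in the rollout loop above (one-hot encode the action, predict the next feature vector with fwd_model, use the prediction error as intrinsic reward) can be isolated into a helper. A minimal sketch, using reduction='none' in place of the deprecated reduce=False; the function name and eta default are illustrative.

import torch
import torch.nn.functional as F

def curiosity_reward(fwd_model, actor_features, next_actor_features, action, num_actions, eta=0.01):
    # one-hot encode the discrete action for the forward model
    action_onehot = torch.zeros(action.size(0), num_actions, device=action.device)
    action_onehot.scatter_(1, action.view(-1, 1).long(), 1)
    with torch.no_grad():
        pred = fwd_model(actor_features, action_onehot)
        # per-sample prediction error, averaged over the feature dimension
        error = 0.5 * F.mse_loss(pred, next_actor_features, reduction='none').mean(dim=1)
    return eta * error.view(-1, 1)
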
Exemple #15
0
def main(args):
    env = GymEnvironment(args, gamma)
    env.env = env.env.unwrapped

    actor_critic = Policy(obs_shape,
                          env.action_size,
                          base_kwargs={'recurrent': False})
    actor_critic.to(device)

    agent = PPO(actor_critic, clip_param, ppo_epoch, num_mini_batch,
                value_loss_coef, entropy_coef, lr, eps, max_grad_norm)
    rollouts = RolloutStorage(num_steps, num_processes, obs_shape,
                              env.action_space,
                              actor_critic.recurrent_hidden_state_size)
    current_obs = torch.zeros(num_processes, *obs_shape)

    obs, _, _, _ = env.new_expt()
    obs = obs[np.newaxis, ...]

    current_obs[:, -1] = torch.from_numpy(obs)
    rollouts.obs[0].copy_(current_obs)

    current_obs = current_obs.to(device)
    rollouts.to(device)

    num_updates = math.ceil(args.max_timesteps / (num_processes * num_steps))
    n_goal_reached = 0
    n_episodes = 0
    for j in range(num_updates):
        for step in range(num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            cpu_actions = action.squeeze(1).cpu().numpy()

            (obs, reward, done), goal_reached = env.act(action)
            reward = torch.from_numpy(np.expand_dims(np.stack([reward]),
                                                     1)).float()

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in [done]])

            masks = masks.to(device)

            current_obs[:, :-1] = current_obs[:, 1:]
            if done:
                current_obs[:] = 0
            current_obs[:, -1] = torch.from_numpy(obs)
            rollouts.insert(current_obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

            if done:
                n_episodes += 1
                env.new_expt()
                if goal_reached:
                    n_goal_reached += 1

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                rollouts.masks[step]).detach()

        rollouts.compute_returns(next_value, use_gae, gamma, tau, step)
        value_loss, action_loss, dist_entropy = agent.update(rollouts, step)
        rollouts.after_update()

        if j % log_interval == 0:
            total_num_steps = (j + 1) * num_processes * num_steps

            try:
                success = float(n_goal_reached) / n_episodes
            except ZeroDivisionError:
                success = 0.
            print(
                "Timesteps: {}, Goal reached : {} / {}, Success %: {}".format(
                    total_num_steps, n_goal_reached, n_episodes, success))

    if args.lang_coeff > 0:
        av_list = np.array(env.action_vectors_list)
        for k in range(len(spearman_corr_coeff_actions)):
            sr, _ = spearmanr(env.rewards_list, av_list[:, k])
            print(k, sr)
Exemple #16
0
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    """
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
    """

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                        args.gamma, args.log_dir, args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)


    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                        envs.observation_space.shape, envs.action_space,
                        actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=100)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            """
            for info in infos:
                if 'episode' in info.keys():
                    print(reward)
                    episode_rewards.append(info['episode']['r'])
            """

            # FIXME: works only for environments with sparse rewards
            for idx, eps_done in enumerate(done):
                if eps_done:
                    episode_rewards.append(reward[idx])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            print('Saving model')
            print()

            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model, hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n".
                format(
                    j, total_num_steps,
                    int(total_num_steps / (end - start)),
                    len(episode_rewards),
                    np.mean(episode_rewards),
                    np.median(episode_rewards),
                    np.min(episode_rewards),
                    np.max(episode_rewards),
                    np.count_nonzero(episode_rewards) / len(episode_rewards)
                )
            )

        if args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0:
            eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes,
                                args.gamma, eval_log_dir, args.add_timestep, device, True)

            if eval_envs.venv.__class__.__name__ == "VecNormalize":
                eval_envs.venv.ob_rms = envs.venv.ob_rms

                # An ugly hack to remove updates
                def _obfilt(self, obs):
                    if self.ob_rms:
                        obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
                        return obs
                    else:
                        return obs

                eval_envs.venv._obfilt = types.MethodType(_obfilt, envs.venv)
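                # The patched _obfilt still normalizes observations with the (frozen) training
                # ob_rms but never calls ob_rms.update(), so evaluation does not disturb the
                # running statistics.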

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(args.num_processes,
                            actor_critic.recurrent_hidden_state_size, device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]).to(device)
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards),
                np.mean(eval_episode_rewards)
            ))

        """
class VecEnvAgent(object):
	def __init__(self, envs, args):
		self.envs = envs
		self.args = args

		obs_shape = self.envs.observation_space.shape
		self.obs_shape = (obs_shape[0] * self.args.num_stack, *obs_shape[1:])
		
		self.actor_critic = self.select_network()
		self.optimizer = self.select_optimizer()    
		if self.args.cuda:  self.actor_critic.cuda()

		self.action_shape = 1 if self.envs.action_space.__class__.__name__ == "Discrete" \
							else self.envs.action_space.shape[0]        
		
		self.current_obs = torch.zeros(self.args.num_processes, *self.obs_shape)
		obs = self.envs.reset()
		self.update_current_obs(obs)
		
		self.rollouts = RolloutStorage(self.args.num_steps, self.args.num_processes, 
			self.obs_shape, self.envs.action_space, self.actor_critic.state_size)
		self.rollouts.observations[0].copy_(self.current_obs)

		# These variables are used to compute average rewards for all processes.
		self.episode_rewards = torch.zeros([self.args.num_processes, 1])
		self.final_rewards = torch.zeros([self.args.num_processes, 1])

		if self.args.cuda:
			self.current_obs = self.current_obs.cuda()
			self.rollouts.cuda()

		if self.args.vis:
			from visdom import Visdom
			self.viz = Visdom(port=args.port)
			self.win = None 




	def select_network(self):
		if len(self.envs.observation_space.shape) == 3:
			actor_critic = CNNPolicy(self.obs_shape[0], self.envs.action_space, 
				self.args.recurrent_policy)
		else:
			assert not self.args.recurrent_policy, \
				"Recurrent policy is not implemented for the MLP controller"
			actor_critic = MLPPolicy(self.obs_shape[0], self.envs.action_space)
			#actor_critic = BPW_MLPPolicy(obs_shape[0], self.envs.action_space)     
		return actor_critic


	def select_optimizer(self):
		if self.args.algo == 'a2c' and not self.args.use_adam:
			optimizer = optim.RMSprop(self.actor_critic.parameters(), self.args.lr, 
				eps=self.args.eps, alpha=self.args.alpha)
		elif self.args.algo == 'ppo' or self.args.algo == 'a2c':
			optimizer = optim.Adam(self.actor_critic.parameters(), self.args.lr, 
				 eps=self.args.eps)
			self.meta_optimizer = Adam_Custom(self.actor_critic.parameters(), lr=self.args.lr,eps=self.args.eps)
		elif self.args.algo == 'acktr':
			optimizer = KFACOptimizer(self.actor_critic)    
		else:
			raise TypeError("Optimizer should be any one from {a2c, ppo, acktr}")   
		return optimizer    


	def update_current_obs(self, obs):
		shape_dim0 = self.envs.observation_space.shape[0]
		obs = torch.from_numpy(obs).float()
		if self.args.num_stack > 1:
			self.current_obs[:, :-shape_dim0] = self.current_obs[:, shape_dim0:]
		self.current_obs[:, -shape_dim0:] = obs


	def run(self):
		for step in range(self.args.num_steps):
			value, action, action_log_prob, states = self.actor_critic.act(
				Variable(self.rollouts.observations[step], volatile=True),
				Variable(self.rollouts.states[step], volatile=True),
				Variable(self.rollouts.masks[step], volatile=True)
				)
			cpu_actions = action.data.squeeze(1).cpu().numpy()
			#print (cpu_actions)
			#input()

			# Observe reward and next obs
			obs, reward, done, info = self.envs.step(cpu_actions)
			reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
			self.episode_rewards += reward

			# If done then clean the history of observations.
			masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
			self.final_rewards *= masks
			self.final_rewards += (1 - masks) * self.episode_rewards
			self.episode_rewards *= masks

			if self.args.cuda: masks = masks.cuda()

			if self.current_obs.dim() == 4:
				self.current_obs *= masks.unsqueeze(2).unsqueeze(2)
			else:
				self.current_obs *= masks

			self.update_current_obs(obs)
			self.rollouts.insert(step, self.current_obs, states.data, action.data, 
				action_log_prob.data, value.data, reward, masks)
	
		next_value = self.actor_critic(
						Variable(self.rollouts.observations[-1], volatile=True),
						Variable(self.rollouts.states[-1], volatile=True),
						Variable(self.rollouts.masks[-1], volatile=True)
						)[0].data

		self.rollouts.compute_returns(next_value, self.args.use_gae, self.args.gamma, self.args.tau)
		dist_entropy, value_loss, action_loss = update(self)
		self.rollouts.after_update()
		
		return dist_entropy, value_loss, action_loss

	def meta_run(self,theta_loss,theta_grad):
		for step in range(self.args.num_steps):
			value, action, action_log_prob, states = self.actor_critic.act(
				Variable(self.rollouts.observations[step], volatile=True),
				Variable(self.rollouts.states[step], volatile=True),
				Variable(self.rollouts.masks[step], volatile=True)
				)
			cpu_actions = action.data.squeeze(1).cpu().numpy()
			#print (cpu_actions)
			#input()

			# Observe reward and next obs
			obs, reward, done, info = self.envs.step(cpu_actions)
			reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
			self.episode_rewards += reward

			# If done then clean the history of observations.
			masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
			self.final_rewards *= masks
			self.final_rewards += (1 - masks) * self.episode_rewards
			self.episode_rewards *= masks

			if self.args.cuda: masks = masks.cuda()

			if self.current_obs.dim() == 4:
				self.current_obs *= masks.unsqueeze(2).unsqueeze(2)
			else:
				self.current_obs *= masks

			self.update_current_obs(obs)
			self.rollouts.insert(step, self.current_obs, states.data, action.data, 
				action_log_prob.data, value.data, reward, masks)
	
		next_value = self.actor_critic(
						Variable(self.rollouts.observations[-1], volatile=True),
						Variable(self.rollouts.states[-1], volatile=True),
						Variable(self.rollouts.masks[-1], volatile=True)
						)[0].data

		self.rollouts.compute_returns(next_value, self.args.use_gae, self.args.gamma, self.args.tau)
		dist_entropy, value_loss, action_loss = meta_update(self,theta_loss,theta_grad)
		self.rollouts.after_update()
		
		return dist_entropy, value_loss, action_loss

	# def update_net(self,dist_entropy,value_loss,action_loss):

	# 	# self.optimizer.zero_grad()
	# 	# (value_loss + action_loss - dist_entropy * 0.01).backward()
	# 	# nn.utils.clip_grad_norm(self.actor_critic.parameters(), 0.2)
	# 	# self.optimizer.step()

	# 	update_network(self,dist_entropy,value_loss,action_loss)


	def evaluate(self,j,dist_entropy,value_loss,action_loss,model_file=None):
		end = time.time()
		total_num_steps = (j + 1) * self.args.num_processes * self.args.num_steps
		print("Updates {}, num timesteps {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
			format(j, total_num_steps,
				   self.final_rewards.mean(),
				   self.final_rewards.median(),
				   self.final_rewards.min(),
				   self.final_rewards.max(), dist_entropy.data[0],
				   value_loss.data[0], action_loss.data[0]))

		try:
			# Sometimes monitor doesn't properly flush the outputs
			self.win = visdom_plot(self.viz, self.win, self.args.log_dir, 
				self.args.env_name, self.args.algo)
		except IOError:
			pass
		

	def train(self, num_updates):
		start = time.time()
		for j in range(num_updates):
			dist_entropy, value_loss, action_loss = self.run()

			if j % self.args.save_interval == 0 and self.args.save_dir != "":
				save_path = os.path.join(self.args.save_dir, self.args.algo)
				try:
					os.makedirs(save_path)
				except OSError:
					pass

				# A really ugly way to save a model to CPU
				save_model = self.actor_critic
				if self.args.cuda:
					save_model = copy.deepcopy(self.actor_critic).cpu()

				save_model = [save_model,
								hasattr(self.envs, 'ob_rms') and self.envs.ob_rms or None]

				torch.save(save_model, os.path.join(save_path, self.args.env_name + ".pt"))

			if j % self.args.log_interval == 0:
				end = time.time()
				total_num_steps = (j + 1) * self.args.num_processes * self.args.num_steps
				print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
					format(j, total_num_steps,
						   int(total_num_steps / (end - start)),
						   self.final_rewards.mean(),
						   self.final_rewards.median(),
						   self.final_rewards.min(),
						   self.final_rewards.max(), dist_entropy.data[0],
						   value_loss.data[0], action_loss.data[0]))
			if self.args.vis and j % self.args.vis_interval == 0:
				try:
					# Sometimes monitor doesn't properly flush the outputs
					self.win = visdom_plot(self.viz, self.win, self.args.log_dir, 
						self.args.env_name, self.args.algo)
				except IOError:
					pass


	def train_maml(self, num_updates):
		start = time.time()
		theta_list = []

		num_tasks = 1000
		sample_size = 10
		 
		# episode_id: episode_id%10==0)

		# env = gym.wrappers.Monitor(self.envs, self.args.save_dir, video_callable=lambda episode_id: episode_id%10==0)

		# Create the variations needed
		task_list = []
		for i in range(num_tasks):
			friction = np.random.randint(low=1, high=10, size=3).astype('float32')/10.
			friction_1 = np.random.uniform(low=0.1, high=0.8, size=3).astype('float32')
			task = {'default/geom': ['', 'friction', '{0:.1f} {1:.1f} {2:.1f}'.format(
				friction[0],
				friction[1],
				friction[2])],
				'worldbody/body/body/geom': [[['name', 'fthigh'], ['type', 'capsule']], 
											 'friction',
											 '{0:.1f} {1:.1f} {2:.1f}'.format(
											  friction_1[0],
											  friction_1[1],
											  friction_1[2])]
			}
			# task2 = {'option': ['gravity', '{0:.2f} {1:.2f} {2:.2f}'.format(0,0,gravity_z)]}
			task_list.append(task)


		for j in range(num_updates):

			sample_indexes = np.random.randint(0, num_tasks, size=sample_size)
			# Get the theta
			if j == 0:
				theta = self.get_weights()

			# Inner loop
			# First gradient
			for i, sample_index in enumerate(sample_indexes):

				# Get the task
				task = task_list[sample_index]
				env = self.envs.venv.envs[0]

				# env = gym.wrappers.Monitor(env.env, './videos2/', video_callable=lambda episode_id: episode_id%10==0)

				_tag_names = []
				_tag_identifiers = []
				_attributes = []
				_values = []

				for k in task.keys():
					v = task[k]
					_tag_names.append(k)
					_tag_identifiers.append(v[0])
					_attributes.append(v[1])
					_values.append(v[2])

				env.env.env.my_init(_tag_names, \
									_tag_identifiers,
                                    _attributes, \
                                    _values,
									None)

				# Set the model weights to theta before training
				self.set_weights(theta)

				dist_entropy, value_loss, action_loss = self.run()

				if j == 0:
					theta_list.append(self.get_weights())
				else:
					print(i)
					theta_list[i] = self.get_weights()

			# Second gradient
			theta_copy = deepcopy(theta)
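			# Meta step: for each sampled task, meta_run uses the task-adapted weights
			# theta_list[k1] together with the pre-adaptation weights theta_copy (per the
			# parameter names theta_loss / theta_grad) to update the shared initialization
			# in MAML fashion.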
			for k1, sample_index in enumerate(sample_indexes):

				# Get the task
				task = task_list[sample_index]
				env = self.envs.venv.envs[0]

				_tag_names = []
				_tag_identifiers = []
				_attributes = []
				_values = []

				for k in task.keys():
					v = task[k]
					_tag_names.append(k)
					_tag_identifiers.append(v[0])
					_attributes.append(v[1])
					_values.append(v[2])

				env.env.env.my_init(_tag_names, \
									_tag_identifiers,
                                    _attributes, \
                                    _values,
									None)


				# Get the network loss for this task for 1 episode
				# TODO: There should be no while loop
				# while self.a2c.n_episodes < 1:
				dist_entropy, value_loss, action_loss = self.meta_run(theta_list[k1],theta_copy)

				theta = self.get_weights()

				# Set the model weights to theta
				# self.set_weights(theta)

				# Update theta
				# Change the update network function
				# theta['state_dict'] = self.agent.update_net(theta['state_dict'],dist_entropy,value_loss,action_loss)

			# env = gym.wrappers.Monitor(env, './videos/', video_callable=lambda episode_id: episode_id%10==0,force=True)	


			if j % self.args.save_interval == 0 and self.args.save_dir != "":
				save_path = os.path.join(self.args.save_dir, self.args.algo)
				try:
					os.makedirs(save_path)
				except OSError:
					pass

				model_state = {'num_updates': j,
						    'state_dict': self.actor_critic.state_dict(),
						    'optimizer': self.meta_optimizer.state_dict()
							}
				model_state = [model_state,hasattr(self.envs, 'ob_rms') and self.envs.ob_rms or None]

				torch.save(model_state, os.path.join(save_path, self.args.env_name + 'update_'+ str(j) +".pt"))

				# # A really ugly way to save a model to CPU
				# save_model = self.actor_critic
				# if self.args.cuda:
				# 	save_model = copy.deepcopy(self.actor_critic).cpu()

				# save_model = [save_model,
				# 				hasattr(self.envs, 'ob_rms') and self.envs.ob_rms or None]

				# torch.save(save_model, os.path.join(save_path, self.args.env_name + ".pt"))

			if j % self.args.log_interval == 0:
				end = time.time()
				total_num_steps = (j + 1) * self.args.num_processes * self.args.num_steps
				print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
					format(j, total_num_steps,
						   int(total_num_steps / (end - start)),
						   self.final_rewards.mean(),
						   self.final_rewards.median(),
						   self.final_rewards.min(),
						   self.final_rewards.max(), dist_entropy.data[0],
						   value_loss.data[0], action_loss.data[0]))
			if self.args.vis and j % self.args.vis_interval == 0:
				try:
					# Sometimes monitor doesn't properly flush the outputs
					self.win = visdom_plot(self.viz, self.win, self.args.log_dir, 
						self.args.env_name, self.args.algo)
				except IOError:
					pass

	def get_weights(self):
		# state_dicts = {'id': id,
		# 			   'state_dict': self.actor_critic.state_dict(),
		# 			   }

		return self.actor_critic.state_dict()

	def set_weights(self, state_dicts):
		
		checkpoint = state_dicts

		self.actor_critic.load_state_dict(checkpoint)
        """
Example #18
0
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    # obs_shape[0] is the channel dimension; frame stacking multiplies it by args.num_stack
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
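        # Keep a rolling stack of the last args.num_stack observations: shift the older
        # frames toward the front of the channel axis, then write the newest frame into
        # the last shape_dim0 channels.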
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # args.num_steps is the number of environment steps collected before each update
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            # act() returns the state value, the sampled action, its log-probability and the hidden states
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
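            # masks is 0 for processes whose episode just ended and 1 otherwise: the lines
            # above freeze final_rewards at the finished episode's return and reset
            # episode_rewards, and masks is reused below to zero the stacked observations
            # at episode boundaries.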

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            # The rollout storage holds one batch of interaction sequences, each of length args.num_steps.
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))
            # values should be values of observations, states are the hidden states used in rnn module, by pwang8

            # values are estimated current state values
            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            # rollouts.returns holds the action values computed with the Bellman equation:
            # gamma * state_value(t+1) + reward(t)
            advantages = Variable(rollouts.returns[:-1]) - values
            # This is also the definition of the advantage (action_value - state_value).
            value_loss = advantages.pow(2).mean()  # values are the estimated current state values V(s_t)

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            # ACKTR does not just swap in a different optimizer; it also adds an extra (Fisher) loss term
            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                # value-head Fisher loss against a noisy copy of the value predictions
                # (unclear how this differs from simply sampling random noise)
                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False
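                # While acc_stats is True the KFAC optimizer accumulates Fisher statistics
                # from this backward pass; the flag is cleared so the main loss backward
                # below leaves those statistics untouched.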

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]  # calculating the advantage value of an action
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
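            # Normalizing advantages to zero mean and unit variance stabilizes the PPO
            # surrogate; the 1e-5 prevents division by zero for near-constant advantages.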

            # The difference between this PPO optimization and the update above is that PPO
            # updates the parameters for multiple epochs. Because of this, it samples a minibatch
            # from the rollout storage each time it computes a gradient; the sampling is purely
            # for optimization efficiency.
            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))
                    # In the first update epoch action_log_probs equals old_action_log_probs_batch
                    # (the network parameters have not changed yet), so the ratio below is 1. In later
                    # epochs the ratio drifts away from 1; old_action_log_probs_batch stays fixed
                    # throughout these update epochs.
                    # action_log_probs is the log-probability of the action the agent actually took,
                    # i.e. a single value per sample, not the log-probs of all actions. By pwang8, Dec 31, 2017
                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    # PPO's pessimistic surrogate (L^CLIP); compared to A2C, the main difference
                    # is that the action loss is computed from the clipped probability ratio.
                    action_loss = -torch.min(surr1, surr2).mean()
                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
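The comments in the example above describe the discounted Bellman returns, the advantage definition, and PPO's clipped surrogate. The following minimal, self-contained sketch (dummy numbers and illustrative variable names, not taken from the example) reproduces those computations in isolation:

import torch

# Dummy one-process rollout: per-step rewards, value estimates and a bootstrap value.
gamma = 0.99
rewards = torch.tensor([1.0, 0.0, 0.5, 2.0])
values = torch.tensor([0.9, 0.4, 0.7, 1.5])
next_value = torch.tensor(1.2)           # V(s_T), used to bootstrap the last return

# Discounted returns R_t = r_t + gamma * R_{t+1}, seeded with the bootstrap value.
returns = torch.zeros(len(rewards))
running = next_value
for t in reversed(range(len(rewards))):
    running = rewards[t] + gamma * running
    returns[t] = running

advantages = returns - values            # advantage = action value - state value

# PPO's clipped surrogate on dummy old/new action log-probabilities.
clip_param = 0.2
old_log_probs = torch.log(torch.tensor([0.20, 0.50, 0.10, 0.70]))
new_log_probs = torch.log(torch.tensor([0.25, 0.45, 0.05, 0.90]))
ratio = torch.exp(new_log_probs - old_log_probs)
surr1 = ratio * advantages
surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
action_loss = -torch.min(surr1, surr2).mean()   # pessimistic bound L^CLIP
value_loss = advantages.pow(2).mean()
print(action_loss.item(), value_loss.item())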
Example #19
0
def main():
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    # envs = [make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes)]
    # env = get_test_env("001")
    envs = [lambda: get_test_env("000") for _ in range(args.num_processes)]
    # num_states = len(env.all_possible_states())
    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = OptionCritic(num_options, obs_shape[0], envs.action_space, args.recurrent_policy)
    else:
        # assert not args.recurrent_policy, \
        #     "Recurrent policy is not implemented for the MLP controller"
        # actor_critic = MLPPolicy(obs_shape[0], envs.action_space)
        raise NotImplementedError()

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        # optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)
        raise NotImplementedError()
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps = args.eps)
    elif args.algo == 'acktr':
        # optimizer = KFACOptimizer(actor_critic)
        raise NotImplementedError()

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size, num_options)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    optionSelection = 0
    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()
    start = time.time()
    plot_index = 0  # x-axis index for the TensorBoard scalar logged below
    #print(options)
    #print(options[0])
    for j in range(num_updates):
        options = [-1] * args.num_processes
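        # options[i] == -1 marks process i as having no active option, which triggers
        # sampling a fresh option from the policy over options at the next step.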
        for step in range(args.num_steps):
            # Choose Option 
            t0 = time.time()
            selection_value, new_option, option_log_prob, states = actor_critic.get_option(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            # print(new_option)
            for i in range(args.num_processes):
                if options[i] == -1:
                    options[i] = new_option[i].data[0]
            # print("option is:")
            # print(options)
            t1 = time.time()
            # Sample actions
            value, action, action_log_prob, states = actor_critic.get_output(
                    options,
                    Variable(rollouts.observations[step], volatile=True),
                    Variable(rollouts.states[step], volatile=True),
                    Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            t2 = time.time()
            # Termination 
            term_value, termination, termination_log_prob, _ = actor_critic.get_termination(
                options,
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            termination = torch.LongTensor([termination[i].data[0] for i in range(termination.shape[0])])
            t3 = time.time()
            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # newIndex = obs_to_int(obs)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks


            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks
            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks, options, termination)
            
            for i in range(termination.shape[0]):
                if termination[i] == 1:
                    options[i] = -1
            t4 = time.time()
            #print("part1")
            #print(t1 - t0)
            #print("part2")
            #print(t2-t1)
            #print("part3")
            #print(t3-t2)
            #print("part4")
            #print(t4-t3)
        for i in range(args.num_processes):
            if options[i] == -1:
                selection_value, new_option, option_log_prob, states = actor_critic.get_option(
                    Variable(rollouts.observations[step], volatile=True),
                    Variable(rollouts.states[step], volatile=True),
                    Variable(rollouts.masks[step], volatile=True))
                # print(new_option)
                options[i] = new_option[i].data[0]
        rollouts.options[step+1].copy_(torch.LongTensor(options))
        next_value = actor_critic.get_output(options,Variable(rollouts.observations[-1], volatile=True),
                                  Variable(rollouts.states[-1], volatile=True),
                                  Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            raise NotImplementedError()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            for e in range(args.ppo_epoch):
                for i in range(args.num_steps):
                    # Get the ith step during exploration
                    options = rollouts.options[i]
                    #print(options)
                    adv_targ = Variable(advantages[i])
                    old_action_log_probs = rollouts.action_log_probs[i]
                    termination = rollouts.optionSelection[i]
                    #print(termination)
                    # Use critic value of option nn to update option parameters
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_option(
                        Variable(rollouts.observations[i]),
                        Variable(rollouts.states[i]),
                        Variable(rollouts.masks[i]),
                        Variable(rollouts.actions[i]), options)
                    #print(action_log_probs)
                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)
                    value_loss = (Variable(rollouts.returns[i]) - values).pow(2).mean()

                    selection_log_prob = actor_critic.evaluate_selection(
                        Variable(rollouts.observations[i]),
                        Variable(rollouts.states[i]),
                        Variable(rollouts.masks[i]),
                        Variable(termination),
                        Variable(rollouts.options[i].type(torch.cuda.LongTensor)))
                    V_Omega = selection_log_prob * values 
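                    # V_Omega weights the critic values by the log-probability of the selected
                    # option; it stands in for the value of the policy over options and appears
                    # again in the termination objective below.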

                    # Update termination parameters 
                    termination_log_prob = actor_critic.evaluate_termination(
                        Variable(rollouts.observations[i]),
                        Variable(rollouts.states[i]),
                        Variable(rollouts.masks[i]),
                        Variable(termination.type(torch.cuda.LongTensor)),
                        rollouts.options[i+1])
                    left_values = []
                    right_values = []
                    # use a separate index so the step-loop variable i is not shadowed
                    for p in range(args.num_processes):
                        if int(termination[p]) == 1:
                            left_values.append(V_Omega[p])
                            right_values.append(values[p])
                        elif int(termination[p]) == 0:
                            left_values.append(values[p])
                            right_values.append(V_Omega[p])
                    left_values = torch.cat(left_values)
                    right_values = torch.cat(right_values)
                    termination_loss = (- torch.exp(termination_log_prob) * left_values - (1 - torch.exp(termination_log_prob)) * right_values).mean()
                    optimizer.zero_grad()
                    (action_loss + value_loss + termination_loss - V_Omega.mean()).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
                    optimizer.step()  # apply the clipped gradients

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                            hasattr(envs, 'ob_rms') and envs.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), dist_entropy.data[0],
                       value_loss.data[0], action_loss.data[0]))
            writer.add_scalar("final_reward_max", final_rewards.max(), plot_index)
            plot_index += 1
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                print("hit")
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
def main():
    '''
    Train PPO policies on each of the training environments.
    '''
    args = get_args()

    try:
        os.makedirs(args.log_dir)
    except OSError:
        pass

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args, device)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': False})
    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    ep_reward = np.zeros(args.num_processes)
    episode_rewards = deque(maxlen=100)
    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):
        # decrease learning rate linearly
        utils.update_linear_schedule(agent.optimizer, j, num_updates, args.lr)
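        # i.e. the schedule presumably sets lr_j = args.lr * (1 - j / num_updates) on every
        # parameter group (the exact behaviour depends on utils.update_linear_schedule).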

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obs reward and next obs
            obs, reward, done, infos = envs.step(action)
            if 'spaceship' in args.env_name:  # spaceship, swimmer
                for i in range(len(done)):
                    if done[i]:
                        episode_rewards.append(reward[i].item())
            # elif 'swimmer' in args.env_name:
            else:
                for i in range(len(done)):
                    ep_reward[i] += reward[i].numpy().item()
                    if done[i]:
                        episode_rewards.append(ep_reward[i])
                        ep_reward[i] = 0
            # if 'ant' in args.env_name:
            #     for info in infos:
            #         if 'episode' in info.keys():
            #             episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, True, args.gamma, args.gae_lambda,
                                 True)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            try:
                os.makedirs(args.save_dir)
            except OSError:
                pass


            torch.save(
                actor_critic.state_dict(),
                os.path.join(args.save_dir, "ppo.{}.env{}.seed{}.pt"\
                    .format(args.env_name, args.default_ind, args.seed))
            )

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("\nUpdates {}, num timesteps {}, Last {} training episodes: \
                \n mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}"
                  .format(j, total_num_steps, len(episode_rewards),
                          np.mean(episode_rewards), np.median(episode_rewards),
                          np.min(episode_rewards), np.max(episode_rewards)))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, device)

    envs.close()
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    torch.set_num_threads(1)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    if args.num_processes > 1:
        if args.retro_contest == True:
            import json
            sonic_env_confs = json.load(open(args.sonic_config_file, 'r'))
            sonic_env_confs = sonic_env_confs['Train']
            sonic_env_confs = [v for _, v in sonic_env_confs.items()]
            envs = SubprocVecSonicEnv(sonic_env_confs, args.num_processes)
        else:
            envs = [
                make_env(args.env_name, args.seed, i, args.log_dir,
                         args.add_timestep) for i in range(args.num_processes)
            ]
            envs = SubprocVecEnv(envs)
    else:
        envs = [
            make_env(args.env_name, args.seed, i, args.log_dir,
                     args.add_timestep) for i in range(args.num_processes)
        ]
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    prev_saved_rew_median = float('-inf')
    actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy)
    if args.load_model:
        model_path = os.path.join(args.save_dir, args.algo,
                                  args.env_name) + ".pt"
        actor_critic, ob_rms, prev_saved_rew_median = torch.load(model_path)
        print("Loaded actor_critic model from:", model_path,
              "which got a median score of:", prev_saved_rew_median)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    prev_reward = 0.0
    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if (j % args.save_interval == 0
                and final_rewards.median() > prev_saved_rew_median
                and args.save_dir != ""):
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None,
                final_rewards.median()
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))
            prev_saved_rew_median = final_rewards.median()
            # Save a separate copy just in case the main saved model ends up being worse.
            # Helps to have a few saved models to choose from at test/runtime
            torch.save(
                save_model,
                os.path.join(
                    save_path,
                    args.env_name + str(final_rewards.median()) + '.pt'))
            print("Saved the state which got a median reward of",
                  prev_saved_rew_median)

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(), dist_entropy,
                        value_loss, action_loss))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
Example #22
0
def main(args):
    env = GymEnvironment(args, gamma)
    env.env = env.env.unwrapped

    actor_critic = Policy(obs_shape,
                          env.action_size,
                          base_kwargs={'recurrent': False})
    actor_critic.to(device)

    agent = PPO(actor_critic, clip_param, ppo_epoch, num_mini_batch,
                value_loss_coef, entropy_coef, lr, eps, max_grad_norm)
    rollouts = RolloutStorage(num_steps, num_processes, obs_shape,
                              env.action_space,
                              actor_critic.recurrent_hidden_state_size)
    current_obs = torch.zeros(num_processes, *obs_shape)

    obs, _, _, _ = env.new_expt()
    obs = obs[np.newaxis, ...]

    current_obs[:, -1] = torch.from_numpy(obs)
    rollouts.obs[0].copy_(current_obs)

    current_obs = current_obs.to(device)
    rollouts.to(device)

    num_updates = math.ceil(args.max_timesteps / (num_processes * num_steps))
    n_goal_reached = 0
    n_episodes = 0
    for j in tqdm(range(num_updates), ascii=True):
        for step in range(num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            cpu_actions = action.squeeze(1).cpu().numpy()

            (obs, reward, done), goal_reached = env.act(action)
            reward = torch.from_numpy(np.expand_dims(np.stack([reward]),
                                                     1)).float()

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in [done]])

            masks = masks.to(device)

            current_obs[:, :-1] = current_obs[:, 1:]
            if done:
                current_obs[:] = 0
            current_obs[:, -1] = torch.from_numpy(obs)
            rollouts.insert(current_obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

            if done:
                n_episodes += 1
                env.new_expt()
                if goal_reached:
                    n_goal_reached += 1

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                rollouts.masks[step]).detach()

        rollouts.compute_returns(next_value, use_gae, gamma, tau, step)
        value_loss, action_loss, dist_entropy = agent.update(rollouts, step)
        rollouts.after_update()

        torch.save(agent.actor_critic.state_dict(), 'log/model.pt')
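
Exemple #22 keeps its own frame stack: each step shifts current_obs one slot and the stack is zeroed when an episode ends. A minimal sketch of that pattern in isolation follows; the 4-frame stack and 84x84 frame shape are arbitrary choices for illustration.

import numpy as np
import torch

num_stack, frame_shape = 4, (84, 84)
current_obs = torch.zeros(1, num_stack, *frame_shape)

def push_frame(frame, done):
    # Shift older frames one slot towards the front, dropping the oldest.
    current_obs[:, :-1] = current_obs[:, 1:]
    if done:
        # Start the next episode from an empty stack.
        current_obs[:] = 0
    current_obs[:, -1] = torch.from_numpy(frame)

push_frame(np.zeros(frame_shape, dtype=np.float32), done=False)
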
Exemple #23
0
def main():
    device = 'cpu'
    acc_steps = []
    acc_scores = []
    torch.set_num_threads(1)
    print('here')

    if args.env_name == 'Reacher-v2':
        rbf1 = build_features_reacher2(.2, 5, 2)
        len_rbf = rbf1._K
        len_features = len_rbf + 1
    if args.env_name == 'Hopper-v2':
        len_features = 3
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space)

    actor_critic.to(device)

    agent = PPO(actor_critic,
                args.clip_param,
                args.ppo_epoch,
                args.num_mini_batch,
                args.value_loss_coef,
                args.entropy_coef,
                lr=args.lr,
                eps=args.eps,
                max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              len_features)
    print('here2')
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    episode_rewards = collections.deque(maxlen=10)
    num_updates = 20
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            update_linear_schedule(agent.optimizer, j, num_updates, args.lr)
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        # Prepare demos
        demo_actions = np.zeros(
            (1, args.num_processes, envs.action_space.shape[0]))
        demo_states = np.zeros(
            (1, args.num_processes, envs.observation_space.shape[0]))

        demo_features = np.zeros((1, args.num_processes, len_features))
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(
                    rollouts.obs[step], rollouts.masks[step])

            # obs, reward and next obs
            demo_actions = np.concatenate(
                [demo_actions,
                 action.reshape(1, args.num_processes, -1)], 0)
            demo_states = np.concatenate([
                demo_states, rollouts.obs[step].reshape(
                    1, args.num_processes, -1)
            ], 0)
            feat_rewards = np.zeros((args.num_processes, len_features))
            if args.env_name == 'Hopper-v2':
                if args.num_processes > 1:
                    pos_before = envs.get_sim_data()
            obs, reward, done, infos = envs.step(action)
            if args.env_name == 'Hopper-v2':
                if args.num_processes > 1:
                    pos_after = envs.get_sim_data()
                    for num_p in range(args.num_processes):
                        feat_1 = pos_after[num_p] - pos_before[num_p]
                        feat_2 = 0
                        if not done[num_p]:
                            feat_2 = 1
                        # feat_2 = np.array([1 for _ in range(args.num_processes)])
                        feat_3 = np.array(
                            [np.linalg.norm(action[num_p],
                                            ord=2)**2]).flatten()
                        feat_rewards[num_p] = np.array(
                            [feat_1, feat_2, feat_3])
            if args.env_name == 'Reacher-v2':
                if args.num_processes > 1:
                    body_data = envs.get_body_data()
                    for num_p in range(args.num_processes):
                        rbf1_ = rbf1(body_data[num_p][:-1])
                        rbf4_ = np.array(
                            [np.linalg.norm(action[num_p], ord=2)**2])
                        feat_rewards[num_p] = np.concatenate(
                            (rbf1_.reshape(-1), rbf4_))
                else:
                    rbf1_ = rbf1(
                        (envs.envs[0].env.env.get_body_com("fingertip") -
                         envs.envs[0].env.env.get_body_com("target"))[:-1])
                    rbf4_ = np.array([-np.square(action[0]).sum()])
                    feat_rewards[0] = np.concatenate(
                        (rbf1_.reshape(-1), rbf4_))
            demo_features = np.concatenate([
                demo_features,
                feat_rewards.reshape(1, args.num_processes, -1)
            ], 0)
            if step > 1 and step % 1000 == 0:
                done = [True for _ in range(args.num_processes)]

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, action, action_log_prob, \
                            value, reward, masks, feat_rewards)

        # Save demos:
        action_file_name = demos_expe_dir + '/actions_step_' + str(j) + '.npy'
        state_file_name = demos_expe_dir + '/states_step_' + str(j) + '.npy'
        rew_feat_file_name = demos_expe_dir + '/rew_feat_step_' + str(
            j) + '.npy'
        policy_file_name = demos_expe_dir + '/policy_step_' + str(j) + '.pth'
        np.save(action_file_name, demo_actions)
        np.save(state_file_name, demo_states)
        np.save(rew_feat_file_name, demo_features)
        torch.save(actor_critic.state_dict(), policy_file_name)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir:
            save_path = os.path.join(args.save_dir, 'ppo')
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + '.pt'))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            print('Updates', j, 'num timesteps', total_num_steps,
                  '\n Last training episodes: mean/median reward',
                  '{:.1f}'.format(np.mean(episode_rewards)),
                  '/{:.1f}'.format(np.median(episode_rewards)),
                  'min/max reward', '{:.1f}'.format(np.min(episode_rewards)),
                  '/{:.1f}'.format(np.max(episode_rewards)), 'dist entropy',
                  dist_entropy, 'value loss', value_loss, 'action loss',
                  action_loss)

        if len(episode_rewards) > 1:
            acc_steps.append(total_num_steps)
            acc_scores.append(np.mean(episode_rewards))
            #print(acc_scores)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _ = actor_critic.act(obs,
                                                    eval_masks,
                                                    deterministic=True)

                # Observe the reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print('Evaluation using', len(eval_episode_rewards),
                  'episodes: mean reward',
                  '{:.5f}\n'.format(np.mean(eval_episode_rewards)))

    scores_file_name = args.scores_dir + '/learner_scores_' + args.env_name + '_' + args.expe + '.npy'
    steps_file_name = args.scores_dir + '/learner_steps_' + args.env_name + '_' + args.expe + '.npy'
    np.save(scores_file_name, np.array(acc_scores))
    np.save(steps_file_name, np.array(acc_steps))
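
The use_linear_lr_decay branch above anneals both the optimizer learning rate (via update_linear_schedule) and the PPO clip parameter linearly over the remaining updates. The sketch below is consistent with that call, assuming only the standard torch.optim param_groups API.

def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    # Decay the learning rate linearly from initial_lr down to 0 over training.
    lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
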
Exemple #24
0
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    utils.cleanup_log_dir(log_dir)

    with open(log_dir + 'extras.csv', "w") as file:
        file.write("n, value_loss\n")

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    model = Policy(envs.observation_space.shape,
                   envs.action_space.n,
                   extra_kwargs={'use_backpack': args.algo == 'tdprop'})
    model.to(device)

    if args.algo == 'tdprop':
        from algo.sarsa_tdprop import SARSA
        agent = SARSA(model,
                      lr=args.lr,
                      eps=args.eps,
                      max_grad_norm=args.max_grad_norm,
                      beta_1=args.beta_1,
                      beta_2=args.beta_2,
                      n=args.num_steps,
                      num_processes=args.num_processes,
                      gamma=args.gamma)
    else:
        from algo.sarsa import SARSA
        agent = SARSA(model,
                      lr=args.lr,
                      eps=args.eps,
                      max_grad_norm=args.max_grad_norm,
                      beta_1=args.beta_1,
                      beta_2=args.beta_2,
                      algo=args.algo)

    explore_policy = utils.eps_greedy
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              model.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                qs = model(rollouts.obs[step])
                _, dist = explore_policy(qs, args.exploration)
                actions = dist.sample().unsqueeze(-1)
                value = qs.gather(-1, actions)

            # Observe the reward and next obs
            obs, reward, done, infos = envs.step(actions)
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, torch.FloatTensor([0.0]), actions, value,
                            value, reward, masks, bad_masks)
        with torch.no_grad():
            next_qs = model(rollouts.obs[-1])
            next_probs, _ = explore_policy(next_qs, args.exploration)
            next_value = (next_probs * next_qs).sum(-1).unsqueeze(-1)

        rollouts.compute_returns(next_value, args.gamma)

        value_loss = agent.update(rollouts, explore_policy, args.exploration)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0 or j == num_updates - 1):
            save_path = os.path.join(args.log_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            torch.save([
                list(model.parameters()),
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                    ("Updates {}, num timesteps {}, FPS {}\n" + \
                            "Last {} training episodes: mean/median reward {:.1f}/{:.1f}" + \
                            ", min/max reward {:.1f}/{:.1f}\n" + \
                            "entropy {:.2f}, value loss {:.4f}")
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist.entropy().mean().item(), value_loss))
            with open(log_dir + 'extras.csv', "a") as file:
                file.write(
                    str(total_num_steps) + ", " + str(value_loss) + "\n")
Exemple #25
0
def main():

    is_limit_action = True
    # is_limit_action = False
    args_cuda = True
    # args_cuda = False

    torch.manual_seed(args_seed)
    torch.cuda.manual_seed_all(args_seed)

    device = torch.device("cuda:0" if args_cuda else "cpu")

    train_log = Log(log_name + '_train_log')
    evl_log = Log(log_name + '_evaluation_log')
    torch.set_num_threads(1)
    envs = make_vec_envs(args_env_name,
                         args_seed,
                         args_num_processes,
                         device,
                         gamma=args_gamma)
    if is_limit_action:
        envs.action_space.n = 3
    print('Number of Actions:', envs.action_space.n)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args_recurrent_policy})
    actor_critic.to(device)
    # print(actor_critic.is_recurrent)
    # print(actor_critic.gru)
    # ss('hi')

    agent = PPO(actor_critic,
                args_clip_param,
                args_ppo_epoch,
                args_num_mini_batch,
                args_value_loss_coef,
                args_entropy_coef,
                lr=args_lr,
                eps=args_eps,
                max_grad_norm=args_max_grad_norm,
                use_clipped_value_loss=args_use_clipped_value_loss)

    rollouts = RolloutStorage(args_num_steps, args_num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    # print(obs)
    # ss('i am over it')
    num_updates = int(
        args_num_env_steps) // args_num_steps // args_num_processes

    episode_rewards = deque(maxlen=10)
    start = time.time()
    sum_re = torch.zeros(args_num_processes, 1)

    for j in range(num_updates):

        for step in range(args_num_steps):

            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            # ss('dissecting actor critic. act')
            # print(action)
            # print()
            # action = action + 1
            # print(action)
            # ss('hoiohasdfhioas')
            if is_limit_action:
                obs, reward, done, infos = envs.step(action + 1)
            else:
                obs, reward, done, infos = envs.step(action)
            sum_re += reward

            if any(done):

                for i in range(len(done)):
                    if done[i]:
                        episode_rewards.append(sum_re[i].item())
                        # print(done)
                        # print(sum_re[i])
                        sum_re[i] *= 0
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)
        with torch.no_grad():

            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args_gamma, args_use_gae,
                                 args_gae_lambda)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        if j % args_log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            end = time.time()
            logstring = "E {}, N_steps {}, FPS {} mean/median" \
                        " {:.1f}/{:.1f}, min/max {:.1f}/{:.1f}" \
                        " Entropy {:.5f},V {:.5f},Action {:.5f}".format(
                j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            np.mean(episode_rewards),
                            np.median(episode_rewards), np.min(episode_rewards),
                            np.max(episode_rewards),
                            dist_entropy, value_loss,
                            action_loss)
            # print(logstring)
            train_log.log(logstring)
        # if True:
        if (args_eval_interval is not None and len(episode_rewards) > 1
                and j % args_eval_interval == 0):
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            ob_rms = get_vec_normalize(envs).ob_rms
            ev_result = evaluate(actor_critic,
                                 ob_rms,
                                 args_env_name,
                                 args_seed,
                                 args_num_processes,
                                 device,
                                 is_limit_action=is_limit_action)
            ev_log_string = 'steps:' + str(total_num_steps) + '. ' + ev_result
            evl_log.log(ev_log_string)
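
This example shrinks the Discrete action space to 3 and offsets sampled actions by +1 before calling envs.step. A hypothetical gym wrapper expressing the same idea is sketched below; it is not taken from the snippet above, and the offset/size defaults are assumptions.

import gym

class ActionOffsetWrapper(gym.ActionWrapper):
    # Expose a reduced Discrete(n) space and shift actions by a fixed offset
    # before stepping, like the envs.step(action + 1) call used when
    # is_limit_action is True.
    def __init__(self, env, offset=1, n=3):
        super().__init__(env)
        self.offset = offset
        self.action_space = gym.spaces.Discrete(n)

    def action(self, act):
        return act + self.offset
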
Exemple #26
0
                surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                    1.0 + args.clip_param) * adv_targ
                action_loss = -torch.min(
                    surr1,
                    surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                value_loss = (Variable(return_batch) - values).pow(2).mean()

                optimizer.zero_grad()
                (value_loss + action_loss -
                 dist_entropy.mean() * args.entropy_coef).backward()
                nn.utils.clip_grad_norm(agent.parameters(), args.max_grad_norm)
                optimizer.step()
                ppo_update += 1

            if ppo_update // args.ppo_epoch % 5 == 0:
                writer.add_scalar('value_loss',
                                  value_loss.data.cpu().numpy(), ppo_update)
                writer.add_scalar('action_loss',
                                  action_loss.data.cpu().numpy(), ppo_update)
                writer.add_scalar('entropy_loss',
                                  dist_entropy.mean().data.cpu().numpy(),
                                  ppo_update)

                # Save model
                torch.save(
                    agent.model.state_dict(),
                    "saved_weights/saved_model_ppo_epoch_" + str(ppo_update))

    rollouts.after_update()
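
The fragment above computes PPO's clipped (pessimistic) surrogate objective. A minimal self-contained sketch of that loss term follows; clip_param=0.2 is only a common default, not a value taken from the snippet.

import torch

def ppo_clip_loss(new_log_probs, old_log_probs, advantages, clip_param=0.2):
    # Probability ratio between the new and old policy.
    ratio = torch.exp(new_log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    # Pessimistic surrogate (L^CLIP), matching the surr1/surr2 lines above.
    return -torch.min(surr1, surr2).mean()
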
Exemple #27
0
def main():
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monit`or (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'


    logger = Logger(algorithm_name = args.algo, environment_name = args.env_name, folder = args.folder)
    logger.save_args(args)

    print ("---------------------------------------")
    print ('Saving to', logger.save_folder)
    print ("---------------------------------------")    


    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [make_env(args.env_name, args.seed, i, args.log_dir)
                for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(Variable(rollouts.observations[step], volatile=True),
                                                                      Variable(rollouts.states[step], volatile=True),
                                                                      Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe the reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True),
                                  Variable(rollouts.states[-1], volatile=True),
                                  Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)


        advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

        for e in range(args.ppo_epoch):
            data_generator = rollouts.feed_forward_generator(advantages, args.num_mini_batch)

            for sample in data_generator:
                observations_batch, states_batch, actions_batch, \
                   return_batch, masks_batch, old_action_log_probs_batch, \
                        adv_targ = sample

                # Reshape to do in a single forward pass for all steps
                values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(observations_batch),
                                                                                               Variable(states_batch),
                                                                                               Variable(masks_batch),
                                                                                               Variable(actions_batch))

                adv_targ = Variable(adv_targ)
                ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch))
                surr1 = ratio * adv_targ
                surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)

                value_loss = (Variable(return_batch) - values).pow(2).mean()

                optimizer.zero_grad()
                (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
                optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                            hasattr(envs, 'ob_rms') and envs.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), dist_entropy.data[0],
                       value_loss.data[0], action_loss.data[0]))

            final_rewards_mean = [final_rewards.mean()]
            final_rewards_median = [final_rewards.median()]
            final_rewards_min = [final_rewards.min()]
            final_rewards_max = [final_rewards.max()]

            logger.record_data(final_rewards_mean, final_rewards_median, final_rewards_min, final_rewards_max)
            logger.save()


        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo)
            except IOError:
                pass
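
Like several of these examples, Exemple #27 tracks returns with the episode_rewards / final_rewards / masks bookkeeping: a running sum per process is frozen into final_rewards whenever that process finishes an episode. A standalone sketch of that bookkeeping is shown below; num_processes=4 is arbitrary.

import torch

num_processes = 4
episode_rewards = torch.zeros(num_processes, 1)
final_rewards = torch.zeros(num_processes, 1)

def track_rewards(reward, done):
    # reward: (num_processes, 1) float tensor; done: iterable of bools.
    masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done])
    episode_rewards.add_(reward)
    # Freeze the completed episode's return and reset the running sum for
    # environments that just finished.
    final_rewards.mul_(masks)
    final_rewards.add_((1 - masks) * episode_rewards)
    episode_rewards.mul_(masks)
    return masks
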
Exemple #28
0
def main():
    """
    Main program.
    :return:
    """
    num_cls = args.wave_num * args.k + 1  # all route/wavelength combinations, plus the option of selecting nothing
    action_shape = 1  # dimensionality of an action; defaults to 1
    num_updates = int(
        args.steps) // args.workers // args.num_steps  # total number of gradient updates
    if args.append_route.startswith("True"):
        channel_num = args.wave_num + args.k
    else:
        channel_num = args.wave_num

    # Parse the weight argument
    if args.weight.startswith('None'):
        weight = None
    else:
        weight = args.weight
    # Create the actor_critic
    if args.mode.startswith('alg'):
        # ksp(args, weight)
        return
    elif args.mode.startswith('learning'):
        # In CNN learning mode, the obs shape should be CHW
        obs_shape = (channel_num, args.img_height, args.img_width)
        if args.cnn.startswith('mobilenetv2'):
            actor_critic = MobileNetV2(in_channels=channel_num,
                                       num_classes=num_cls,
                                       t=6)
        elif args.cnn.startswith('simplenet'):
            actor_critic = SimpleNet(in_channels=channel_num,
                                     num_classes=num_cls)
        elif args.cnn.startswith('simplestnet'):
            actor_critic = SimplestNet(in_channels=channel_num,
                                       num_classes=num_cls)
        elif args.cnn.startswith('alexnet'):
            actor_critic = AlexNet(in_channels=channel_num,
                                   num_classes=num_cls)
        elif args.cnn.startswith('squeezenet'):
            actor_critic = SqueezeNet(in_channels=channel_num,
                                      num_classes=num_cls,
                                      version=1.0)
        elif args.cnn.startswith('expandsimplenet'):
            actor_critic = ExpandSimpleNet(in_channels=channel_num,
                                           num_classes=num_cls,
                                           expand_factor=args.expand_factor)
        elif args.cnn.startswith('deepersimplenet'):
            actor_critic = DeeperSimpleNet(in_channels=channel_num,
                                           num_classes=num_cls,
                                           expand_factor=args.expand_factor)
        else:
            raise NotImplementedError

        # Create the optimizer
        if args.algo.startswith("a2c"):
            optimizer = optim.RMSprop(actor_critic.parameters(),
                                      lr=args.base_lr,
                                      eps=args.epsilon,
                                      alpha=args.alpha)
        elif args.algo.startswith("ppo"):
            optimizer = optim.Adam(actor_critic.parameters(),
                                   lr=args.base_lr,
                                   eps=args.epsilon)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError

    if args.cuda.startswith("True"):
        # Run computation on CUDA
        actor_critic.cuda()
        # actor_critic = DistModule(actor_critic)

    # Check whether we are in evaluation mode
    if args.evaluate:
        print("evaluate mode")
        models = {}
        times = 1
        prefix = "trained_models"
        directory = os.path.join(prefix, 'a2c', args.cnn, args.step_over)
        env = RwaGame(net_config=args.net,
                      wave_num=args.wave_num,
                      rou=args.rou,
                      miu=args.miu,
                      max_iter=args.max_iter,
                      k=args.k,
                      mode=args.mode,
                      img_width=args.img_width,
                      img_height=args.img_height,
                      weight=weight,
                      step_over=args.step_over)

        for model_file in reversed(
                sorted(os.listdir(directory),
                       key=lambda item: int(item.split('.')[0]))):
            model_file = os.path.join(directory, model_file)
            print("evaluate model {}".format(model_file))
            params = torch.load(model_file)
            actor_critic.load_state_dict(params['state_dict'])
            actor_critic.eval()

            models[params['update_i']] = {}

            print("model loading is finished")
            for t in range(times):
                total_reward, total_services, allocated_services = 0, 0, 0
                obs, reward, done, info = env.reset()
                while not done:
                    inp = Variable(torch.Tensor(obs).unsqueeze(0),
                                   volatile=True)  # no gradient updates
                    value, action, action_log_prob = actor_critic.act(
                        inputs=inp, deterministic=True)  # deterministic action selection
                    action = action.data.numpy()[0]
                    obs, reward, done, info = env.step(action=action[0])
                    total_reward += reward
                    if reward == ARRIVAL_NEWPORT or reward == ARRIVAL_NOPORT:
                        allocated_services += 1
                    if args.step_over.startswith('one_time'):
                        if info:
                            total_services += 1
                    elif args.step_over.startswith('one_service'):
                        total_services += 1
                    else:
                        raise NotImplementedError
                models[params['update_i']]['time'] = t
                models[params['update_i']]['reward'] = total_reward
                models[params['update_i']]['total_services'] = total_services
                models[params['update_i']][
                    'allocated_services'] = allocated_services
                models[params['update_i']]['bp'] = (
                    total_services - allocated_services) / total_services
        # Print the simulation results
        # print("|updated model|test index|reward|bp|total services|allocated services|")
        # print("|:-----|:-----|:-----|:-----|:-----|:-----|")
        # for m in sorted(models):
            for i in range(times):
                print("|{up}|{id}|{r}|{bp:.4f}|{ts}|{als}|".format(
                    up=params['update_i'],
                    id=models[params['update_i']]['time'],
                    r=models[params['update_i']]['reward'],
                    bp=models[params['update_i']]['bp'],
                    ts=models[params['update_i']]['total_services'],
                    als=models[params['update_i']]['allocated_services']))
        return

    # Create the game environments
    envs = [
        make_env(net_config=args.net,
                 wave_num=args.wave_num,
                 k=args.k,
                 mode=args.mode,
                 img_width=args.img_width,
                 img_height=args.img_height,
                 weight=weight,
                 step_over=args.step_over) for _ in range(args.workers)
    ]
    envs = SubprocEnv(envs)
    # Create the storage that holds and updates rollout variables during the run
    rollout = RolloutStorage(num_steps=args.num_steps,
                             num_processes=args.workers,
                             obs_shape=obs_shape,
                             action_shape=action_shape)
    current_obs = torch.zeros(args.workers, *obs_shape)

    observation, _, _, _ = envs.reset()
    update_current_obs(current_obs, observation, channel_num)

    rollout.observations[0].copy_(current_obs)
    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.workers, 1])
    final_rewards = torch.zeros([args.workers, 1])

    if args.cuda.startswith("True"):
        current_obs = current_obs.cuda()
        rollout.cuda()

    start = time.time()
    log_start = time.time()
    total_services = 0  # number of service requests that arrived during log_interval
    allocated_services = 0  # number of services successfully allocated during log_interval
    update_begin = 0

    # Check whether we are resuming a previous training run
    if args.resume:
        pms = torch.load(args.resume)
        actor_critic.load_state_dict(pms['state_dict'])
        optimizer.load_state_dict(pms['optimizer'])
        update_begin = pms['update_i']
        print("resume process from update_i {}, with base_lr {}".format(
            update_begin, args.base_lr))

    for updata_i in range(update_begin, num_updates):
        update_start = time.time()
        for step in range(args.num_steps):
            # Select an action
            inp = Variable(rollout.observations[step], volatile=True)  # no gradient updates
            value, action, action_log_prob = actor_critic.act(
                inputs=inp, deterministic=False)
            # print(action)
            # Squeeze the action dim and move it to the CPU; since no GPU is used here this is effectively a no-op, kept as a reminder.
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            # Observe the current observation and the next one
            envs.step_async(cpu_actions)
            obs, reward, done, info = envs.step_wait(
            )  # reward and done are (n,)-shaped numpy.ndarray vectors
            #  if reward == ARRIVAL_NEWPORT_NEWPORT or reward == ARRIVAL_NOPORT_NEWPORT or reward == ARRIVAL_NOPORT_NOPORT:
            #     allocated_services += 1
            print(reward)
            for i in reward:
                if i == ARRIVAL_NEWPORT or i == ARRIVAL_NOPORT:
                    allocated_services += 1
        #  allocated_services += (reward==ARRIVAL_NEWPORT_NEWPORT or reward==ARRIVAL_NOPORT_NEWPORT or reward==ARRIVAL_NOPORT_NOPORT).any().sum()  # count the rewards that mark a successful allocation
        # TODO: unresolved
            if args.step_over.startswith('one_service'):
                total_services += (info == True).sum()  # count how many service-arrival events this step contains
            # elif args.step_over.startswith('one_service'):
            #     total_services += args.workers
            else:
                raise NotImplementedError
            reward = torch.from_numpy(np.expand_dims(reward, 1)).float()
            episode_rewards += reward  # accumulate the reward
            # If an episode has finished, restart the episode_rewards and final_rewards bookkeeping, re-accumulating from the returned reward.
            masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done
                                       ])  # True --> 0, False --> 1
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            #            if done[len(done)-1]:
            #               print('final number of ports at game end:', envs.get_all_edges_port())

            if args.cuda.startswith("True"):
                masks = masks.cuda()

            # Unsqueeze masks by two dims and multiply with current_obs, so the obs of finished game processes become 0 (an all-black frame, i.e. the game-over screen).
            current_obs *= masks.unsqueeze(2).unsqueeze(2)
            update_current_obs(current_obs=current_obs,
                               obs=obs,
                               channel_num=channel_num)
            # Store the results of this step
            rollout.insert(step=step,
                           current_obs=current_obs,
                           action=action.data,
                           action_log_prob=action_log_prob.data,
                           value_pred=value.data,
                           reward=reward,
                           mask=masks)

        # TODO: force stop
        # envs.close()
        # return

        # Be careful not to reuse variables defined in the for loop above; watch the naming and use of the variables below.
        next_inp = Variable(rollout.observations[-1], volatile=True)  # no gradient updates
        next_value = actor_critic(next_inp)[0].data  # value estimate for the next step
        rollout.compute_returns(next_value=next_value,
                                use_gae=False,
                                gamma=args.gamma,
                                tau=None)

        if args.algo.startswith('a2c'):
            # A2C gradient update
            inps = Variable(rollout.observations[:-1].view(-1, *obs_shape))
            acts = Variable(rollout.actions.view(-1, action_shape))

            # print("a2cs's acts size is {}".format(acts.size()))
            value, action_log_probs, cls_entropy = actor_critic.evaluate_actions(
                inputs=inps, actions=acts)
            print(cls_entropy.data)
            # print("inputs' shape is {}".format(inps.size()))
            # print("value's shape is {}".format(value.size()))
            value = value.view(args.num_steps, args.workers, 1)
            # print("action_log_probs's shape is {}".format(action_log_probs.size()))
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.workers, 1)
            # Compute the losses
            advantages = Variable(rollout.returns[:-1]) - value
            value_loss = advantages.pow(2).mean()  # L2Loss or MSE Loss
            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()
            total_loss = value_loss * args.value_loss_coef + action_loss - cls_entropy * args.entropy_coef

            optimizer.zero_grad()
            total_loss.backward()
            # Gradient clipping (https://www.cnblogs.com/lindaxin/p/7998196.html)
            nn.utils.clip_grad_norm(actor_critic.parameters(),
                                    args.max_grad_norm)
            # average_gradients(actor_critic)
            optimizer.step()
        elif args.algo.startswith('ppo'):
            # PPO gradient update
            advantages = rollout.returns[:-1] - rollout.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)
            for e in range(args.ppo_epoch):
                data_generator = rollout.feed_forward_generator(
                    advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, actions_batch, \
                    return_batch, masks_batch, old_action_log_probs_batch, \
                    adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, cls_entropy = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

        # Wrap up after the update
        rollout.after_update()
        update_time = time.time() - update_start
        print("updates {} finished, cost time {}:{}".format(
            updata_i, update_time // 60, update_time % 60))
        # print("total services is {}".format(total_services))
        # Save the model
        if updata_i % args.save_interval == 0:
            save_path = os.path.join(args.save_dir, 'a2c')
            save_path = os.path.join(save_path, args.cnn)
            save_path = os.path.join(save_path, args.step_over)
            save_path = os.path.join(save_path, args.parameter)
            if os.path.exists(save_path) and os.path.isdir(save_path):
                pass
            else:
                os.makedirs(save_path)
            save_file = os.path.join(save_path, str(updata_i) + '.tar')
            save_content = {
                'update_i': updata_i,
                'state_dict': actor_critic.state_dict(),
                'optimizer': optimizer.state_dict(),
                'mean_reward': final_rewards.mean()
            }
            torch.save(save_content, save_file)

        # Print the training log
        if updata_i % args.log_interval == 0:
            end = time.time()
            interval = end - log_start
            remaining_seconds = (num_updates - updata_i -
                                 1) / args.log_interval * interval
            remaining_hours = int(remaining_seconds // 3600)
            remaining_minutes = int((remaining_seconds % 3600) / 60)
            total_num_steps = (updata_i + 1) * args.workers * args.num_steps
            blocked_services = total_services - allocated_services
            bp = blocked_services / total_services
            wave_port_num, total_port_num = envs.get_all_edges_port()
            wave_occ_sum, resource_utilization_rate = envs.get_resourceUtilization(
            )

            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, \
            entropy {:.5f}, value loss {:.5f}, policy loss {:.8f}, remaining time {}:{}, 阻塞率为{}/{}={}, \
                  各个波长端口数量为{}, 总的端口数量为{}, 带宽占用情况为{}, 资源占用率为{}".format(
                    updata_i, total_num_steps,
                    int(total_num_steps / (end - start)), final_rewards.mean(),
                    final_rewards.median(), final_rewards.min(),
                    final_rewards.max(), cls_entropy.data, value_loss.data,
                    action_loss.data, remaining_hours, remaining_minutes,
                    blocked_services, total_services, bp, wave_port_num,
                    total_port_num, wave_occ_sum, resource_utilization_rate))
            # raise NotImplementedError
            total_services = 0
            allocated_services = 0
            log_start = time.time()

    envs.close()
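
Exemple #28 checkpoints the update index, model weights, optimizer state and mean reward, and restores them when args.resume is set. A minimal sketch of that save/resume pattern follows; the helper names are assumptions.

import torch

def save_training_state(path, update_i, actor_critic, optimizer, mean_reward):
    # Same dict layout as save_content above.
    torch.save({'update_i': update_i,
                'state_dict': actor_critic.state_dict(),
                'optimizer': optimizer.state_dict(),
                'mean_reward': mean_reward}, path)

def resume_training_state(path, actor_critic, optimizer):
    # Mirrors the args.resume branch: restore weights and optimizer state,
    # then return the update index to continue from.
    pms = torch.load(path)
    actor_critic.load_state_dict(pms['state_dict'])
    optimizer.load_state_dict(pms['optimizer'])
    return pms['update_i']
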
Exemple #29
0
    def train(self, dict_model, config, training_target):
        self.NUM_AGENTS = len(dict_model)
        # print("train", dict_model)
        # actor_critics = []
        # local_brains = []
        # rollouts = []
        actor_critic = dict_model[training_target]
        global_brain = Brain(actor_critic, config)
        rollout = RolloutStorage(self.NUM_ADVANCED_STEP, self.NUM_PARALLEL,
                                 self.obs_shape, self.device)

        current_obs = torch.zeros(self.NUM_PARALLEL,
                                  self.obs_shape).to(self.device)
        episode_rewards = torch.zeros([self.NUM_PARALLEL, 1])
        final_rewards = torch.zeros([self.NUM_PARALLEL, 1])

        episode = np.zeros(self.NUM_PARALLEL)

        obs = self.envs.reset()
        obs = np.array(obs)
        obs = torch.from_numpy(obs).float()
        current_obs = obs

        rollout.observations[0].copy_(current_obs)

        while True:
            for step in range(self.NUM_ADVANCED_STEP):
                with torch.no_grad():
                    # action = actor_critic.act(rollouts.observations[step]) # decide the action here
                    action = torch.zeros(self.NUM_PARALLEL,
                                         self.NUM_AGENTS).long().to(
                                             self.device)  # each agent's action for each observation
                    if DEBUG:
                        print("actionサイズ", self.NUM_PARALLEL, self.NUM_AGENTS)
                    for i, (k, v) in enumerate(dict_model.items()):
                        if k == training_target:
                            tmp_action = v.act(current_obs)
                            target_action = copy.deepcopy(tmp_action)
                        else:
                            tmp_action = v.act_greedy(current_obs)
                        action[:, i] = tmp_action.squeeze()
                if DEBUG: print("step前のここ?", action.shape)
                obs, reward, done, infos = self.envs.step(action)  # this advances the simulation
                episode_rewards += reward

                # if done then clean the history of observation
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])
                if DEBUG: print("done.shape", done.shape)
                if DEBUG: print("masks.shape", masks.shape)
                if DEBUG: print("obs.shape", obs.shape)
                with open(self.resdir + "/episode_reward.txt", "a") as f:
                    for i, info in enumerate(infos):
                        if 'episode' in info:
                            f.write("{:}\t{:}\t{:}\t{:}\n".format(
                                training_target, episode[i], info['env_id'],
                                info['episode']['r']))
                            print(training_target, episode[i], info['env_id'],
                                  info['episode']['r'])
                            episode[i] += 1

                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards

                episode_rewards *= masks
                current_obs *= masks

                current_obs = obs  # the observation is updated here

                rollout.insert(current_obs, target_action.data, reward, masks,
                               self.NUM_ADVANCED_STEP)
                with open(self.resdir + "/reward_log.txt",
                          "a") as f:  # このログはエピソードが終わったときだけでいい->要修正
                    f.write(
                        "{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\n".
                        format(self.loop_i, training_target, episode.mean(),
                               step,
                               reward.max().numpy(),
                               reward.min().numpy(),
                               reward.mean().numpy(),
                               episode_rewards.max().numpy(),
                               episode_rewards.min().numpy(),
                               episode_rewards.mean().numpy()))
                    print(self.loop_i, training_target, episode.mean(), step,
                          reward.mean().numpy(),
                          episode_rewards.mean().numpy())

            with torch.no_grad():
                next_value = actor_critic.get_value(
                    rollout.observations[-1]).detach()

            rollout.compute_returns(next_value, self.gamma)
            value_loss, action_loss, total_loss, entropy = global_brain.update(
                rollout)

            with open(self.resdir + "/loss_log.txt", "a") as f:
                f.write("{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\n".format(
                    self.loop_i, training_target, episode.mean(), value_loss,
                    action_loss, entropy, total_loss))
                print(
                    "value_loss {:.4f}\taction_loss {:.4f}\tentropy {:.4f}\ttotal_loss {:.4f}"
                    .format(value_loss, action_loss, entropy, total_loss))

            rollout.after_update()

            if int(episode.mean()) + 1 > self.NUM_EPISODES:
                # print("ループ抜ける")
                break
        # the best model used to be saved here (note to self)
        print("finished training agent %s" % training_target)
        dict_model[training_target] = actor_critic  # {}
        return dict_model
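
In the rollout loop above, only the agent currently being trained samples its action; every other agent acts greedily, and the per-agent actions are packed into one joint tensor. The sketch below illustrates that joint-action assembly, assuming each model exposes act and act_greedy returning (num_parallel, 1) tensors of integer action indices.

import torch

def build_joint_action(dict_model, training_target, current_obs, num_parallel, device):
    # One column per agent; only the training target's action is stochastic.
    action = torch.zeros(num_parallel, len(dict_model)).long().to(device)
    target_action = None
    for i, (name, model) in enumerate(dict_model.items()):
        if name == training_target:
            a = model.act(current_obs)
            target_action = a
        else:
            a = model.act_greedy(current_obs)
        action[:, i] = a.squeeze()
    return action, target_action
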
Exemple #30
0
def main():
    torch.set_num_threads(1)
    device = torch.device("cpu")

    # if args.vis:
    #     from visdom import Visdom
    #     viz = Visdom(port=args.port)
    #     win = None

    # envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
    #                     args.gamma, args.log_dir, args.add_timestep, device, False)

    observation_space = Box(low=0, high=10000, shape=(19,), dtype=np.float32)  # Box(84,84,4)
    action_space = Discrete(7)  # Discrete(4)

    actor_critic = Policy(observation_space.shape, action_space, base_kwargs={'recurrent': None})
    actor_critic.to(device)

    # if args.algo == 'a2c':
    #     agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
    #                            args.entropy_coef, lr=args.lr,
    #                            eps=args.eps, alpha=args.alpha,
    #                            max_grad_norm=args.max_grad_norm)
    # elif args.algo == 'ppo':
    #     agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
    #                      args.value_loss_coef, args.entropy_coef, lr=args.lr,
    #                            eps=args.eps,
    #                            max_grad_norm=args.max_grad_norm)
    # elif args.algo == 'acktr':
    agent = algo.A2C_ACKTR(actor_critic, value_loss_coef=0.1,
                           entropy_coef=0.01, acktr=True)

    rollouts = RolloutStorage(8000, 1, observation_space.shape, action_space, actor_critic.recurrent_hidden_state_size)

    obs = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    rollouts.obs[0].copy_(torch.Tensor(obs))
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    f = open('poktr_20_origin_2.txt', 'a')
    f.write("\noriginal loss(schedule 6 packets):")
    start = time.time()
    for j in range(num_updates):  # num_updates
        net = Net()
        node_list, path_list = net.read_graph(net.node_list, net.path_list)
        startnode = node_list[0]  # start node
        net.get_data(startnode)
        count = 0
        remove_count = 0  # count of dropped packets
        end_time = startnode.messages[0].end_time
        s = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, end_time]
        states = [[0], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]  # stores the states of all nodes
        ep_r = 0
        ep_acc_r = 0
        obs[:] = s
        reward_ten = torch.Tensor(1, 1)
        for step in range(8000):
            # Sample actions
            count += 1
            old_action_log_prob = torch.Tensor([[0]])
            # print(rollouts, rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step])
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])
                action_item = action.item()  # convert the Tensor value to a Python int

            # Observe the reward and next obs
            obs, reward, done, states, remove_count, acc_r, su_packets = net.schedule(action_item, count, states, node_list, path_list,
                                                                            remove_count)

            ep_r += reward
            ep_acc_r += acc_r
            reward_ten[[0]] = reward
            # for info in infos:
            #     if 'episode' in info.keys():
            #         episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done else [1.0]])
            # print((obs), recurrent_hidden_states, torch.Tensor(action), type(action_log_prob), type(value), type(reward), type(masks))
            rollouts.insert(torch.Tensor(obs), recurrent_hidden_states, action, action_log_prob, value, reward_ten, masks)
            old_action_log_prob = action_log_prob
            # print(action_log_prob, action_log_prob.shape)

        f.write("\ntime:"+str(time.strftime('%H:%M:%S', time.localtime(time.time())))+"|"+str(j)+"|ep_r:"+str(ep_r)+"|pakcets:"+str(su_packets)+"|remove:"+str(remove_count)+"|ep_acc_r:"+str(ep_acc_r / 8000))

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, False, 0.99, 0.95)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        print("time:", time.strftime('%H:%M:%S', time.localtime(time.time())), "|", j, "|ep_r:", ep_r, "|pakcets:",
              su_packets, "|remove:", remove_count, "|ep_acc_r:", ep_acc_r / 8000, "|value_loss:", value_loss,
              "|action_loss:", action_loss, "|entropy:", dist_entropy)
        rollouts.after_update()

        # if j % 100 == 0:
        #     save_path = os.path.join('./trained_models/', 'acktr')
        #     try:
        #         os.makedirs(save_path)
        #     except OSError:
        #         pass

            # A really ugly way to save a model to CPU
            # save_model = actor_critic
            # if args.cuda:
            #     save_model = copy.deepcopy(actor_critic).cpu()

            # save_model = [save_model,
            #               getattr(get_vec_normalize(envs), 'ob_rms', None)]

            # torch.save(save_model, os.path.join(save_path, "acktr" + ".pt"))

        total_num_steps = (j + 1) * num_processes * num_steps