Example 1
    def __init__(self, envs, hparams):

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']

        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.ppo_epoch = hparams['ppo_epoch']
        self.batch_size = hparams['batch_size']
        self.clip_param = hparams['clip_param']



        if len(envs.observation_space.shape) == 3:
            actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)
        else:
            actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)

        if envs.action_space.__class__.__name__ == "Discrete":
            action_shape = 1
        else:
            action_shape = envs.action_space.shape[0]
        self.action_shape = action_shape

        rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space)
        #it has a self.state that is [steps, processes, obs]
        #steps is used to compute expected reward

        if self.cuda:
            actor_critic.cuda()
            rollouts.cuda()

        self.eps = hparams['eps']

        # self.optimizer = optim.RMSprop(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
        self.optimizer = optim.Adam(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])

        # if hparams['lr_schedule'] == 'linear':
        self.init_lr = hparams['lr']
        self.final_lr = hparams['final_lr']
        #     lr_func = lambda epoch: max( init_lr*(1.-(epoch/500.)), final_lr)
        #     self.optimizer2 = lr_scheduler.LambdaLR(self.optimizer, lr_lambda=lr_func)
        # self.current_lr = hparams['lr']


        self.actor_critic = actor_critic
        self.rollouts = rollouts


        self.old_model = copy.deepcopy(self.actor_critic)
    def select_network(self):
        if len(self.envs.observation_space.shape) == 3:
            actor_critic = CNNPolicy(self.obs_shape[0], self.envs.action_space,
                                     self.args.recurrent_policy)
        else:
            assert not self.args.recurrent_policy, \
                "Recurrent policy is not implemented for the MLP controller"
            actor_critic = MLPPolicy(self.obs_shape[0], self.envs.action_space)
            #actor_critic = BPW_MLPPolicy(obs_shape[0], self.envs.action_space)
        return actor_critic
Example 3
    def __init__(self, envs, hparams):

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']

        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.ppo_epoch = hparams['ppo_epoch']
        self.batch_size = hparams['batch_size']
        self.clip_param = hparams['clip_param']



        if len(envs.observation_space.shape) == 3:
            actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)
        else:
            actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)

        if envs.action_space.__class__.__name__ == "Discrete":
            action_shape = 1
        else:
            action_shape = envs.action_space.shape[0]
        self.action_shape = action_shape

        rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space)
        #it has a self.state that is [steps, processes, obs]
        #steps is used to compute expected reward

        if self.cuda:
            actor_critic.cuda()
            rollouts.cuda()

        self.eps = hparams['eps']

        # self.optimizer = optim.RMSprop(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
        self.optimizer = optim.Adam(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])

        # if hparams['lr_schedule'] == 'linear':
        self.init_lr = hparams['lr']
        self.final_lr = hparams['final_lr']
        #     lr_func = lambda epoch: max( init_lr*(1.-(epoch/500.)), final_lr)
        #     self.optimizer2 = lr_scheduler.LambdaLR(self.optimizer, lr_lambda=lr_func)
        # self.current_lr = hparams['lr']


        self.actor_critic = actor_critic
        self.rollouts = rollouts


        self.old_model = copy.deepcopy(self.actor_critic)
Example 4
    def __init__(self, envs, hparams):

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']

        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.batch_size = hparams['batch_size']
        self.a2c_epochs = hparams['a2c_epochs']







        if len(envs.observation_space.shape) == 3:
            actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)
        else:
            actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)

        if envs.action_space.__class__.__name__ == "Discrete":
            action_shape = 1
        else:
            action_shape = envs.action_space.shape[0]
        self.action_shape = action_shape

        rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space)
        #it has a self.state that is [steps, processes, obs]
        #steps is used to compute expected reward

        if self.cuda:
            actor_critic.cuda()
            rollouts.cuda()

        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])
        else:
            print ('no opt specified')

        self.actor_critic = actor_critic
        self.rollouts = rollouts
Example 5
    def __init__(self, envs, hparams):

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']

        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.batch_size = hparams['batch_size']
        self.a2c_epochs = hparams['a2c_epochs']







        if len(envs.observation_space.shape) == 3:
            actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)
        else:
            actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)

        if envs.action_space.__class__.__name__ == "Discrete":
            action_shape = 1
        else:
            action_shape = envs.action_space.shape[0]
        self.action_shape = action_shape

        rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space)
        #it has a self.state that is [steps, processes, obs]
        #steps is used to compute expected reward

        if self.cuda:
            actor_critic.cuda()
            rollouts.cuda()

        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])
        else:
            print ('no opt specified')

        self.actor_critic = actor_critic
        self.rollouts = rollouts
Example 6
def run_maml(args):
    config = utils.load_config(args.config)
    train_params = config["train_params"]

    # open and close env just to get right action and obs space
    env = sonic_utils.make_from_config(config['env_params'], True)
    env.close()

    # init model
    model = CNNPolicy(
        env.observation_space, env.action_space, train_params["vf_coef"],
        train_params["ent_coef"], train_params["lr_meta"], train_params["max_grad_norm"]
    )

    workers = find_workers("worker")
    init_workers(workers, config, model.get_weights())

    # start run
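    # each worker.run() is started asynchronously via Pyro4.Future, so the remote
    # gradient computations run in parallel and are collected later in wait_run_end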
    workers_results = {w: Pyro4.Future(w.run)() for w in workers}

    savedir = utils.prepare_exp_dir(config, args.exp_name)

    updates = 0
    while True:
        # first zero all grads
        model.optimizer.zero_grad()

        # then apply add grads from remote workers
        wait_run_end(workers_results, model)

        # apply gradient
        model.optimizer.step()

        updates += 1

        # save last weights
        if config['log']['save_last']:
            fpath = savedir / 'last.pt'
            model.save(fpath)

        # save on save period
        if updates % config['log']["save_interval"] == 0 or updates == 1:
            fpath = savedir / '{}.pt'.format(updates)
            model.save(fpath)
Example 7
    def __init__(self, envs, cuda, num_steps, num_processes, obs_shape, lr,
                 eps, alpha, use_gae, gamma, tau, value_loss_coef,
                 entropy_coef):

        if len(envs.observation_space.shape) == 3:
            actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
        else:
            actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

        if envs.action_space.__class__.__name__ == "Discrete":
            action_shape = 1
        else:
            action_shape = envs.action_space.shape[0]

        if cuda:
            actor_critic.cuda()

        # if args.algo == 'a2c':
        self.optimizer = optim.RMSprop(actor_critic.parameters(), lr,
                                       eps=eps, alpha=alpha)

        rollouts = RolloutStorage(num_steps, num_processes, obs_shape,
                                  envs.action_space)
        #it has a self.state that is [steps, processes, obs]
        #steps is used to compute expected reward

        if cuda:
            rollouts.cuda()

        self.actor_critic = actor_critic
        self.rollouts = rollouts

        self.use_gae = use_gae
        self.gamma = gamma
        self.tau = tau

        self.obs_shape = obs_shape
        self.action_shape = action_shape
        self.num_steps = num_steps
        self.num_processes = num_processes
        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef
Example 8
def main():
    print("#######")
    print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'






    print (args.cuda)
    print (args.num_steps)
    print (args.num_processes)
    print (args.lr)
    print (args.eps)
    print (args.alpha)
    print (args.use_gae)
    print (args.gamma)
    print (args.tau)
    print (args.value_loss_coef)
    print (args.entropy_coef)

    # if args.vis:
    #     from visdom import Visdom
    #     viz = Visdom()
    #     win = None





    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])

    # print('here3')
    # fdasf





    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)
    # elif args.algo == 'ppo':
    #     optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    # elif args.algo == 'acktr':
    #     optimizer = KFACOptimizer(actor_critic)








    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space)
    #it has a self.state that is [steps, processes, obs]
    #steps is used to compute expected reward




    current_state = torch.zeros(args.num_processes, *obs_shape)
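    # frame stacking: shift the previous frames left and write the newest
    # observation into the last shape_dim0 channels of current_state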
    def update_current_state(state):
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)








    rollouts.states[0].copy_(current_state)
    #set the first state to current state






    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    # if args.algo == 'ppo':
    #     old_model = copy.deepcopy(actor_critic)

    start = time.time()
    for j in range(num_updates):
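        # each update: collect num_steps of experience from every process, then
        # perform one policy-gradient update (A2C path; the PPO path below is commented out)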
        for step in range(args.num_steps):



            # Sample actions
            value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True))
            # make prediction using state that you put into rollouts


            cpu_actions = action.data.squeeze(1).cpu().numpy()
            # Observe reward and next state
            state, reward, done, info = envs.step(cpu_actions)
            # print (state.shape) # [nProcesss, ndims, height, width]
            # fsdf
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            # final_rewards is only used for logging; the mask zeroes out the running
            # episode_reward of any env that just finished (and is also stored in rollouts)
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #if an env is done
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks

            update_current_state(state)




            rollouts.insert(step, current_state, action.data, value.data, reward, masks)
            # store this step's transition (state, action, value, reward, mask) in the rollout buffer













        next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data
        # use last state to make prediction of next value









        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape))
        # update the running observation-normalization statistics, if the policy has an obs_filter








        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        # compute returns for each step: R_t = r_t + gamma*r_{t+1} + ... + gamma^{T-t}*V(s_T)
        # (or GAE-based returns when use_gae is set)









        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.states[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))
            # note: these action log-probs and value predictions are recomputed here,
            # even though the values were already stored when the rollout was collected






            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

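            # A2C losses: the critic regresses its value predictions toward the returns
            # (squared advantage), and the actor maximizes advantage-weighted log-probs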
            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            # if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
            #     # Sampled fisher, see Martens 2014
            #     actor_critic.zero_grad()
            #     pg_fisher_loss = -action_log_probs.mean()

            #     value_noise = Variable(torch.randn(values.size()))
            #     if args.cuda:
            #         value_noise = value_noise.cuda()

            #     sample_values = values + value_noise
            #     vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

            #     fisher_loss = pg_fisher_loss + vf_fisher_loss
            #     optimizer.acc_stats = True
            #     fisher_loss.backward(retain_graph=True)
            #     optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

            # if args.algo == 'a2c':
            #     nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()
        # elif args.algo == 'ppo':
        #     advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
        #     advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

        #     old_model.load_state_dict(actor_critic.state_dict())
        #     if hasattr(actor_critic, 'obs_filter'):
        #         old_model.obs_filter = actor_critic.obs_filter

        #     for _ in range(args.ppo_epoch):
        #         sampler = BatchSampler(SubsetRandomSampler(range(args.num_processes * args.num_steps)), args.batch_size * args.num_processes, drop_last=False)
        #         for indices in sampler:
        #             indices = torch.LongTensor(indices)
        #             if args.cuda:
        #                 indices = indices.cuda()
        #             states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices]
        #             actions_batch = rollouts.actions.view(-1, action_shape)[indices]
        #             return_batch = rollouts.returns[:-1].view(-1, 1)[indices]

        #             # Reshape to do in a single forward pass for all steps
        #             values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(Variable(states_batch), Variable(actions_batch))

        #             _, old_action_log_probs, _ = old_model.evaluate_actions(Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True))

        #             ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data))
        #             adv_targ = Variable(advantages.view(-1, 1)[indices])
        #             surr1 = ratio * adv_targ
        #             surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
        #             action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)

        #             value_loss = (Variable(return_batch) - values).pow(2).mean()

        #             optimizer.zero_grad()
        #             (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
        #             optimizer.step()




        rollouts.states[0].copy_(rollouts.states[-1])
        # the first state of the next rollout is the last state of this one









        # if j % args.save_interval == 0 and args.save_dir != "":
        #     save_path = os.path.join(args.save_dir, args.algo)
        #     try:
        #         os.makedirs(save_path)
        #     except OSError:
        #         pass

        #     # A really ugly way to save a model to CPU
        #     save_model = actor_critic
        #     if args.cuda:
        #         save_model = copy.deepcopy(actor_critic).cpu()
        #     torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), -dist_entropy.data[0],
                       value_loss.data[0], action_loss.data[0]))
Example 9
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")
    print(args)

    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    for gamma in args.gamma:
        with open(args.log_dir + '/MSE_' + str(gamma) + '_monitor.csv',
                  "wt") as monitor_file:
            monitor = csv.writer(monitor_file)
            monitor.writerow([
                'update', 'error',
                str(int(args.num_frames) // args.num_steps)
            ])

    os.environ['OMP_NUM_THREADS'] = '1'

    print("Using env {}".format(args.env_name))

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    num_heads = len(
        args.gamma) if not args.reward_predictor else len(args.gamma) - 1

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0],
                                 envs.action_space,
                                 num_heads=num_heads,
                                 hidden_size=args.hidden_size)
    else:
        actor_critic = MLPPolicy(obs_shape[0],
                                 envs.action_space,
                                 num_heads=num_heads,
                                 reward_predictor=args.reward_predictor,
                                 use_s=args.use_s,
                                 use_s_a=args.use_s_a,
                                 use_s_a_sprime=args.use_s_a_sprime)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    lrs = [args.lr] * len(actor_critic.param_groups)

    if not args.reward_predictor:
        assert len(actor_critic.param_groups) == len(lrs)
        model_params = [{
            'params': model_p,
            'lr': args.lr
        } for model_p, lr in zip(actor_critic.param_groups, lrs)]
    else:
        model_params = [{
            'params': model_p,
            'lr': p_lr
        } for model_p, p_lr in zip(actor_critic.param_groups[:-1], lrs)]
        model_params.append({
            'params': actor_critic.param_groups[-1],
            'lr': args.lr_rp
        })

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(model_params,
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)

    elif args.algo == 'ppo':
        optimizer = optim.Adam(model_params, args.lr, eps=args.eps)

    rollouts = RolloutStorage(args.num_steps,
                              args.num_processes,
                              obs_shape,
                              envs.action_space,
                              actor_critic.state_size,
                              gamma=args.gamma,
                              use_rp=args.reward_predictor)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs, obs_tensor):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            obs_tensor[:, :-shape_dim0] = obs_tensor[:, shape_dim0:]
        obs_tensor[:, -shape_dim0:] = obs

    obs = envs.reset()

    update_current_obs(obs, current_obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    advantages_list = []

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            cpu_actions = add_gaussian_noise(cpu_actions, args.action_noise)

            # Observe reward and next obs
            obs, raw_reward, done, info = envs.step(cpu_actions)

            reward = np.copy(raw_reward)

            reward = add_gaussian_noise(reward, args.reward_noise)

            reward = epsilon_greedy(reward, args.reward_epsilon,
                                    args.reward_high, args.reward_low)

            raw_reward = torch.from_numpy(
                np.expand_dims(np.stack(raw_reward), 1)).float()

            episode_rewards += raw_reward

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs, current_obs)

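            # with a reward predictor, blend the observed reward with the predicted
            # reward r_hat; the predictor's weight p_hat ramps from 0 to 1 over the
            # first rp_burn_in updates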
            if args.reward_predictor:
                r_hat = actor_critic.predict_reward(
                    Variable(rollouts.observations[step], volatile=True),
                    action, Variable(current_obs, volatile=True))
                p_hat = min(args.rp_burn_in, j) / args.rp_burn_in
                estimate_reward = (1 -
                                   p_hat) * reward + p_hat * r_hat.data.cpu()
                reward = torch.cat([reward, estimate_reward], dim=-1)
                value = torch.cat([r_hat, value], dim=-1).data
            else:
                value = value.data

            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value, reward, masks,
                            raw_reward)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        if args.reward_predictor:
            if args.use_s or args.use_s_a:
                r_hat = actor_critic.predict_reward(
                    Variable(rollouts.observations[-1], volatile=True),
                    Variable(rollouts.actions[-1], volatile=True), None).data
                next_value = torch.cat([r_hat, next_value], dim=-1)
            else:
                next_value = torch.cat([
                    torch.zeros(list(next_value.size())[:-1] + [1]), next_value
                ],
                                       dim=-1)

        rollouts.compute_returns(next_value, args.use_gae, args.tau)

        if args.algo in ['a2c']:

            batch_states = Variable(rollouts.states[0].view(
                -1, actor_critic.state_size))
            batch_masks = Variable(rollouts.masks[:-1].view(-1, 1))
            batch_obs = Variable(rollouts.observations[:-1].view(
                -1, *obs_shape))
            batch_actions = Variable(rollouts.actions.view(-1, action_shape))

            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                batch_obs, batch_states, batch_masks, batch_actions)
            if args.reward_predictor:
                batch_obs_prime = Variable(rollouts.observations[1:].view(
                    -1, *obs_shape))
                values = torch.cat([
                    actor_critic.predict_reward(batch_obs, batch_actions,
                                                batch_obs_prime), values
                ],
                                   dim=-1)
            returns_as_variable = Variable(rollouts.returns[:-1])

            batched_v_loss = 0

            values = values.view(returns_as_variable.size())
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = returns_as_variable - values
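            # the critic's loss sums squared advantages over all value heads, while the
            # policy gradient below uses only the last head's advantage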
            value_loss = advantages.pow(2).sum(-1).mean()
            action_loss = -(Variable(advantages[:, :, -1].unsqueeze(-1).data) *
                            action_log_probs).mean()
            if args.reward_predictor:
                rp_error = (values[:, :, 0].data -
                            rollouts.raw_rewards).pow(2).mean()
                advantages_list.append([
                    rp_error,
                    advantages[:, :, -1].pow(2).mean().data.cpu().numpy()[0]
                ])
            else:
                advantages_list.append(
                    advantages[:, :, -1].pow(2).mean().data.cpu().numpy()[0])

            optimizer.zero_grad()
            (batched_v_loss + value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]

            advantages = advantages[:, :, -1]

            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            for e in range(args.ppo_epoch):
                data_generator = rollouts.feed_forward_generator(
                    advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ, observations_batch_prime, true_rewards_batch, \
                            noisy_observations_batch, true_observations_batch = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    if args.reward_predictor:
                        values = torch.cat([
                            actor_critic.predict_reward(
                                Variable(observations_batch),
                                Variable(actions_batch),
                                Variable(observations_batch_prime)), values
                        ],
                                           dim=-1)

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    td = (Variable(return_batch) - values).pow(2)

                    value_loss = td.sum(-1).mean()

                    if args.reward_predictor:
                        rp_error = (values[:, 0].data -
                                    true_rewards_batch).pow(2).mean()
                        advantages_list.append(
                            [rp_error, td[:, -1].mean(0).data.cpu().numpy()])
                    else:
                        advantages_list.append(
                            td[:, -1].mean(0).data.cpu().numpy())

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()

                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, "
                "entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    final_rewards.mean(), final_rewards.median(),
                    final_rewards.min(), final_rewards.max(),
                    dist_entropy.data[0], value_loss.data[0],
                    action_loss.data[0]))

            if len(advantages_list) > 2:
                advantages_array = np.array(advantages_list).reshape(
                    -1, len(args.gamma)).T
                for g, gamma in enumerate(args.gamma):
                    with open(
                            args.log_dir + '/MSE_' + str(gamma) +
                            '_monitor.csv', "a") as monitor_file:
                        monitor = csv.writer(monitor_file)
                        monitor.writerow(
                            [total_num_steps,
                             np.mean(advantages_array[g])])

            advantages_list = []
Example 10
def train(config, exp_name='test'):

    train_params = config['train_params']
    env_params = config['env_params']
    log_params = config["log"]

    savedir = None
    if log_params["log_dir"] is not None:
        savedir = utils.prepare_exp_dir(config, exp_name)

    env = sonic_utils.make_from_config(env_params)

    model = CNNPolicy(env.observation_space, env.action_space,
                      train_params["vf_coef"], train_params["ent_coef"],
                      train_params["lr"], train_params["max_grad_norm"])

    if train_params["weights"] is not None:
        model.load(train_params["weights"], train_params["load_adam_params"])

    seg_gen = traj_segment_generator(model,
                                     env,
                                     train_params['n_steps'],
                                     sample=True)

    total_steps = 0
    updates = 0
    t0 = time()
    epinfobuf = deque(maxlen=train_params["ep_info_len"])
    seg_inds = np.arange(train_params['n_steps'])
    n_batches = train_params["n_steps"] // train_params["batch_size"]
    loss_vals = []
    while True:
        if total_steps > train_params['max_steps']:
            break

        # get batch
        seg = seg_gen.__next__()
        add_vtarg(seg, train_params['gamma'], train_params['lam'])

        # add episode info
        epinfobuf.extend(seg['ep_infos'])

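        # PPO-style update: make n_opt_epochs passes over the collected segment,
        # shuffling indices and optimizing on minibatches of batch_size steps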
        for _ in range(train_params["n_opt_epochs"]):
            np.random.shuffle(seg_inds)
            for i in range(n_batches):
                start = i * train_params["batch_size"]
                end = (i + 1) * train_params["batch_size"]
                inds = seg_inds[start:end]

                losses = model.train(train_params['cliprange'],
                                     seg['ob'][inds], seg['tdlamret'][inds],
                                     seg['ac'][inds], seg['vpred'][inds],
                                     seg["ac_logits"][inds])
                loss_vals.append([l.detach().numpy() for l in losses])

        total_steps += train_params['n_steps']
        updates += 1

        if log_params["log"] and (updates % log_params["log_interval"] == 0
                                  or updates == 1):

            tnow = time()
            fps = int(total_steps / (tnow - t0))
            # ev = explained_variance(values, returns)
            logger.logkv("total_steps", total_steps)
            logger.logkv("nupdates", updates)
            logger.logkv("fps", fps)
            logger.logkv(
                'eprewmean',
                np.mean([epinfo['r'] for epinfo in epinfobuf
                         if 'r' in epinfo]))
            logger.logkv(
                'eprewmean_exp',
                np.mean([
                    epinfo['r_exp'] for epinfo in epinfobuf
                    if 'r_exp' in epinfo
                ]))
            logger.logkv(
                'eplenmean',
                np.mean([epinfo['l'] for epinfo in epinfobuf
                         if 'l' in epinfo]))
            logger.logkv('time_elapsed', tnow - t0)

            for loss_val, loss_name in zip(np.mean(loss_vals, axis=0),
                                           model.loss_names):
                logger.logkv(loss_name, loss_val)
            logger.dumpkvs()

            del loss_vals[:]

        # save last weights
        if log_params['save_last'] and savedir is not None:
            fpath = savedir / 'last.pt'
            model.save(fpath)

        # save on save period
        if (updates % log_params["save_interval"] == 0
                or updates == 1) and savedir is not None:
            fpath = savedir / '{}.pt'.format(updates)
            model.save(fpath)

    return epinfobuf
Example 11
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monit`or (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    # logger = Logger(algorithm_name = args.algo, environment_name = args.env_name, folder = args.folder)
    # logger.save_args(args)

    # print ("---------------------------------------")
    # print ('Saving to', logger.save_folder)
    # print ("---------------------------------------")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
        target_actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                        args.recurrent_policy)

    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)
        target_actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

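    # initialize the target network as an exact copy of the online network's weights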
    for param, target_param in zip(actor_critic.parameters(),
                                   target_actor_critic.parameters()):
        target_param.data.copy_(param.data)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    actor_regularizer_criterion = nn.KLDivLoss()
    optimizer = optim.RMSprop(actor_critic.parameters(),
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
            Variable(rollouts.masks[:-1].view(-1, 1)),
            Variable(rollouts.actions.view(-1, action_shape)))
        """
        Used for KL Constraint in case of Continuous Action Stochastic Policies
        """
        # target_values, target_action_log_probs, target_dist_entropy, target_states, target_action_mean, target_action_std = target_actor_critic.evaluate_actions_mean_and_std(Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
        #                                                                                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
        #                                                                                Variable(rollouts.masks[:-1].view(-1, 1)),
        #                                                                                Variable(rollouts.actions.view(-1, action_shape)))

        # actor_regularizer_loss = (torch.log(action_std/target_action_std) + (action_std.pow(2) + (action_mean - target_action_mean).pow(2))/(2*target_action_std.pow(2)) - 0.5)

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        ### Loss with regularizer added
        ##action_loss = -(Variable(advantages.data) * action_log_probs).mean() + args.actor_lambda * actor_regularizer_loss.mean(0).sum()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        total_loss = value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef
        total_loss.backward()

        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
        optimizer.step()

        ## Exponential average for target updates
        #if (j%args.target_update_interval == 0):
        # for param, target_param in zip(actor_critic.parameters(), target_actor_critic.parameters()):
        #     target_param.data.copy_(args.target_tau * param.data + (1 - args.target_tau) * target_param.data)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

            final_rewards_mean = [final_rewards.mean()]
            final_rewards_median = [final_rewards.median()]
            final_rewards_min = [final_rewards.min()]
            final_rewards_max = [final_rewards.max()]

            # logger.record_data(final_rewards_mean, final_rewards_median, final_rewards_min, final_rewards_max)
            # logger.save()

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
Example 12
def main():
    envs = [make_env(env_name, seed, rank, log_dir) for rank in range(num_processes)]
    envs = SubprocVecEnv(envs)
    obs_shape = envs.observation_space.shape
    obs_shape = [obs_shape[0]*num_stack, *obs_shape[1:]]
    actor_critic = CNNPolicy(obs_shape[0], envs.action_space, False)
    if cuda:
        actor_critic.cuda()
    optimizer = optim.RMSprop(actor_critic.parameters(), lr, eps=eps, alpha=alpha)

    rollouts = RolloutStorage(num_steps, num_processes, obs_shape, envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(num_processes, *obs_shape)
    
    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)
    episode_rewards = torch.zeros([num_processes,1])
    final_rewards = torch.zeros([num_processes,1])
    if cuda:
        rollouts.cuda()
        current_obs = current_obs.cuda()
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
        
        # test
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(Variable(rollouts.observations[step], volatile=True),
                                                                  Variable(rollouts.states[step], volatile=True),
                                                                  Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze().cpu().numpy()
            #print(cpu_action)

            # observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            # stack: make sure that reward is a numpy array(convert list to ndarray)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            if cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            # update obs and rollouts
            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks)

        # compute current update's return
        next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True),
                                  Variable(rollouts.states[-1], volatile=True),
                                  Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, False, gamma, tau)

        # in A2C the values are computed a second time here; the rollout tensors of shape
        # [num_steps, num_processes, x] are flattened to [num_steps * num_processes, x] for the forward pass
        values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
            Variable(rollouts.masks[:-1].view(-1, 1)),
            Variable(rollouts.actions.view(-1, action_shape)))

        # compute the loss
        values = values.view(num_steps, num_processes, 1)
        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        # update model
        optimizer.zero_grad()
        loss = value_loss * value_loss_coef + action_loss - dist_entropy * entropy_coef 
        loss.backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), max_grad_norm)
        optimizer.step()

        rollouts.after_update()
        if j % log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * num_processes * num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
            format(j, total_num_steps,
                    int(total_num_steps / (end - start)),
                    final_rewards.mean(),
                    final_rewards.median(),
                    final_rewards.min(),
                    final_rewards.max(), dist_entropy.data[0],
                    value_loss.data[0], action_loss.data[0]))
# todo: test save_url                
    torch.save(actor_critic, save_url)
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    if args.run_index is not None:
        load_params(args)

    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    os.environ['OMP_NUM_THREADS'] = '1'

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    num_heads = 1 if args.reward_predictor else len(args.gamma)
    assert len(envs.observation_space.shape) == 3
    actor_critic = CNNPolicy(obs_shape[0],
                             envs.action_space,
                             use_rp=args.reward_predictor,
                             num_heads=num_heads)

    assert envs.action_space.__class__.__name__ == "Discrete"
    action_shape = 1

    if args.cuda:
        actor_critic.cuda()

    if not args.reward_predictor:
        model_params = actor_critic.parameters()
    else:
        lrs = [args.lr_rp, args.lr]
        model_params = [{
            'params': model_p,
            'lr': p_lr
        } for model_p, p_lr in zip(actor_critic.param_groups, lrs)]

    optimizer = optim.RMSprop(model_params,
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)

    rollouts = RolloutStorage(args.num_steps,
                              args.num_processes,
                              obs_shape,
                              envs.action_space,
                              actor_critic.state_size,
                              gamma=args.gamma,
                              use_rp=args.reward_predictor)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs
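        # Frame-stacking sketch: with e.g. num_stack=4 and single-channel
        # frames, current_obs holds channels [f_{t-3}, f_{t-2}, f_{t-1}, f_t];
        # the slice assignment above drops the oldest shape_dim0 channels and
        # writes the newest frame into the last shape_dim0 channels.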

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            obs, raw_reward, done, info = envs.step(cpu_actions)

            if args.reward_noise > 0.0:
                stds = np.ones(raw_reward.shape) * args.reward_noise
                noise = np.random.normal(loc=0.0, scale=stds)
                reward = raw_reward + noise
            else:
                reward = raw_reward

            raw_reward = torch.from_numpy(
                np.expand_dims(np.stack(raw_reward), 1)).float()

            episode_rewards += raw_reward

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()

            if args.reward_predictor:
                p_hat = min(args.rp_burn_in, j) / args.rp_burn_in
                estimate_reward = (
                    1 - p_hat
                ) * reward + p_hat * value[:, 0].unsqueeze(-1).data.cpu()
                reward = torch.cat([reward, estimate_reward], dim=-1)
                value = value.data
            else:
                value = value.data
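
            # Burn-in sketch: p_hat ramps linearly from 0 to 1 over the first
            # rp_burn_in updates, so with e.g. rp_burn_in=100 and j=25 the
            # appended reward channel is 0.75 * reward + 0.25 * V(s); early in
            # training it is mostly the true reward, later mostly the
            # predictor head's estimate.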

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
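            # Bookkeeping sketch: masks is 0 for processes whose episode just
            # ended, so final_rewards is overwritten with that episode's full
            # return while episode_rewards resets to 0 and keeps accumulating
            # for the next episode.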

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)

            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value)

        states = Variable(rollouts.states[0].view(-1, actor_critic.state_size))
        masks = Variable(rollouts.masks[:-1].view(-1, 1))
        obs = Variable(rollouts.observations[:-1].view(-1, *obs_shape))
        actions = Variable(rollouts.actions.view(-1, action_shape))

        values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
            obs, states, masks, actions)
        returns_as_variable = Variable(rollouts.returns[:-1])

        values = values.view(returns_as_variable.size())
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)

        advantages = returns_as_variable - values

        value_loss = advantages.pow(2).sum(-1).mean()
        action_loss = -(Variable(advantages[:, :, -1].unsqueeze(-1).data) *
                        action_log_probs).mean()
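        # Multi-head sketch: returns and values can carry a trailing "head"
        # dimension (one per gamma, plus the estimated-reward channel when the
        # reward predictor is on); the value loss sums squared errors over
        # that dimension, while the policy term appears to use only the last
        # head's advantage ([:, :, -1]).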

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()

        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

        optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, 'a2c')
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, "
                "entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    final_rewards.mean(), final_rewards.median(),
                    final_rewards.min(), final_rewards.max(),
                    dist_entropy.data[0], value_loss.data[0],
                    action_loss.data[0]))
Esempio n. 14
0
def main():
    print("###############################################################")
    print("#################### VISDOOM LEARNER START ####################")
    print("###############################################################")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    global envs
    envs = VecEnv(
        [make_env(i, args.config_path) for i in range(args.num_processes)],
        logging=True,
        log_dir=args.log_dir)

    obs_shape = envs.observation_space_shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.algo == 'a2c' or args.algo == 'acktr':
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape)
    elif args.algo == 'a2t':
        source_models = []
        files = glob.glob(os.path.join(args.source_models_path, '*.pt'))
        for file in files:
            print(file, 'loading model...')
            source_models.append(torch.load(file))
        actor_critic = A2TPolicy(obs_shape[0], envs.action_space_shape,
                                 source_models)
    elif args.algo == 'resnet':
        # args.num_stack = 3
        actor_critic = ResnetPolicy(obs_shape[0], envs.action_space_shape)

    action_shape = 1

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c' or args.algo == 'resnet':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'a2t':
        a2t_params = [p for p in actor_critic.parameters() if p.requires_grad]
        optimizer = optim.RMSprop(a2t_params,
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space_shape)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space_shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            # print ('Actions:', cpu_actions, 'Rewards:', reward)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, action.data, value.data, reward,
                            masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.observations[:-1].view(
                -1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr', 'a2t', 'resnet']:
            values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False
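                # ACKTR sketch: the backward pass above only accumulates
                # K-FAC Fisher statistics from a sampled loss (policy log-prob
                # plus a value term against a noise-perturbed value target);
                # the parameter update itself still uses the A2C loss below.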

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c' or args.algo == 'resnet':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)
            elif args.algo == 'a2t':
                nn.utils.clip_grad_norm(a2t_params, args.max_grad_norm)

            optimizer.step()

        rollouts.observations[0].copy_(rollouts.observations[-1])

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            envs.log()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        -dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, 'VizDoom', args.algo)
            except IOError:
                pass
    envs.close()
    time.sleep(5)
Esempio n. 15
0
def main():
    print("###############################################################")
    print("#################### VIZDOOM LEARNER START ####################")
    print("###############################################################")

    save_path = os.path.join(args.save_dir, str(args.exp_id))
    log_path = os.path.join(args.log_dir, str(args.exp_id))
    num_updates = int(args.num_frames) // args.num_steps // args.num_processes
    reward_name = ""
    if args.roe:
        reward_name = "_event"
    scenario_name = args.config_path.split("/")[1].split(".")[0]
    print("############### " + scenario_name + " ###############")
    log_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(
        args.exp_id) + "_" + str(args.agent_id) + ".log"
    #log_event_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(args.exp_id) + "_" + str(args.agent_id) + ".eventlog"
    #log_event_reward_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(args.exp_id) + "_" + str(args.agent_id) + ".eventrewardlog"
    start_updates = 0
    start_step = 0
    best_final_rewards = -1000000.0

    os.environ['OMP_NUM_THREADS'] = '1'

    cig = "cig" in args.config_path
    global envs
    es = [
        make_env(i, args.config_path, visual=args.visual, cig=cig)
        for i in range(args.num_processes)
    ]
    envs = VecEnv([es[i] for i in range(args.num_processes)])

    obs_shape = envs.observation_space_shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.resume:
        actor_critic = torch.load(
            os.path.join(save_path, f"{args.agent_id}.pt"))
        filename = glob.glob(os.path.join(log_path, log_file_name))[0]
        with open(filename) as file:
            lines = file.readlines()
            start_updates = int(lines[-1].strip().split(",")[0])
            start_steps = int(lines[-1].strip().split(",")[1])
            num_updates += start_updates
    else:
        try:
            os.makedirs(save_path)
        except OSError:
            pass
        try:
            os.makedirs(log_path)
        except OSError:
            files = glob.glob(os.path.join(args.log_dir, log_file_name))
            for f in files:
                os.remove(f)
            #with open(log_file_name, "w") as myfile:
            #    myfile.write("")
            #files = glob.glob(os.path.join(args.log_dir, log_event_file_name))
            #for f in files:
            #    os.remove(f)
            #with open(log_event_file_name, "w") as myfile:
            #    myfile.write("")
            #files = glob.glob(os.path.join(args.log_dir, log_event_reward_file_name))
            #for f in files:
            #    os.remove(f)
            #with open(log_event_reward_file_name, "w") as myfile:
            #    myfile.write("")
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape)

    action_shape = 1

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(),
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space_shape)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space_shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    last_game_vars = []
    for i in range(args.num_processes):
        last_game_vars.append(np.zeros(args.num_events))

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    episode_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    final_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    episode_events = torch.zeros([args.num_processes, args.num_events])
    final_events = torch.zeros([args.num_processes, args.num_events])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    def mean_distance_to_nearest_neighbor(elite_events):
        d = []
        nearest = None
        for a in range(len(elite_events)):
            for b in range(len(elite_events)):
                if a != b:
                    elite_a = elite_events[a]
                    elite_b = elite_events[b]
                    dist = np.linalg.norm(elite_a - elite_b)
                    if nearest is None or dist < nearest:
                        nearest = dist
            if nearest is not None:
                d.append(nearest)
            nearest = None
        return np.mean(d)

    def distance_to_nearest_neighbor(elite_events, events):
        nearest = None
        for elite_a in elite_events:
            dist = np.linalg.norm(elite_a - events)
            if nearest is None or dist < nearest:
                nearest = dist
        return nearest
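
    # Usage sketch (hypothetical values): with elite_events =
    # [np.array([1, 0]), np.array([0, 1])],
    # distance_to_nearest_neighbor(elite_events, [0, 0]) is 1.0, and
    # mean_distance_to_nearest_neighbor(elite_events) is sqrt(2), since each
    # elite's nearest neighbor is the other one.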

    def add_to_archive(frame, episode_length):
        #print("Final rewards: ", final_rewards.numpy())
        fitness = final_rewards.numpy().mean()
        #print("raw: ", final_events.numpy())
        behavior = final_events.numpy().mean(axis=0)
        #print("Fitness:", fitness)
        #print("Behavior:", behavior)
        neighbors = event_buffer.get_neighbors(behavior, args.niche_divs,
                                               episode_length)

        add = len(neighbors) == 0
        for neighbor in neighbors:
            if fitness > neighbor.fitness:
                add = True
            else:
                add = False
                break

        if add:
            if len(neighbors) > 0:
                event_buffer.remove_elites(neighbors)
                #print(f"- Removing elites {[neighbor.elite_id for neighbor in neighbors]}")
            for neighbor in neighbors:
                try:
                    #print(f"- Deleting model {neighbor.elite_id}")
                    os.remove(
                        os.path.join(save_path, f"{neighbor.elite_id}.pt"))
                    #print("Successfully deleted model with id : ", neighbor.elite_id)
                except OSError:
                    print("Error while deleting model with id : ",
                          neighbor.elite_id)
            name = str(uuid.uuid1())
            #print("Adding elite")
            event_buffer.add_elite(name, behavior, fitness, frame,
                                   episode_length)
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model, os.path.join(save_path, f"{name}.pt"))
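
    # Archive sketch (MAP-Elites-style): a finished episode's mean event
    # counts act as its behavior and its mean return as its fitness; the
    # candidate replaces its niche neighbors only if it beats all of them, in
    # which case the old elites' saved models are deleted and the new elite is
    # saved under a fresh uuid.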

    # Create event buffer
    event_buffer = EventBufferSQLProxy(args.num_events,
                                       args.capacity,
                                       args.exp_id,
                                       args.agent_id,
                                       qd=args.qd,
                                       per_step=args.per_step)

    event_episode_rewards = []

    episode_finished = np.zeros(args.num_processes)

    start = time.time()
    for j in np.arange(start_updates, num_updates):
        for step in range(args.num_steps):

            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            obs, reward, done, info, events = envs.step(cpu_actions)
            intrinsic_reward = []

            # Fix broken rewards - upscale
            for i in range(len(reward)):
                if scenario_name in ["deathmatch", "my_way_home"]:
                    reward[i] *= 100
                if scenario_name == "deadly_corridor":
                    reward[i] = 1 if events[i][2] >= 1 else 0

            for e in events:
                if args.roe:
                    ir = event_buffer.intrinsic_reward(e)
                    if args.per_step:
                        ir = ir / 4200
                    intrinsic_reward.append(ir)
                else:
                    r = reward[len(intrinsic_reward)]
                    intrinsic_reward.append(r)
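            # ROE sketch: with --roe the training signal is the event
            # buffer's intrinsic reward (presumably higher for rarer events),
            # optionally divided by 4200 (roughly one episode's worth of
            # steps) in per-step mode; otherwise the raw extrinsic reward is
            # used unchanged.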

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            intrinsic_reward = torch.from_numpy(
                np.expand_dims(np.stack(intrinsic_reward), 1)).float()
            #events = torch.from_numpy(np.expand_dims(np.stack(events), args.num_events)).float()
            events = torch.from_numpy(events).float()
            episode_rewards += reward
            episode_intrinsic_rewards += intrinsic_reward
            episode_events += events

            # Event stats
            '''
            event_rewards = []
            for ei in range(0,args.num_events):
                ev = np.zeros(args.num_events)
                ev[ei] = 1
                er = event_buffer.intrinsic_reward(ev)
                if args.per_step:
                    er = er / 4200
                er = event_buffer.intrinsic_reward(ev)
                event_rewards.append(er)

            event_episode_rewards.append(event_rewards)
            '''

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_intrinsic_rewards *= masks
            final_events *= masks
            final_rewards += (1 - masks) * episode_rewards
            final_intrinsic_rewards += (1 - masks) * episode_intrinsic_rewards
            final_events += (1 - masks) * episode_events

            for i in range(args.num_processes):
                if done[i]:
                    #event_buffer.record_events(np.copy(final_events[i].numpy()), frame=j*args.num_steps*args.num_processes)
                    episode_length = (step +
                                      j * args.num_steps) - episode_finished[i]
                    episode_finished[i] = episode_length + episode_finished[i]
                    add_to_archive(
                        step * args.num_processes +
                        j * args.num_steps * args.num_processes,
                        episode_length)

            episode_rewards *= masks
            episode_intrinsic_rewards *= masks
            episode_events *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)

            rollouts.insert(step, current_obs, action.data, value.data,
                            intrinsic_reward, masks)

        #final_episode_reward = np.mean(event_episode_rewards, axis=0)
        #event_episode_rewards = []

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.observations[:-1].view(
                -1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.actions.view(-1, action_shape)))

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)
        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()

        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

        optimizer.step()

        rollouts.observations[0].copy_(rollouts.observations[-1])

        if j % args.log_interval == 0:

            envs.log()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            log = "Updates {}, num timesteps {}, FPS {}, mean/max reward {:.5f}/{:.5f}, mean/max intrinsic reward {:.5f}/{:.5f}"\
                .format(j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            final_rewards.mean(),
                            final_rewards.max(),
                            final_intrinsic_rewards.mean(),
                            final_intrinsic_rewards.max()
                        )

            log_to_file = "{}, {}, {:.5f}, {:.5f}, {:.5f}, {:.5f}\n" \
                .format(j, total_num_steps,
                        final_rewards.mean(),
                        final_rewards.std(),
                        final_intrinsic_rewards.mean(),
                        final_intrinsic_rewards.std())

            with open(os.path.join(log_path, log_file_name), "a") as myfile:
                myfile.write(log_to_file)

            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, f"{args.agent_id}.pt"))

            print(log)

    envs.close()
    time.sleep(5)
Esempio n. 16
0
            for i in range(args.num_processes)
        ]) for j in range(args.num_heads)
    ]

    envs_teacher = [
        SubprocVecEnv([
            make_env(args.env_name[j], args.seed, i, log_dir_teacher[j])
            for i in range(args.num_processes)
        ]) for j in range(args.num_heads)
    ]

    obs_shape = envs_teacher[0].observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs_teacher[0].observation_space.shape) == 3:
        teacher = CNNPolicy(obs_shape[0], envs_teacher[0].action_space)
        # TODO: change student
        student = CNNPolicy(obs_shape[0], envs_student_train[0].action_space)
    else:
        teacher = [
            MLPPolicy(obs_shape[0], envs_teacher[i].action_space)
            for i in range(args.num_heads)
        ]
        # TODO: change student
        student = MultiHead_MLPPolicy(obs_shape[0],
                                      envs_student_train[0].action_space,
                                      num_heads=args.num_heads)

    # load teacher model from checkpoint
    for i in range(args.num_heads):
        assert os.path.exists(args.checkpoint[i])
Esempio n. 17
0
def main():
    print("###############################################################")
    print("#################### VIZDOOM LEARNER START ####################")
    print("###############################################################")

    save_path = os.path.join(args.save_dir, "a2c")
    num_updates = int(args.num_frames) // args.num_steps // args.num_processes
    reward_name = ""
    if args.roe:
        reward_name = "_event"
    scenario_name = args.config_path.split("/")[1].split(".")[0]
    print("############### " + scenario_name + " ###############")
    log_file_name = "log/vizdoom_" + scenario_name + reward_name + "_" + str(
        args.agent_id) + ".log"
    log_event_file_name = "log/vizdoom_" + scenario_name + reward_name + "_" + str(
        args.agent_id) + ".eventlog"
    log_event_reward_file_name = "log/vizdoom_" + scenario_name + reward_name + "_" + str(
        args.agent_id) + ".eventrewardlog"
    start_updates = 0
    start_step = 0
    best_final_rewards = -1000000.0

    os.environ['OMP_NUM_THREADS'] = '1'

    # ---- DOOM Environments

    # Host:
    game = DoomGame()

    # Use CIG example config or your own.
    game.load_config("../../scenarios/cig.cfg")

    game.set_doom_map("map01")  # Limited deathmatch.
    #game.set_doom_map("map02")  # Full deathmatch.

    # Host game with options that will be used in the competition.
    game.add_game_args(
        "-host 2 "  # This machine will function as a host for a multiplayer game with this many players (including this machine). It will wait for other machines to connect using the -join parameter and then start the game when everyone is connected.
        "-deathmatch "  # Deathmatch rules are used for the game.
        "+timelimit 10.0 "  # The game (episode) will end after this many minutes have elapsed.
        "+sv_forcerespawn 1 "  # Players will respawn automatically after they die.
        "+sv_noautoaim 1 "  # Autoaim is disabled for all players.
        "+sv_respawnprotect 1 "  # Players will be invulnerable for two second after spawning.
        "+sv_spawnfarthest 1 "  # Players will be spawned as far as possible from any other players.
        "+sv_nocrouch 1 "  # Disables crouching.
        "+viz_respawn_delay 10 "  # Sets delay between respanws (in seconds).
        "+viz_nocheat 1"
    )  # Disables depth and labels buffer and the ability to use commands that could interfere with multiplayer game.

    # This can be used to host game without taking part in it (can be simply added as argument of vizdoom executable).
    game.add_game_args("viz_spectator 1")

    # During the competition, async mode will be forced for all agents.
    game.set_mode(Mode.ASYNC_PLAYER)

    # Start game
    game.init()

    # Clients:
    global envs
    es = [
        make_cig_env(i, visual=args.visual) for i in range(args.num_processes)
    ]
    envs = VecEnv([es[i] for i in range(args.num_processes)])

    obs_shape = envs.observation_space_shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.resume:
        actor_critic = torch.load(
            os.path.join(save_path, log_file_name + ".pt"))
        filename = glob.glob(os.path.join(args.log_dir, log_file_name))[0]
        #if args.roe:
        # TODO: Load event buffer
        with open(filename) as file:
            lines = file.readlines()
            start_updates = int(lines[-1].strip().split(",")[0])
            start_steps = int(lines[-1].strip().split(",")[1])
            num_updates += start_updates
    else:
        if not args.debug:
            try:
                os.makedirs(args.log_dir)
            except OSError:
                files = glob.glob(os.path.join(args.log_dir, log_file_name))
                for f in files:
                    os.remove(f)
                with open(log_file_name, "w") as myfile:
                    myfile.write("")
                files = glob.glob(
                    os.path.join(args.log_dir, log_event_file_name))
                for f in files:
                    os.remove(f)
                with open(log_event_file_name, "w") as myfile:
                    myfile.write("")
                files = glob.glob(
                    os.path.join(args.log_dir, log_event_reward_file_name))
                for f in files:
                    os.remove(f)
                with open(log_event_reward_file_name, "w") as myfile:
                    myfile.write("")
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape)

    action_shape = 1

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(),
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space_shape)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space_shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    last_game_vars = []
    for i in range(args.num_processes):
        last_game_vars.append(np.zeros(args.num_events))

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    episode_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    final_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    episode_events = torch.zeros([args.num_processes, args.num_events])
    final_events = torch.zeros([args.num_processes, args.num_events])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    # Create event buffer
    if args.qd:
        event_buffer = EventBufferSQLProxy(args.num_events, args.capacity,
                                           args.exp_id, args.agent_id)
    elif not args.resume:
        event_buffer = EventBuffer(args.num_events, args.capacity)
    else:
        event_buffer = pickle.load(
            open(log_file_name + "_event_buffer_temp.p", "rb"))

    event_episode_rewards = []

    start = time.time()
    for j in np.arange(start_updates, num_updates):
        for step in range(args.num_steps):

            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            obs, reward, done, info, events = envs.step(cpu_actions)
            intrinsic_reward = []

            # Fix broken rewards - upscale
            for i in range(len(reward)):
                if scenario_name in ["deathmatch", "my_way_home"]:
                    reward[i] *= 100
                if scenario_name == "deadly_corridor":
                    reward[i] = 1 if events[i][2] >= 1 else 0

            for e in events:
                if args.roe:
                    intrinsic_reward.append(event_buffer.intrinsic_reward(e))
                else:
                    r = reward[len(intrinsic_reward)]
                    intrinsic_reward.append(r)

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            intrinsic_reward = torch.from_numpy(
                np.expand_dims(np.stack(intrinsic_reward), 1)).float()
            #events = torch.from_numpy(np.expand_dims(np.stack(events), args.num_events)).float()
            events = torch.from_numpy(events).float()
            episode_rewards += reward
            episode_intrinsic_rewards += intrinsic_reward
            episode_events += events

            # Event stats
            event_rewards = []
            for ei in range(0, args.num_events):
                ev = np.zeros(args.num_events)
                ev[ei] = 1
                er = event_buffer.intrinsic_reward(ev)
                event_rewards.append(er)

            event_episode_rewards.append(event_rewards)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_intrinsic_rewards *= masks
            final_events *= masks
            final_rewards += (1 - masks) * episode_rewards
            final_intrinsic_rewards += (1 - masks) * episode_intrinsic_rewards
            final_events += (1 - masks) * episode_events

            for i in range(args.num_processes):
                if done[i]:
                    event_buffer.record_events(np.copy(
                        final_events[i].numpy()),
                                               frame=j * args.num_steps)

            episode_rewards *= masks
            episode_intrinsic_rewards *= masks
            episode_events *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)

            rollouts.insert(step, current_obs, action.data, value.data,
                            intrinsic_reward, masks)

        final_episode_reward = np.mean(event_episode_rewards, axis=0)
        event_episode_rewards = []

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.observations[:-1].view(
                -1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.actions.view(-1, action_shape)))

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)
        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()

        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

        optimizer.step()

        rollouts.observations[0].copy_(rollouts.observations[-1])

        if final_rewards.mean() > best_final_rewards and not args.debug:
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            best_final_rewards = final_rewards.mean()
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(
                save_model,
                os.path.join(save_path,
                             log_file_name.split(".log")[0] + ".pt"))

        if j % args.save_interval == 0 and args.save_dir != "" and not args.debug:
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, log_file_name + "_temp.pt"))
            if isinstance(event_buffer, EventBuffer):
                pickle.dump(event_buffer,
                            open(log_file_name + "_event_buffer_temp.p", "wb"))

        if j % args.log_interval == 0:

            envs.log()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            log = "Updates {}, num timesteps {}, FPS {}, mean/max reward {:.5f}/{:.5f}, mean/max intrinsic reward {:.5f}/{:.5f}"\
                .format(j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            final_rewards.mean(),
                            final_rewards.max(),
                            final_intrinsic_rewards.mean(),
                            final_intrinsic_rewards.max()
                        )
            log_to_file = "{}, {}, {:.5f}, {:.5f}, {:.5f}, {:.5f}\n" \
                .format(j, total_num_steps,
                        final_rewards.mean(),
                        final_rewards.std(),
                        final_intrinsic_rewards.mean(),
                        final_intrinsic_rewards.std())
            log_to_event_file = ','.join(
                map(str,
                    event_buffer.get_event_mean().tolist())) + "\n"
            log_to_event_reward_file = ','.join(
                map(str,
                    event_buffer.get_event_rewards().tolist())) + "\n"
            print(log)
            print(log_to_event_file)

            # Save to files
            with open(log_file_name, "a") as myfile:
                myfile.write(log_to_file)
            with open(log_event_file_name, "a") as myfile:
                myfile.write(str(total_num_steps) + "," + log_to_event_file)
            with open(log_event_reward_file_name, "a") as myfile:
                myfile.write(
                    str(total_num_steps) + "," + log_to_event_reward_file)

    envs.close()
    game.close()
    time.sleep(5)
Esempio n. 18
0
except OSError:
    files = glob.glob(os.path.join(args.log_dir, '*.monitor.json'))
    for f in files:
        os.remove(f)

os.environ['OMP_NUM_THREADS'] = '1'

envs = SubprocVecEnv([
    make_env(args.env_name, args.seed, i, args.log_dir)
    for i in range(args.num_processes)
])

obs_shape = envs.observation_space.shape
obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

actor_critic = CNNPolicy(obs_shape[0], envs.action_space)

if envs.action_space.__class__.__name__ == "Discrete":
    action_shape = 1
else:
    action_shape = envs.action_space.shape[0]

if args.cuda:
    actor_critic.cuda()

optimizer = optim.RMSprop(actor_critic.parameters(),
                          args.lr,
                          eps=args.eps,
                          alpha=args.alpha)

rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
Esempio n. 19
0
def main():
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    names = getListOfGames("train")

    envs = [make_env_train(names[i], args.seed, i, args.log_dir)
            for i in range(len(names))]

    # TODO: num_processes is overridden here to match the number of training games.
    args.num_processes = len(envs)
    # REMEMBER YOU CHANGED IT

    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    #print(obs_shape)
    obs_shape = (obs_shape[0], *obs_shape[1:])
    #print(obs_shape)

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    # Making it parallel
    actor_critic = torch.nn.parallel.DataParallel(actor_critic).module

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm)
        # Make agent DataParallel
        agent = torch.nn.parallel.DataParallel(agent).module

    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    # Make rollouts DataParallel
    rollouts = torch.nn.parallel.DataParallel(RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size)).module
    current_obs = torch.nn.parallel.DataParallel(torch.zeros(envs.nenvs, *obs_shape)).module

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        # if args.num_stack > 1:
        #     current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                    Variable(rollouts.observations[step], volatile=True),
                    Variable(rollouts.states[step], volatile=True),
                    Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic.get_value(Variable(rollouts.observations[-1], volatile=True),
                                            Variable(rollouts.states[-1], volatile=True),
                                            Variable(rollouts.masks[-1], volatile=True)).data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                            hasattr(envs, 'ob_rms') and envs.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), dist_entropy.data[0],
                       value_loss.data[0], action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
Esempio n. 20
0
def main():
    os.environ['OMP_NUM_THREADS'] = '1'
    num_updates = int(args.num_frames) // args.num_steps // args.num_processes

    logger = ModelLogger(args.log_dir, args.num_processes)

    envs = make_env_vec(args.num_processes,
                        args.env_name,
                        args.seed,
                        args.log_dir,
                        args.start_container,
                        max_steps=1200,
                        discrete_wrapper=args.discrete_actions)

    action_shape = 1 if envs.action_space.__class__.__name__ == "Discrete" else envs.action_space.shape[
        0]
    obs_shape = (envs.observation_space.shape[0] * args.num_stack,
                 *envs.observation_space.shape[1:])
    distribution = 'MixedDistribution' if args.use_mixed else 'DiagGaussian'
    actor_critic = CNNPolicy(obs_shape[0],
                             envs.action_space,
                             args.recurrent_policy,
                             distribution=distribution)
    print_model_size(actor_critic)
    print(obs_shape)
    if args.cuda: actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)
    obs = envs.reset()
    current_obs = update_current_obs(obs, current_obs,
                                     envs.observation_space.shape[0],
                                     args.num_stack)
    rollouts.observations[0].copy_(current_obs)

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        #Running an episode
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step]),
                Variable(rollouts.states[step]),
                Variable(rollouts.masks[step]))

            continous_action, discrete_action = action
            continous_action_log_prob, discrete_action_log_prob = action_log_prob

            cpu_actions = continous_action.data.squeeze(1).cpu().numpy()
            # Exploration: epsilon-greedy. TODO: better exploration policy
            #if np.random.random_sample() < args.exp_probability:
            #cpu_actions = [envs.action_space.sample() for _ in range(args.num_processes)]

            # Observation, reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)

            #ToDo better collision strategy
            '''for i, flag in enumerate(done):
                if flag == True:
                    envs.envs[i].user_tile_start = info[i]['Simulator']['tile_coords']
                    envs.envs[i].reset()
                    envs.envs[i].user_tile_start = None'''
            slack = args.reward_slack
            scaled_reward = np.clip(reward + slack,
                                    a_min=-2.0**args.reward_pow,
                                    a_max=None)
            #scaled_reward = np.clip(reward + slack, a_min = -40.0, a_max=None)
            for i in range(args.num_processes):
                if scaled_reward[i] > 0:
                    scaled_reward[i] = (1 +
                                        scaled_reward[i])**args.reward_pow - 1
            scaled_reward = torch.from_numpy(
                np.expand_dims(np.stack(scaled_reward), 1)).float()
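            # Shaping sketch: with e.g. reward_slack=0.1 and reward_pow=2, a
            # raw reward of 0.5 becomes clip(0.6, min=-4.0) and then, being
            # positive, (1 + 0.6)**2 - 1 = 1.56; positive rewards are
            # amplified polynomially while negatives are only clipped at
            # -2**reward_pow.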
            '''if step != 0:
                cur_angle = continous_action[:, 1]
                scaled_reward -= (torch.abs(prev_angle - cur_angle).view(-1).data.cpu()**args.reward_facpow)*args.reward_factor
            prev_angle = continous_action[:, 1]'''

            reward = np.clip(reward, a_min=-4.0, a_max=None) + 1.0
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            logger.update_reward(reward, masks)

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            current_obs = update_current_obs(obs, current_obs,
                                             envs.observation_space.shape[0],
                                             args.num_stack)
            rollouts.insert(step, current_obs, states.data,
                            discrete_action.data,
                            discrete_action_log_prob.data,
                            continous_action.data,
                            continous_action_log_prob.data, value.data,
                            scaled_reward, masks)

        next_value = actor_critic(Variable(rollouts.observations[-1]),
                                  Variable(rollouts.states[-1]),
                                  Variable(rollouts.masks[-1]))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        #Performing Actor Critic Updates
        if args.algo in ['a2c']:

            recurrence_steps = 1
            if args.recurrent_policy: recurrence_steps = args.num_recsteps

            indices = torch.arange(0, args.num_steps, recurrence_steps).long()
            if args.cuda: indices = indices.cuda()
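
            # Recurrent-update sketch: with e.g. num_steps=20 and
            # num_recsteps=4, indices starts as [0, 4, 8, 12, 16]; each rstep
            # below evaluates those timesteps and then shifts indices by one,
            # so four passes cover all 20 steps in strided slices rather than
            # one flat batch.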

            total_value_loss = 0
            total_action_loss = 0
            total_dist_entropy = 0
            total_loss = 0

            for rstep in range(recurrence_steps):

                values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                    Variable(rollouts.observations[:-1].index_select(
                        0, indices).view(-1, *obs_shape)),
                    Variable(rollouts.states[:-1].index_select(
                        0, indices).view(-1, actor_critic.state_size)),
                    Variable(rollouts.masks[:-1].index_select(0, indices).view(
                        -1, 1)), [
                            Variable(
                                rollouts.continuous_actions.index_select(
                                    0, indices).view(-1, action_shape)),
                            Variable(
                                rollouts.discrete_actions.index_select(
                                    0, indices).view(-1, 1))
                        ])

                continuous_dist_entropy, discrete_dist_entropy = dist_entropy
                continuous_action_log_probs, discrete_action_log_probs = action_log_probs
                values = values.view(int(args.num_steps / recurrence_steps),
                                     args.num_processes, 1)
                continuous_action_log_probs = continuous_action_log_probs.view(
                    int(args.num_steps / recurrence_steps), args.num_processes,
                    1)
                discrete_action_log_probs = discrete_action_log_probs.view(
                    int(args.num_steps / recurrence_steps), args.num_processes,
                    1)

                advantages = Variable(rollouts.returns[:-1].index_select(
                    0, indices)) - values
                value_loss = advantages.pow(2).mean()
                '''action_loss = -(Variable(advantages.data) * (0.3*continuous_action_log_probs + 0.7*discrete_action_log_probs)).mean()'''
                '''action_loss = -(Variable(advantages.data) * ((j%2)*continuous_action_log_probs 
                     + ((j+1)%2)*discrete_action_log_probs)).mean()'''

                action_loss = -(Variable(advantages.data) *
                                continuous_action_log_probs).mean()

                #loss = value_loss * args.value_loss_coef + action_loss - discrete_dist_entropy * args.entropy_coef
                loss = value_loss * args.value_loss_coef + action_loss

                total_value_loss += value_loss.data[0]
                total_action_loss += action_loss.data[0]
                #total_dist_entropy += discrete_dist_entropy.data[0]
                total_loss += loss

                indices += 1

            total_value_loss /= recurrence_steps
            total_action_loss /= recurrence_steps
            #total_dist_entropy /= recurrence_steps
            total_loss /= recurrence_steps

            optimizer.zero_grad()
            total_loss.backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()

        rollouts.after_update()

        #Saving the model
        if j % args.save_interval == 0 and args.save_dir != "":
            logger.save_model(args.save_dir, actor_critic, envs, args.algo,
                              args.env_name, args.name, args.cuda)

        #Logging the model
        if j % args.log_interval == 0:
            logger.print_log(total_value_loss, total_action_loss,
                             total_dist_entropy, args.num_processes,
                             args.num_steps, j, start)
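
A note on the reward shaping above: the scaled_reward logic shifts the raw reward by a slack term, clips it from below at -(2 ** reward_pow), and then expands positive values with a power transform. A standalone sketch of that shaping, assuming scalar hyperparameters equivalent to args.reward_slack and args.reward_pow (shape_reward is an illustrative name, not part of the example):

import numpy as np

def shape_reward(reward, reward_slack, reward_pow):
    # Shift by the slack term and clip from below at -(2 ** reward_pow).
    scaled = np.clip(reward + reward_slack, a_min=-2.0**reward_pow, a_max=None)
    # Expand positive rewards with a power transform, as in the per-process loop above.
    positive = scaled > 0
    scaled[positive] = (1.0 + scaled[positive])**reward_pow - 1.0
    return scaled

# Example: shape_reward(np.array([-5.0, 0.5]), reward_slack=0.1, reward_pow=2)
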
Example n. 21
0
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space)
    current_state = torch.zeros(args.num_processes, *obs_shape)

    def update_current_state(state):
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)

    rollouts.states[0].copy_(current_state)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    if args.algo == 'ppo':
        old_model = copy.deepcopy(actor_critic)

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action = actor_critic.act(
                Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next state
            state, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks

            update_current_state(state)
            rollouts.insert(step, current_state, action.data, value.data,
                            reward, masks)

        next_value = actor_critic(Variable(rollouts.states[-1],
                                           volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(
                -1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.states[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            old_model.load_state_dict(actor_critic.state_dict())
            if hasattr(actor_critic, 'obs_filter'):
                old_model.obs_filter = actor_critic.obs_filter

            for _ in range(args.ppo_epoch):
                sampler = BatchSampler(SubsetRandomSampler(
                    range(args.num_processes * args.num_steps)),
                                       args.batch_size * args.num_processes,
                                       drop_last=False)
                for indices in sampler:
                    indices = torch.LongTensor(indices)
                    if args.cuda:
                        indices = indices.cuda()
                    states_batch = rollouts.states[:-1].view(
                        -1, *obs_shape)[indices]
                    actions_batch = rollouts.actions.view(
                        -1, action_shape)[indices]
                    return_batch = rollouts.returns[:-1].view(-1, 1)[indices]

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                        Variable(states_batch), Variable(actions_batch))

                    _, old_action_log_probs, _ = old_model.evaluate_actions(
                        Variable(states_batch, volatile=True),
                        Variable(actions_batch, volatile=True))

                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs.data))
                    adv_targ = Variable(advantages.view(-1, 1)[indices])
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    optimizer.step()

        rollouts.states[0].copy_(rollouts.states[-1])

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            print(
                "Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, (j + 1) * args.num_processes * args.num_steps,
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        -dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

        if j % args.vis_interval == 0:
            win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo)
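
A note on update_current_state above: it implements frame stacking by shifting the stacked buffer left by one frame's worth of channels and writing the newest observation into the trailing slot. A self-contained sketch of the same idea with illustrative names (stack_frames, frame_channels):

import numpy as np
import torch

def stack_frames(current, new_obs, frame_channels, num_stack):
    # current: (num_processes, num_stack * frame_channels, H, W) stacked buffer.
    new_obs = torch.from_numpy(new_obs).float()
    if num_stack > 1:
        # Drop the oldest frame by shifting everything left one frame.
        current[:, :-frame_channels] = current[:, frame_channels:]
    # Write the newest frame into the trailing slot.
    current[:, -frame_channels:] = new_obs
    return current

# Example: a 4-frame stack of single-channel 84x84 observations for 8 processes.
# buf = torch.zeros(8, 4, 84, 84)
# buf = stack_frames(buf, np.zeros((8, 1, 84, 84), dtype=np.float32), 1, 4)
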
Example n. 22
0
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    obs_numel = reduce(operator.mul, obs_shape, 1)

    if len(obs_shape) == 3 and obs_numel > 1024:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_numel, envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
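
A note on the PPO branch above: it builds the pessimistic clipped surrogate (L^CLIP) from the probability ratio between the updated and old policies. A minimal sketch of just that objective, with tensors standing in for the mini-batch quantities (ppo_clip_loss is an illustrative name):

import torch

def ppo_clip_loss(new_log_probs, old_log_probs, advantages, clip_param):
    # Importance ratio between the updated and old policies.
    ratio = torch.exp(new_log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    # Pessimistic surrogate: take the worse of the two, negate for gradient descent.
    return -torch.min(surr1, surr2).mean()

# Example: ppo_clip_loss(torch.zeros(32, 1), torch.zeros(32, 1), torch.randn(32, 1), 0.2)
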
Example n. 23
0
def main():
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'
    #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    #os.environ['CUDA_VISIBLE_DEVICES'] = "9"
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
                for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.hid_size,
                                 args.feat_size, args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    if args.use_cell:
        hs = HistoryCell(obs_shape[0], actor_critic.feat_size, 2*actor_critic.hidden_size, 1)
        ft = FutureCell(obs_shape[0], actor_critic.feat_size, 2 * actor_critic.hidden_size, 1)
    else:
        hs = History(obs_shape[0], actor_critic.feat_size, actor_critic.hidden_size, 2, 1)
        ft = Future(obs_shape[0], actor_critic.feat_size, actor_critic.hidden_size, 2, 1)

    if args.cuda:
        actor_critic = actor_critic.cuda()
        hs = hs.cuda()
        ft = ft.cuda()
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, hs, ft, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, args.hf_loss_coef,
                         ac_lr=args.lr, hs_lr=args.lr, ft_lr=args.lr,
                                eps=args.eps,
                                max_grad_norm=args.max_grad_norm,
                                num_processes=args.num_processes,
                                num_steps=args.num_steps,
                                use_cell=args.use_cell,
                                lenhs=args.lenhs,lenft=args.lenft,
                                plan=args.plan,
                                ac_intv=args.ac_interval,
                                hs_intv=args.hs_interval,
                                ft_intv=args.ft_interval
                                )
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size,
                              feat_size=512)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)


    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    rec_x = []
    rec_y = []
    file = open('./rec/' + args.env_name + '_' + args.method_name + '.txt', 'w')

    hs_info = torch.zeros(args.num_processes, 2 * actor_critic.hidden_size).cuda()
    hs_ind = torch.IntTensor(args.num_processes, 1).zero_()

    epinfobuf = deque(maxlen=100)
    start_time = time.time()
    for j in range(num_updates):
        print('begin sample, time  {}'.format(time.strftime("%Hh %Mm %Ss",
                                                                time.gmtime(time.time() - start_time))))
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                rollouts.feat[step] = actor_critic.get_feat(rollouts.observations[step])

                if args.use_cell:
                    for i in range(args.num_processes):
                        h = torch.zeros(1, 2 * actor_critic.hid_size).cuda()
                        c = torch.zeros(1, 2 * actor_critic.hid_size).cuda()
                        start_ind = max(hs_ind[i], step + 1 - args.lenhs)
                        for ind in range(start_ind, step + 1):
                            h, c = hs(rollouts.feat[ind, i].unsqueeze(0), h, c)
                        hs_info[i, :] = h.view(1, 2 * actor_critic.hid_size)
                        del h, c
                        gc.collect()
                else:
                    for i in range(args.num_processes):
                        start_ind = max(hs_ind[i], step + 1 - args.lenhs)
                        hs_info[i, :] = hs(rollouts.feat[start_ind:step + 1, i])

                hidden_feat = actor_critic.cat(rollouts.feat[step], hs_info)
                value, action, action_log_prob, states = actor_critic.act(
                        hidden_feat,
                        rollouts.states[step])
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(cpu_actions)
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    epinfobuf.extend([maybeepinfo['r']])
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            hs_ind = ((1 - masks) * (step + 1) + masks * hs_ind.float()).int()

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, hs_ind, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)
        with torch.no_grad():
            rollouts.feat[-1] = actor_critic.get_feat(rollouts.observations[-1])
            if args.use_cell:
                for i in range(args.num_processes):
                    h = torch.zeros(1, 2 * actor_critic.hid_size).cuda()
                    c = torch.zeros(1, 2 * actor_critic.hid_size).cuda()
                    start = max(hs_ind[i], step + 1 - args.lenhs)
                    for ind in range(start, step + 1):
                        h, c = hs(rollouts.feat[ind, i].unsqueeze(0), h, c)
                    hs_info[i, :] = h.view(1, 2 * actor_critic.hid_size)
                    del h,c
            else:
                for i in range(args.num_processes):
                    start_ind = max(hs_ind[i], step + 1 - args.lenhs)
                    hs_info[i, :] = hs(rollouts.feat[start_ind:step + 1, i])
            hidden_feat = actor_critic.cat(rollouts.feat[-1], hs_info)
            next_value = actor_critic.get_value(hidden_feat).detach()
        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        rollouts.compute_ft_ind()

        print('begin update, time  {}'.format(time.strftime("%Hh %Mm %Ss",
                                     time.gmtime(time.time() - start_time))))
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        print('end update, time  {}'.format(time.strftime("%Hh %Mm %Ss",
                                                            time.gmtime(time.time() - start_time))))
        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                            hasattr(envs, 'ob_rms') and envs.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            v_mean, v_median, v_min, v_max = safe(epinfobuf)
            print("Updates {}, num timesteps {},time {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, total_num_steps,
                       time.strftime("%Hh %Mm %Ss",
                                     time.gmtime(time.time() - start_time)),
                       int(total_num_steps / (end - start_time)),
                       v_mean, v_median, v_min, v_max,
                       dist_entropy,
                       value_loss, action_loss))

            if not np.isnan(v_mean):
                rec_x.append(total_num_steps)
                rec_y.append(v_mean)
                file.write(str(total_num_steps))
                file.write(' ')
                file.writelines(str(v_mean))
                file.write('\n')

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
    plot_line(rec_x, rec_y, './imgs/' + args.env_name + '_' + args.method_name + '.png', args.method_name,
              args.env_name, args.num_frames)
    file.close()
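
A note on the done handling shared by these examples: the done flags from the vectorised environment become masks (0.0 when an episode just ended, 1.0 otherwise), which move finished episodes' accumulated rewards into final_rewards and zero the stacked observations. A standalone sketch of that bookkeeping (episode_bookkeeping is an illustrative name):

import torch

def episode_bookkeeping(reward, done, episode_rewards, final_rewards):
    # 0.0 where an episode just ended, 1.0 where it continues.
    masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
    episode_rewards += reward
    # Freeze the totals of finished episodes and reset their accumulators.
    final_rewards *= masks
    final_rewards += (1 - masks) * episode_rewards
    episode_rewards *= masks
    return masks, episode_rewards, final_rewards

# Example with two processes, the second finishing its episode this step:
# masks, ep, fin = episode_bookkeeping(torch.ones(2, 1), [False, True],
#                                      torch.zeros(2, 1), torch.zeros(2, 1))
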
Example n. 24
0
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    # T choose whether to visualize
    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])
    # T get shape of observation array of the environment
    obs_shape = envs.observation_space.shape
    # T adjusting the shape; the * unpacks the remaining (H, W) dimensions
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    #T initialize the actor critic; MLP and CNN classes imported from model.py
    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    #T - optionally restore a checkpoint and freeze all but the output layers for finetuning
    if args.finetune:
        checkpoint_path = save_path = os.path.join(args.save_dir, args.algo,
                                                   args.checkpoint)
        state_dict = torch.load(checkpoint_path)
        print("Finetuning from checkpoint: %s, at step: %d" %
              (checkpoint_path, state_dict['update']))
        actor_critic.load_state_dict(state_dict['model_state_dict'])
        keep_layers = [
            'v_fc3.weight', 'v_fc3.bias', 'a_fc2.weight', 'a_fc2.bias',
            'dist.fc_mean.weight', 'dist.fc_mean.bias', 'dist.logstd._bias'
        ]
        for name, param in actor_critic.named_parameters():
            if name not in keep_layers:
                param.requires_grad = False
        for name, param in actor_critic.named_parameters():
            print('Param name: %s, requires_grad: %d' %
                  (name, param.requires_grad))
    # T set up dimensions of the action space
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    # T all arguments imported from arguments.py
    # T enable cuda pythorch tensor support
    if args.cuda:
        actor_critic.cuda()

    # T - pull arguments and choose algorithm and optimizer
    if args.algo == 'a2c':
        optimizer = optim.RMSprop(filter(lambda p: p.requires_grad,
                                         actor_critic.parameters()),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    #TO-DO figure out how to restore optimizer parameters when freezing some weights
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space)
    # initialized to all zeros, so nothing has been observed yet
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    # T - shift the frame stack left and write the newest observation into the last slot
    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    # T - reset the environment; call function to update observation
    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    # T - initialize rewards to be zero
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    if args.algo == 'ppo':
        old_model = copy.deepcopy(actor_critic)

    start = time.time()
    # T - begin iterative loop
    for j in range(num_updates):
        # T - collect num_steps transitions from each parallel environment
        # T - this is the rollout loop where the actor/critic is queried
        for step in range(args.num_steps):
            # Sample actions
            # T - act() is defined on the policy, which ultimately subclasses torch.nn.Module
            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # T - done flags returned by step() indicate which episodes have ended

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks
            #T - now update the observation matrix
            update_current_obs(obs)
            #T - store what happened in this step
            rollouts.insert(step, current_obs, action.data, value.data, reward,
                            masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.observations[:-1].view(
                -1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            old_model.load_state_dict(actor_critic.state_dict())

            for _ in range(args.ppo_epoch):
                sampler = BatchSampler(SubsetRandomSampler(
                    range(args.num_processes * args.num_steps)),
                                       args.batch_size * args.num_processes,
                                       drop_last=False)
                for indices in sampler:
                    indices = torch.LongTensor(indices)
                    if args.cuda:
                        indices = indices.cuda()
                    observations_batch = rollouts.observations[:-1].view(
                        -1, *obs_shape)[indices]
                    actions_batch = rollouts.actions.view(
                        -1, action_shape)[indices]
                    return_batch = rollouts.returns[:-1].view(-1, 1)[indices]

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(actions_batch))

                    _, old_action_log_probs, _ = old_model.evaluate_actions(
                        Variable(observations_batch, volatile=True),
                        Variable(actions_batch, volatile=True))

                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs.data))
                    adv_targ = Variable(advantages.view(-1, 1)[indices])
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    optimizer.step()

        rollouts.observations[0].copy_(rollouts.observations[-1])

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            file_name = FILE_PREFIX + '.pt'
            #torch.save(save_model, os.path.join(save_path, file_name))
            data = {
                'update': j,
                'model_state_dict': save_model.state_dict(),
                'optim_state_dict': optimizer.state_dict()
            }
            torch.save(data, os.path.join(save_path, file_name))
        # T - write out some log information (not important for us)
        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        -dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
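
A note on rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau), which every example calls at the end of a rollout: a hedged sketch of what a return computation with that signature typically does, using the standard Generalized Advantage Estimation recursion (illustrative, not necessarily the exact RolloutStorage implementation):

import torch

def compute_returns(rewards, masks, values, next_value, use_gae, gamma, tau):
    # rewards, masks, values: (num_steps, num_processes, 1); masks[t] is 0.0 if
    # the episode ended on transition t. next_value bootstraps the final state.
    num_steps = rewards.size(0)
    returns = torch.zeros(num_steps + 1, *rewards.size()[1:])
    if use_gae:
        values = torch.cat([values, next_value.unsqueeze(0)], dim=0)
        gae = torch.zeros(*rewards.size()[1:])
        for step in reversed(range(num_steps)):
            delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
            gae = delta + gamma * tau * masks[step] * gae
            returns[step] = gae + values[step]
    else:
        returns[-1] = next_value
        for step in reversed(range(num_steps)):
            returns[step] = returns[step + 1] * gamma * masks[step] + rewards[step]
    return returns[:-1]
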
Example n. 25
0
def main():
    print("######")
    print("HELLO! Returns start with infinity values")
    print("######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.random_task:
        env_params = {
            'wt': np.round(np.random.uniform(0.5, 1.0), 2),
            'x': np.round(np.random.uniform(-0.1, 0.1), 2),
            'y': np.round(np.random.uniform(-0.1, 0.1), 2),
            'z': np.round(np.random.uniform(0.15, 0.2), 2),
        }
    else:
        env_params = {
            'wt': args.euclidean_weight,
            'x': args.goal_x,
            'y': args.goal_y,
            'z': args.goal_z,
        }
    envs = [make_env(args.env_name, args.seed, i, args.log_dir, **env_params)
            for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    envs = VecNormalize(envs, ob=False)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    actor_critic.input_norm.update(rollouts.observations[0])

    last_return = -np.inf
    best_return = -np.inf
    best_models = None

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(Variable(rollouts.observations[step], volatile=True),
                                                                      Variable(rollouts.states[step], volatile=True),
                                                                      Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks)
            actor_critic.input_norm.update(rollouts.observations[step + 1])

        next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True),
                                  Variable(rollouts.states[-1], volatile=True),
                                  Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                                                                                           Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                                                                                           Variable(rollouts.masks[:-1].view(-1, 1)),
                                                                                           Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(advantages,
                                                            args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(advantages,
                                                            args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(observations_batch),
                                                                                                   Variable(states_batch),
                                                                                                   Variable(masks_batch),
                                                                                                   Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if args.vis and j % args.vis_interval == 0:
            last_return = plot(logger, args.log_dir)

        if last_return > best_return:
            best_return = last_return
            try:
                os.makedirs(os.path.dirname(args.save_path))
            except OSError:
                pass

            info = {
                'return': best_return,
                'reward_norm': np.sqrt(envs.ret_rms.var + envs.epsilon)
            }

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            torch.save((save_model, env_params, info), args.save_path)

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, average return {:.5f}, best_return {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         last_return, best_return,
                         value_loss.data[0], action_loss.data[0]))
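
A note on checkpointing: several examples save with the pattern commented as "A really ugly way to save a model to CPU", i.e. deep-copy the policy, move the copy to the CPU, then torch.save it so the file loads on machines without CUDA. A minimal sketch of that pattern (save_checkpoint is an illustrative name):

import copy
import os
import torch

def save_checkpoint(actor_critic, save_dir, env_name, use_cuda):
    # Move a copy of the policy to the CPU before serialising so the
    # checkpoint can be loaded on machines without a GPU.
    os.makedirs(save_dir, exist_ok=True)
    save_model = actor_critic
    if use_cuda:
        save_model = copy.deepcopy(actor_critic).cpu()
    torch.save(save_model, os.path.join(save_dir, env_name + ".pt"))
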
Example n. 26
0
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack,
                 *obs_shape[1:])  # obs_shape[0] is the channel count, stacked num_stack times

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # args.num_steps is the number of environment steps collected before each update
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            # act() returns the state value, sampled action, action log-prob and RNN hidden states
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(
                step, current_obs, states.data, action.data,
                action_log_prob.data, value.data, reward, masks
            )  # the rollout stores one batch of interaction sequences, each of length args.num_steps

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))
            # values are the value estimates of the observations; states are the RNN hidden states (by pwang8)

            values = values.view(
                args.num_steps, args.num_processes,
                1)  # values are estimated current state values
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            # rollouts.returns are action values computed with the Bellman equation: reward(t) + gamma * state_value(t+1)
            advantages = Variable(
                rollouts.returns[:-1]
            ) - values  # this is the definition of the advantage (action_value - state_value)
            value_loss = advantages.pow(
                2).mean()  # values are the estimated state values V(s_t)

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            # ACKTR does not just use a different optimizer; it also adds an extra (Fisher) loss term
            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(
                    values - Variable(sample_values.data)
                ).pow(2).mean(
                )  # unclear how this differs from simply sampling random noise

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            # advantage of each action: return minus the value prediction, then normalized
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            # Unlike the a2c/acktr update above, PPO updates the parameters for multiple epochs over
            # the same rollout, so each gradient step is computed on a minibatch sampled from the
            # rollout storage.
            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))
                    # On the first epoch action_log_probs equals old_action_log_probs_batch because the
                    # network parameters have not changed yet, so the ratio is 1. In later epochs the
                    # parameters have been updated while old_action_log_probs_batch stays fixed, so the
                    # ratio drifts away from 1.
                    # action_log_probs is the log-probability of the action actually taken, i.e. a single
                    # value per sample, not the log-probs of all actions for that observation. By pwang8, Dec 31, 2017
                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)
                    # compared to a2c, the main difference in ppo is that action_loss is computed from the clipped ratio
                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
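Note: rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) above fills in the return targets. Below is a standalone sketch of the usual GAE recursion it implements when use_gae is on (assumed tensor shapes, not the repository's RolloutStorage code):

import torch

num_steps, num_processes = 5, 2
rewards = torch.randn(num_steps, num_processes, 1)
value_preds = torch.randn(num_steps + 1, num_processes, 1)  # last entry is the bootstrap value
masks = torch.ones(num_steps + 1, num_processes, 1)         # 0 where an episode ended
returns = torch.zeros(num_steps + 1, num_processes, 1)

gamma, tau = 0.99, 0.95  # tau plays the role of the GAE lambda
gae = torch.zeros(num_processes, 1)
for step in reversed(range(num_steps)):
    delta = rewards[step] + gamma * value_preds[step + 1] * masks[step + 1] - value_preds[step]
    gae = delta + gamma * tau * masks[step + 1] * gae
    returns[step] = gae + value_preds[step]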
Esempio n. 27
0
def main():
    print("#######")
    print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = []
        win_dic ={}
        for i in range(len(mt_env_id_dic_selected)):
            win += [None]
        win_afs_per_m = None
        win_afs_loss = None
        win_basic_loss = None
    
    plot_dic = {}
    envs = []
    ''' The original program ran one game per model, so Song added the loop over i:
        every environment you want to run is collected and passed to SubprocVecEnvMt.
    '''
    for i in range(len(mt_env_id_dic_selected)):
        log_dir = args.log_dir+mt_env_id_dic_selected[i]+'/'
        for j in range(args.num_processes):
            envs += [make_env(mt_env_id_dic_selected[i], args.seed, j, log_dir)]
    ''' This envs object aggregates all of the running environments '''
    envs = SubprocVecEnvMt(envs)

    num_processes_total = args.num_processes * len(mt_env_id_dic_selected)
    '''(1,128,128)'''
    obs_shape = envs.observation_space.shape
    #num_stack :number of frames to stack
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    from arguments import is_restore
    if is_restore and args.save_dir:
        load_path = os.path.join(args.save_dir, args.algo)
        actor_critic =torch.load(os.path.join(load_path, args.env_name + ".pt"))
        # print ("restored previous model!")
        # print (actor_critic.Variable)
        # print (sss)
    else:
        if len(envs.observation_space.shape) == 3:
            actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
        else:
            actor_critic = MLPPolicy(obs_shape[0], envs.action_space)
    
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)
    # args.num_steps: number of forward steps in A2C
    # rollouts aggregates states, rewards, next states, actions and so on
    rollouts = RolloutStorage(args.num_steps, num_processes_total, obs_shape, envs.action_space)
    current_state = torch.zeros(num_processes_total, *obs_shape)
    ''' not sure about it'''
    def update_current_state(state):
        shape_dim0 = envs.observation_space.shape[0]
        # print (shape_dim0)
        # print (sss)
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)

    rollouts.states[0].copy_(current_state)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes_total, 1])
    final_rewards = torch.zeros([num_processes_total, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    if args.algo == 'ppo':
        old_model = copy.deepcopy(actor_critic)

    from arguments import ewc, ewc_lambda, ewc_interval

    afs_per_m = []
    afs_offset = [0.0]*gtn_M

    afs_loss_list = []
    basic_loss_list = []
    episode_reward_rec = 0.0
    one = torch.FloatTensor([1]).cuda()
    mone = one * -1
    '''for one whole game '''
    for j in range(num_updates):
        for step in range(args.num_steps):
            if ewc == 1:
                try:
                    states_store = torch.cat([states_store, rollouts.states[step].clone()], 0)
                except Exception as e:
                    states_store = rollouts.states[step].clone()
            # Sample actions
            '''act fun refer to "observe it!"'''
            value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next state
            state, reward, done = envs.step(cpu_actions)
            '''record rewards over the last 100 episodes'''
            episode_reward_rec += reward
            episode_reward_rec = rec_last_100_epi_reward(episode_reward_rec,done)
            
            
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            '''reward has shape [num_processes_total, 1], not batch size'''
            # print ((reward).size())
            # print (done)
            # print (sss)
            episode_rewards += reward
            ################
            # rec_last_100_epi_reward(reward,done)
            
            # episode_reward_ppo += reward[0]
            # If done then clean the history of observations. final_rewards is used for reporting after each full rollout of num_steps
            
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks

            update_current_state(state)
            rollouts.insert(step, current_state, action.data, value.data, reward, masks)

        next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            # reset gradient
            optimizer.zero_grad()

            # forward
            values, action_log_probs, dist_entropy, conv_list = actor_critic.evaluate_actions(Variable(rollouts.states[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape)))
            # pre-process
            values = values.view(args.num_steps, num_processes_total, 1)
            action_log_probs = action_log_probs.view(args.num_steps, num_processes_total, 1)

            # compute afs loss
            afs_per_m_temp, afs_loss = actor_critic.get_afs_per_m(
                action_log_probs=action_log_probs,
                conv_list=conv_list,
            )
            if len(afs_per_m_temp)>0:
                afs_per_m += [afs_per_m_temp]

            if (afs_loss is not None) and (afs_loss.data.cpu().numpy()[0]!=0.0):
                afs_loss.backward(mone, retain_graph=True)
                afs_loss_list += [afs_loss.data.cpu().numpy()[0]]

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) * action_log_probs).mean()
            final_loss_basic = value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef

            ewc_loss = None
            if j != 0:
                if ewc == 1:
                    ewc_loss = actor_critic.get_ewc_loss(lam=ewc_lambda)
            
            if ewc_loss is None:
                final_loss = final_loss_basic
            else:
                final_loss = final_loss_basic + ewc_loss
            # print (final_loss_basic.data.cpu().numpy()[0])
            # final_loss_basic
            basic_loss_list += [final_loss_basic.data.cpu().numpy()[0]]
            final_loss.backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()

        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            old_model.load_state_dict(actor_critic.state_dict())
            if hasattr(actor_critic, 'obs_filter'):
                old_model.obs_filter = actor_critic.obs_filter

            for _ in range(args.ppo_epoch):
                sampler = BatchSampler(SubsetRandomSampler(range(num_processes_total * args.num_steps)), args.batch_size * num_processes_total, drop_last=False)
                for indices in sampler:
                    indices = torch.LongTensor(indices)
                    if args.cuda:
                        indices = indices.cuda()
                    states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices]
                    actions_batch = rollouts.actions.view(-1, action_shape)[indices]
                    return_batch = rollouts.returns[:-1].view(-1, 1)[indices]

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, conv_list = actor_critic.evaluate_actions(Variable(states_batch), Variable(actions_batch))

                    _, old_action_log_probs, _, old_conv_list= old_model.evaluate_actions(Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True))

                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data))
                    adv_targ = Variable(advantages.view(-1, 1)[indices])
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    final_loss_basic = (value_loss + action_loss - dist_entropy * args.entropy_coef)
                    
                    basic_loss_list += [final_loss_basic.data.cpu().numpy()[0]]
                    final_loss_basic.backward()
                    optimizer.step()

        rollouts.states[0].copy_(rollouts.states[-1])

        # if j % int(num_updates/2-10) == 0 and args.save_dir != "":
        if j % args.save_interval == 0 and args.save_dir != "":
         
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
            import pickle
            with open(os.path.join(save_path, args.env_name + "_last_100_reward"), "wb") as f:
                pickle.dump(reward_dict, f)



        if j % args.log_interval == 0:
            print("Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, (j + 1) * args.num_processes * args.num_steps,
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), -dist_entropy.data[0],
                       value_loss.data[0], action_loss.data[0]))

            try:
                print("ewc loss {:.5f}".
                format(ewc_loss.data.cpu().numpy()[0]))
            except Exception as e:
                pass
            

        if j > 5 and j % args.vis_interval == 0 and args.vis:
            ''' load from the folder'''
            for ii in range(len(mt_env_id_dic_selected)):
                log_dir = args.log_dir+mt_env_id_dic_selected[ii]+'/'
                win[ii] = visdom_plot(viz, win[ii], log_dir, mt_env_id_dic_selected[ii], args.algo)

            plot_dic = reward_dict
            for plot_name in plot_dic.keys():
                # if plot_name not in win_dic:
                # win_dic[plot_name] = None
                if plot_name in win_dic.keys():
                    if len(plot_dic[plot_name]) > 0:
                        win_dic[plot_name] = viz.line(
                            torch.from_numpy(np.asarray(plot_dic[plot_name])), 
                            win=win_dic[plot_name],
                            opts=dict(title=break_line_html(exp+'>>'+plot_name))
                        )
                    

                else:
                    win_dic[plot_name] = None
            
            if len(afs_per_m)>0:
                win_afs_per_m = viz.line(
                    torch.from_numpy(np.asarray(afs_per_m)), 
                    win=win_afs_per_m,
                    opts=dict(title=title_html+'>>afs')
                )

            # print (basic_loss_list)
            '''a2c: len(basic_loss_list) is vis_interval+1 because j starts from 0
               ppo: len(basic_loss_list) is (vis_interval+1) * ppo_epoch * len(BatchSampler)
            '''
            
            # print (len(basic_loss_list))
            # print (ss)
            win_basic_loss = viz.line(
                torch.from_numpy(np.asarray(basic_loss_list)), 
                win=win_basic_loss,
                opts=dict(title=title_html+'>>basic_loss')
            )

            if len(afs_loss_list) > 0:
                win_afs_loss = viz.line(
                    torch.from_numpy(np.asarray(afs_loss_list)), 
                    win=win_afs_loss,
                    opts=dict(title=title_html+'>>afs_loss')
                )

        from arguments import parameter_noise, parameter_noise_interval
        if parameter_noise == 1:
            if j % parameter_noise_interval == 0:
                actor_critic.parameter_noise()

        if ewc == 1:
            if j % ewc_interval == 0 or j==0:
                actor_critic.compute_fisher(states_store)
                states_store = None
                actor_critic.star()
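Note: the PPO branch above draws minibatches with BatchSampler(SubsetRandomSampler(...)). A minimal standalone sketch of that sampling pattern (toy sizes, with a made-up tensor standing in for the rollout storage):

import torch
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler

num_samples, mini_batch_size = 32, 8
flat_returns = torch.randn(num_samples, 1)   # stand-in for rollouts.returns[:-1].view(-1, 1)

sampler = BatchSampler(SubsetRandomSampler(range(num_samples)),
                       mini_batch_size, drop_last=False)
for indices in sampler:
    indices = torch.LongTensor(indices)
    return_batch = flat_returns[indices]     # one shuffled minibatch per iteration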
Esempio n. 28
0
def main():
    print("###############################################################")
    print("#################### VIZDOOM LEARNER START ####################")
    print("###############################################################")

    save_path = os.path.join(args.save_dir, "a2c")
    num_updates = int(args.num_frames) // args.num_steps // args.num_processes
    reward_name = ""
    if args.roe:
        reward_name = "_event"
    scenario_name = args.config_path.split("/")[1].split(".")[0]
    print("############### " + scenario_name + " ###############")
    log_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(
        args.agent_id) + ".log"
    log_event_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(
        args.agent_id) + ".eventlog"
    log_event_reward_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(
        args.agent_id) + ".eventrewardlog"
    start_updates = 0
    start_step = 0
    best_final_rewards = -1000000.0

    os.environ['OMP_NUM_THREADS'] = '1'

    global envs
    es = [
        make_env(i, args.config_path, visual=args.visual, bots=args.bots)
        for i in range(args.num_processes)
    ]
    envs = VecEnv([es[i] for i in range(args.num_processes)])

    obs_shape = envs.observation_space_shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.resume:
        actor_critic = torch.load(
            os.path.join(save_path, log_file_name + ".pt"))
        filename = glob.glob(os.path.join(args.log_dir, log_file_name))[0]
        if args.roe:
            # TODO: Load event buffer
            pass
        with open(filename) as file:
            lines = file.readlines()
            start_updates = int(lines[-1].strip().split(",")[0])
            start_steps = int(lines[-1].strip().split(",")[1])
            num_updates += start_updates
    else:
        if not args.debug:
            try:
                os.makedirs(args.log_dir)
            except OSError:
                files = glob.glob(os.path.join(args.log_dir, log_file_name))
                for f in files:
                    os.remove(f)
                with open(log_file_name, "w") as myfile:
                    myfile.write("")
                files = glob.glob(
                    os.path.join(args.log_dir, log_event_file_name))
                for f in files:
                    os.remove(f)
                with open(log_event_file_name, "w") as myfile:
                    myfile.write("")
                files = glob.glob(
                    os.path.join(args.log_dir, log_event_reward_file_name))
                for f in files:
                    os.remove(f)
                with open(log_event_reward_file_name, "w") as myfile:
                    myfile.write("")
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape)

    action_shape = 1

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(),
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space_shape)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space_shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    last_game_vars = []
    for i in range(args.num_processes):
        last_game_vars.append(np.zeros(args.num_events))

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    episode_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    final_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    episode_events = torch.zeros([args.num_processes, args.num_events])
    final_events = torch.zeros([args.num_processes, args.num_events])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    # Create event buffer
    if args.qd:
        event_buffer = EventBufferSQLProxy(args.num_events, args.capacity,
                                           args.exp_id, args.agent_id)
    elif not args.resume:
        event_buffer = EventBuffer(args.num_events, args.capacity)
    else:
        event_buffer = pickle.load(
            open(log_file_name + "_event_buffer_temp.p", "rb"))

    event_episode_rewards = []

    start = time.time()
    for j in np.arange(start_updates, num_updates):
        for step in range(args.num_steps):

            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            obs, reward, done, info, events = envs.step(cpu_actions)
            intrinsic_reward = []

            # Fix broken rewards - upscale
            for i in range(len(reward)):
                if scenario_name in ["deathmatch", "my_way_home"]:
                    reward[i] *= 100
                if scenario_name == "deadly_corridor":
                    reward[i] = 1 if events[i][2] >= 1 else 0

            for e in events:
                if args.roe:
                    intrinsic_reward.append(event_buffer.intrinsic_reward(e))
                else:
                    r = reward[len(intrinsic_reward)]
                    intrinsic_reward.append(r)

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            intrinsic_reward = torch.from_numpy(
                np.expand_dims(np.stack(intrinsic_reward), 1)).float()
            #events = torch.from_numpy(np.expand_dims(np.stack(events), args.num_events)).float()
            events = torch.from_numpy(events).float()
            episode_rewards += reward
            episode_intrinsic_rewards += intrinsic_reward
            episode_events += events

            # Event stats
            event_rewards = []
            for ei in range(0, args.num_events):
                ev = np.zeros(args.num_events)
                ev[ei] = 1
                er = event_buffer.intrinsic_reward(ev)
                event_rewards.append(er)

            event_episode_rewards.append(event_rewards)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_intrinsic_rewards *= masks
            final_events *= masks
            final_rewards += (1 - masks) * episode_rewards
            final_intrinsic_rewards += (1 - masks) * episode_intrinsic_rewards
            final_events += (1 - masks) * episode_events

            for i in range(args.num_processes):
                if done[i]:
                    event_buffer.record_events(np.copy(
                        final_events[i].numpy()),
                                               frame=j * args.num_steps)

            episode_rewards *= masks
            episode_intrinsic_rewards *= masks
            episode_events *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)

            rollouts.insert(step, current_obs, action.data, value.data,
                            intrinsic_reward, masks)

        final_episode_reward = np.mean(event_episode_rewards, axis=0)
        event_episode_rewards = []

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.observations[:-1].view(
                -1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.actions.view(-1, action_shape)))

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)
        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()

        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

        optimizer.step()

        rollouts.observations[0].copy_(rollouts.observations[-1])

        if final_rewards.mean() > best_final_rewards and not args.debug:
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            best_final_rewards = final_rewards.mean()
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(
                save_model,
                os.path.join(save_path,
                             log_file_name.split(".log")[0] + ".pt"))

        if j % args.save_interval == 0 and args.save_dir != "" and not args.debug:
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, log_file_name + "_temp.pt"))
            if isinstance(event_buffer, EventBuffer):
                pickle.dump(event_buffer,
                            open(log_file_name + "_event_buffer_temp.p", "wb"))

        if j % args.log_interval == 0:

            envs.log()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            log = "Updates {}, num timesteps {}, FPS {}, mean/max reward {:.5f}/{:.5f}, mean/max intrinsic reward {:.5f}/{:.5f}"\
                .format(j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            final_rewards.mean(),
                            final_rewards.max(),
                            final_intrinsic_rewards.mean(),
                            final_intrinsic_rewards.max()
                        )
            log_to_file = "{}, {}, {:.5f}, {:.5f}, {:.5f}, {:.5f}\n" \
                .format(j, total_num_steps,
                        final_rewards.mean(),
                        final_rewards.std(),
                        final_intrinsic_rewards.mean(),
                        final_intrinsic_rewards.std())
            log_to_event_file = ','.join(
                map(str,
                    event_buffer.get_event_mean().tolist())) + "\n"
            log_to_event_reward_file = ','.join(
                map(str,
                    event_buffer.get_event_rewards().tolist())) + "\n"
            print(log)
            print(log_to_event_file)

            # Save to files
            with open(log_file_name, "a") as myfile:
                myfile.write(log_to_file)
            with open(log_event_file_name, "a") as myfile:
                myfile.write(str(total_num_steps) + "," + log_to_event_file)
            with open(log_event_reward_file_name, "a") as myfile:
                myfile.write(
                    str(total_num_steps) + "," + log_to_event_reward_file)

    envs.close()
    time.sleep(5)
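Note: the episode bookkeeping with masks, episode_rewards and final_rewards recurs in every example above. Below is a standalone sketch of that pattern (toy inputs and a hypothetical record_step helper, not code from the example):

import torch

num_processes = 4
episode_rewards = torch.zeros(num_processes, 1)  # running return of the episode in progress
final_rewards = torch.zeros(num_processes, 1)    # return of the last finished episode

def record_step(reward, done, episode_rewards, final_rewards):
    reward = torch.FloatTensor(reward).view(-1, 1)
    masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done])
    episode_rewards += reward
    final_rewards *= masks                          # keep only still-running episodes
    final_rewards += (1 - masks) * episode_rewards  # copy returns of episodes that just finished
    episode_rewards *= masks                        # reset finished episodes to zero
    return masks

masks = record_step([1.0, 0.0, 0.5, 2.0], [False, True, False, False],
                    episode_rewards, final_rewards)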
Esempio n. 29
0
def main():
    print("#######")
    print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'





    print (args.cuda)
    print (args.num_steps)
    print (args.num_processes)
    print (args.lr)
    print (args.eps)
    print (args.alpha)
    print (args.use_gae)
    print (args.gamma)
    print (args.tau)
    print (args.value_loss_coef)
    print (args.entropy_coef)





    # Create environment
    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    # action_shape = action_shape


    # shape_dim0 = envs.observation_space.shape[0]

    # if args.cuda:
    #     dtype = torch.cuda.FloatTensor
    # else:
    #     dtype = torch.FloatTensor

    hparams = {'cuda':args.cuda,
                'num_steps':args.num_steps,
                'num_processes':args.num_processes, 
                'obs_shape':obs_shape,
                'lr':args.lr,
                'eps':args.eps, 
                'alpha':args.alpha,
                'use_gae':args.use_gae, 
                'gamma':args.gamma, 
                'tau':args.tau,
                'value_loss_coef':args.value_loss_coef, 
                'entropy_coef':args.entropy_coef}




    # Create agent
    # agent = a2c(envs, hparams)


    # rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space)
    #it has a self.state that is [steps, processes, obs]
    #steps is used to compute expected reward

    if args.cuda:
        actor_critic.cuda()
        # rollouts.cuda()
    optimizer = optim.RMSprop(actor_critic.parameters(), hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])







    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space)



    # Init state

    current_state = torch.zeros(args.num_processes, *obs_shape)#.type(dtype)
    def update_current_state(state):#, shape_dim0):
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state
        # return current_state


    state = envs.reset()

    update_current_state(state)#, shape_dim0) 
    # agent.insert_first_state(current_state)
    rollouts.states[0].copy_(current_state)
        #set the first state to current state


    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()#type(dtype)
        # if args.cuda:
        rollouts.cuda()
    #Begin training
    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):

            # Act
            # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next state
            state, reward, done, info = envs.step(cpu_actions) # state:[nProcesss, ndims, height, width]

            # Record rewards
            # reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward
            # If done then clean the history of observations.
            # these final rewards are only used for printing; the mask is also used in the storage:
            # it just clears the envs that finished and resets their episode_reward
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #if an env is done
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            if args.cuda:
                masks = masks.cuda()
            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks
            # return reward, masks, final_rewards, episode_rewards, current_state




            # Update state
            update_current_state(state)#, shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks)
            rollouts.insert(step, current_state, action.data, value.data, reward, masks)



        #Optimize agent
        # agent.update()
        next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data
        # use last state to make prediction of next value



        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape))
        #not sure what this is




        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        # this computes the return R_t = r_t + gamma * r_{t+1} + ... + gamma^k * V(s_{t+k}) for each step



        values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                                                    Variable(rollouts.states[:-1].view(-1, *obs_shape)), 
                                                    Variable(rollouts.actions.view(-1, action_shape)))
        # I think this action log prob could have been computed and stored earlier,
        # and didn't we already store the value prediction?

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

        optimizer.step()




        rollouts.states[0].copy_(rollouts.states[-1])
        # the first state is now the last state of the previous rollout









        # #Save model
        # if j % args.save_interval == 0 and args.save_dir != "":
        #     save_path = os.path.join(args.save_dir, args.algo)
        #     try:
        #         os.makedirs(save_path)
        #     except OSError:
        #         pass

        #     # A really ugly way to save a model to CPU
        #     save_model = actor_critic
        #     if args.cuda:
        #         save_model = copy.deepcopy(actor_critic).cpu()
        #     torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        #Print updates
        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            # print("Updates {}, n_timesteps {}, FPS {}, mean/median R {:.1f}/{:.1f}, min/max R {:.1f}/{:.1f}, T:{:.4f}".#, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
            #     format(j, total_num_steps,
            #            int(total_num_steps / (end - start)),
            #            final_rewards.mean(),
            #            final_rewards.median(),
            #            final_rewards.min(),
            #            final_rewards.max(),
            #            end - start))#, -dist_entropy.data[0],
            #            # value_loss.data[0], action_loss.data[0]))

            # print("Upts {}, n_timesteps {}, min/med/mean/max {:.1f}/{:.1f}/{:.1f}/{:.1f}, FPS {}, T:{:.1f}".
            #     format(j, total_num_steps,
            #            final_rewards.min(),
            #            final_rewards.median(),
            #            final_rewards.mean(),
            #            final_rewards.max(),
            #            int(total_num_steps / (end - start)),
            #            end - start))

            if j % (args.log_interval*30) == 0:
                print("Upts, n_timesteps, min/med/mean/max, FPS, Time")

            print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}".
                    format(j, total_num_steps,
                           final_rewards.min(),
                           final_rewards.median(),
                           final_rewards.mean(),
                           final_rewards.max(),
                           int(total_num_steps / (end - start)),
                           end - start))
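Note: the A2C update above combines three terms: a critic (value) loss, a policy-gradient loss on detached advantages, and an entropy bonus. A standalone sketch of that combination (dummy tensors; the coefficient values are just common defaults, not taken from this example):

import torch

def a2c_loss(values, returns, action_log_probs, dist_entropy,
             value_loss_coef=0.5, entropy_coef=0.01):
    advantages = returns - values
    value_loss = advantages.pow(2).mean()
    # detach the advantages so the policy-gradient term does not backprop into the critic
    action_loss = -(advantages.detach() * action_log_probs).mean()
    return value_loss * value_loss_coef + action_loss - dist_entropy * entropy_coef

values, returns = torch.randn(5, 4, 1), torch.randn(5, 4, 1)
log_probs, entropy = torch.randn(5, 4, 1), torch.tensor(1.0)
loss = a2c_loss(values, returns, log_probs, entropy)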
Esempio n. 30
0
def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir,
                 args.start_container) for i in range(args.num_processes)
    ]

    test_envs = [
        make_env(args.env_name, args.seed, i, args.log_dir,
                 args.start_container) for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
        test_envs = SubprocVecEnv(test_envs)
    else:
        envs = DummyVecEnv(envs)
        test_envs = DummyVecEnv(test_envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.saved_encoder_model:
        obs_shape = (args.num_stack, args.latent_space_size)

    obs_numel = reduce(operator.mul, obs_shape, 1)

    if len(obs_shape) == 3 and obs_numel > 1024:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_numel, envs.action_space)

    modelSize = 0
    for p in actor_critic.parameters():
        pSize = reduce(operator.mul, p.size(), 1)
        modelSize += pSize
    print(str(actor_critic))
    print('Total model size: %d' % modelSize)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.resume_experiment:
        print("\n############## Loading saved model ##############\n")
        actor_critic, ob_rms = torch.load(
            os.path.join(save_path, args.env_name + args.save_tag + ".pt"))
        tr.load(os.path.join(log_path, args.env_name + args.save_tag + ".p"))

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    print(obs_shape)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    rollouts_test = RolloutStorage(args.num_steps_test, args.num_processes,
                                   obs_shape, envs.action_space,
                                   actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)
    current_obs_test = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs, test=False):
        shape_dim0 = envs.observation_space.shape[0]
        if args.saved_encoder_model:
            shape_dim0 = 1
            obs, _ = vae.encode(Variable(torch.cuda.FloatTensor(obs)))
            obs = obs.data.cpu().numpy()
        obs = torch.from_numpy(obs).float()
        if not test:
            if args.num_stack > 1:
                current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
            current_obs[:, -shape_dim0:] = obs
        else:
            if args.num_stack > 1:
                current_obs_test[:, :-shape_dim0] = current_obs_test[:, shape_dim0:]
            current_obs_test[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    reward_avg = 0

    if args.cuda:
        current_obs = current_obs.cuda()
        current_obs_test = current_obs_test.cuda()
        rollouts.cuda()
        rollouts_test.cuda()

    start = time.time()

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observation, reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)

            # Maxime: clip the reward to be non-negative and rescale it (/400) for more reliable training
            # This code deals poorly with large reward values
            reward = np.clip(reward, a_min=0, a_max=None) / 400

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            tr.episodes_done += args.num_processes - masks.sum()

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        tr.iterations_done += 1

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()

        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

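            # Bundle the env's observation-normalization stats (ob_rms), if
            # present, with the model so they can be restored for evaluation.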
            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(
                save_model,
                os.path.join(save_path, args.env_name + args.save_tag + ".pt"))

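            # Periodic evaluation: roll out args.num_tests episodes on the test
            # envs and log the average reward and episode length.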
            total_test_reward_list = []
            step_test_list = []

            for _ in range(args.num_tests):
                test_obs = test_envs.reset()
                update_current_obs(test_obs, test=True)
                rollouts_test.observations[0].copy_(current_obs_test)
                step_test = 0
                total_test_reward = 0

                while step_test < args.num_steps_test:
                    value_test, action_test, action_log_prob_test, states_test = actor_critic.act(
                        Variable(rollouts_test.observations[step_test],
                                 volatile=True),
                        Variable(rollouts_test.states[step_test],
                                 volatile=True),
                        Variable(rollouts_test.masks[step_test],
                                 volatile=True))
                    cpu_actions_test = action_test.data.squeeze(
                        1).cpu().numpy()

                    # Observation, reward and next obs
                    obs_test, reward_test, done_test, info_test = test_envs.step(
                        cpu_actions_test)

                    # masks here don't really matter for evaluation, but keep them for consistency
                    masks_test = torch.FloatTensor(
                        [[0.0] if done_test_ else [1.0]
                         for done_test_ in done_test])

                    # Maxime: clip negative rewards to zero and scale by 1/400 for
                    # more reliable training; very large rewards are still handled poorly
                    reward_test = np.clip(reward_test, a_min=0,
                                          a_max=None) / 400

                    total_test_reward += reward_test[0]
                    reward_test = torch.from_numpy(
                        np.expand_dims(np.stack(reward_test), 1)).float()

                    # pass test=True so the test observation buffer (current_obs_test) is updated
                    update_current_obs(obs_test, test=True)
                    rollouts_test.insert(step_test, current_obs_test, states_test.data,
                                         action_test.data, action_log_prob_test.data,
                                         value_test.data, reward_test, masks_test)

                    step_test += 1

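                    # Note: this break (and reward_test[0] above) assumes a single
                    # test environment; done_test holds one flag per test env.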
                    if done_test:
                        break

                # rollouts_test.reset()  # if re-enabled, remember to reinitialise with .cuda()
                total_test_reward_list.append(total_test_reward)
                step_test_list.append(step_test)

            append_to(tr.test_reward, tr,
                      sum(total_test_reward_list) / args.num_tests)
            append_to(tr.test_episode_len, tr,
                      sum(step_test_list) / args.num_tests)

            logger.log_scalar_rl(
                "test_reward", tr.test_reward[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "test_episode_len", tr.test_episode_len[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])

            # Saving all the MyContainer variables
            tr.save(
                os.path.join(log_path, args.env_name + args.save_tag + ".p"))

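        # Console/scalar logging: reward_avg is an exponential moving average
        # of the mean final reward across processes.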
        if j % args.log_interval == 0:
            reward_avg = 0.99 * reward_avg + 0.01 * final_rewards.mean()
            end = time.time()
            tr.global_steps_done = (j +
                                    1) * args.num_processes * args.num_steps

            print(
                "Updates {}, num timesteps {}, FPS {}, running avg reward {:.3f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, tr.global_steps_done,
                        int(tr.global_steps_done / (end - start)), reward_avg,
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

            append_to(tr.pg_loss, tr, action_loss.data[0])
            append_to(tr.val_loss, tr, value_loss.data[0])
            append_to(tr.entropy_loss, tr, dist_entropy.data[0])
            append_to(tr.train_reward_avg, tr, reward_avg)

            logger.log_scalar_rl(
                "train_pg_loss", tr.pg_loss[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "train_val_loss", tr.val_loss[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "train_entropy_loss", tr.entropy_loss[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "train_reward_avg", tr.train_reward_avg[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            """
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(
                    j,
                    total_num_steps,
                    int(total_num_steps / (end - start)),
                    final_rewards.mean(),
                    final_rewards.median(),
                    final_rewards.min(),
                    final_rewards.max(), dist_entropy.data[0],
                    value_loss.data[0], action_loss.data[0])
                )
            """

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass