Example 1
    def __init__(self, hparams):

        self.obs_shape = hparams['obs_shape']
        self.n_actions = hparams['n_actions']

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']

        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']

        self.actor_critic = CNNPolicy(self.obs_shape[0],
                                      self.n_actions)  #.cuda()

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.obs_shape, self.n_actions)

        # if self.cuda:
        self.actor_critic.cuda()
        self.rollouts.cuda()

        self.optimizer = optim.Adam(params=self.actor_critic.parameters(),
                                    lr=hparams['lr'],
                                    eps=hparams['eps'])

        self.hparams = hparams
Example 2
    def __init__(self, hparams):

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']

        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']

        self.next_state_pred_ = hparams['next_state_pred_']

        # Policy and Value network
        if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
            self.actor_critic = CNNPolicy_trajectory_action_mask(
                self.obs_shape[0], hparams['action_space'])
        else:
            self.actor_critic = CNNPolicy(self.obs_shape[0],
                                          hparams['action_space'])

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.obs_shape, hparams['action_space'])

        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()

        #Optimizer
        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(
                params=self.actor_critic.parameters(),
                lr=hparams['lr'],
                eps=hparams['eps'],
                alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=self.actor_critic.parameters(),
                                        lr=hparams['lr'],
                                        eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(params=self.actor_critic.parameters(),
                                       lr=hparams['lr'],
                                       momentum=hparams['mom'])
        else:
            print('no opt specified')

        self.action_shape = 1

        if hparams['gif_'] or hparams['ls_'] or hparams['vae_'] or hparams[
                'grad_var_']:
            self.rollouts_list = RolloutStorage_list()

        self.hparams = hparams
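
The constructor above is driven entirely by a flat hparams dict. Below is a minimal sketch of what that dict could look like: only the keys are taken from the code above; the concrete values and the Breakout-style observation shape are illustrative assumptions ('alpha' is read only for opt='rms', 'mom' only for opt='sgd', and 'traj_action_mask' is optional).

hparams = {
    'obs_shape': (2, 84, 84),       # stacked frames x height x width (assumed)
    'action_space': 4,              # int or gym space, depending on the CNNPolicy variant
    'use_gae': False,
    'gamma': 0.99,
    'tau': 0.95,
    'num_steps': 5,
    'num_processes': 16,
    'value_loss_coef': 0.5,
    'entropy_coef': 0.01,
    'cuda': True,
    'opt': 'adam',                  # 'rms' also needs 'alpha', 'sgd' needs 'mom'
    'lr': 7e-4,
    'eps': 1e-5,
    'grad_clip': 0.5,
    'next_state_pred_': False,
    'gif_': False, 'ls_': False, 'vae_': False, 'grad_var_': False,
}
agent = a2c(hparams)
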
Example 3
    def __init__(self, hparams):

        self.obs_shape = hparams['obs_shape']
        self.n_actions = hparams['n_actions']

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']

        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']





        self.actor_critic = CNNPolicy(self.obs_shape[0], self.n_actions) #.cuda()

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, self.n_actions)

        # if self.cuda:
        self.actor_critic.cuda()
        self.rollouts.cuda()

        self.optimizer = optim.Adam(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])

        self.hparams = hparams






print ('\nInit Policies')
# agent = a2c(model_dict)
# param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'
# load_policy = 1
policies = []
policies_dir = home+'/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'
for f in os.listdir(policies_dir):
    print (f)
    policy = CNNPolicy(2, 4) #.cuda()
    param_file = home+'/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'+f+'/model_params3/model_params9999360.pt'    
    param_dict = torch.load(param_file)

    policy.load_state_dict(param_dict)
    # policy = torch.load(param_file).cuda()
    print ('loaded params', param_file)
    policy.cuda()

    policies.append(policy)

    #just one for now
#     break

# policy = policies[0]
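
A loaded policy can be queried directly through its act() method, which (as in the a2c.act() wrappers in the later examples) returns a value estimate, a sampled action, the action log-probability, and the distribution entropy. A minimal sketch, assuming the 2-frame 84x84 observations used throughout these snippets and the old Variable-based PyTorch API that the rest of this code targets:

import torch
from torch.autograd import Variable

# Dummy batch of one stacked observation: [batch, frames, 84, 84]; the shape
# is an assumption based on CNNPolicy(2, 4) above.
state = Variable(torch.zeros(1, 2, 84, 84).cuda())

value, action, action_log_probs, dist_entropy = policies[0].act(state)
print(value.data, action.data)
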
Example 5
    def __init__(self, hparams):

        self.obs_shape = hparams['obs_shape']
        self.n_actions = hparams['n_actions']

        self.actor_critic = CNNPolicy(self.obs_shape[0], self.n_actions).cuda()
Example 6
class a2c(object):
    def __init__(self, hparams):

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']

        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']

        self.next_state_pred_ = hparams['next_state_pred_']

        # Policy and Value network
        if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
            self.actor_critic = CNNPolicy_trajectory_action_mask(
                self.obs_shape[0], hparams['action_space'])
        else:
            self.actor_critic = CNNPolicy(self.obs_shape[0],
                                          hparams['action_space'])

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.obs_shape, hparams['action_space'])

        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()

        #Optimizer
        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(
                params=self.actor_critic.parameters(),
                lr=hparams['lr'],
                eps=hparams['eps'],
                alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=self.actor_critic.parameters(),
                                        lr=hparams['lr'],
                                        eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(params=self.actor_critic.parameters(),
                                       lr=hparams['lr'],
                                       momentum=hparams['mom'])
        else:
            print('no opt specified')

        self.action_shape = 1

        if hparams['gif_'] or hparams['ls_'] or hparams['vae_'] or hparams[
                'grad_var_']:
            self.rollouts_list = RolloutStorage_list()

        self.hparams = hparams

    def act(self, current_state):

        # value, action = self.actor_critic.act(current_state)
        # [] [] [P,1] [P]
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(
            current_state)

        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):

        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy,
                    next_state_pred, done=None):

        self.rollouts.insert(step, current_state, action, value, reward, masks,
                             action_log_probs, dist_entropy)
        # self.rollouts.insert_state_pred(next_state_pred)

        if 'traj_action_mask' in self.hparams and self.hparams[
                'traj_action_mask']:
            self.actor_critic.reset_mask(done)

    def update(self):
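        # One A2C update: bootstrap with the critic's value of the final
        # rollout state, compute returns, then take a single gradient step on
        # the policy, value, and entropy terms accumulated during the rollout.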

        next_value = self.actor_critic(
            Variable(self.rollouts.states[-1], volatile=True))[0].data

        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                      self.tau)

        values = torch.cat(self.rollouts.value_preds,
                           0).view(self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(
            self.num_steps, self.num_processes, 1)

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []
        self.rollouts.state_preds = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(advantages.detach() * action_log_probs).mean()
        cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean(
        ) * self.entropy_coef  #*10.

        self.optimizer.zero_grad()
        cost.backward()

        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)

        self.optimizer.step()
Example 7
    def __init__(self, envs, hparams):

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']

        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']

        # Policy and Value network

        # if hparams['dropout'] == True:
        #     print ('CNNPolicy_dropout2')
        #     self.actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
        # elif len(envs.observation_space.shape) == 3:
        #     print ('CNNPolicy2')
        #     self.actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space)
        # else:
        #     self.actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)

        if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
            self.actor_critic = CNNPolicy_trajectory_action_mask(
                self.obs_shape[0], envs.action_space)
        else:
            self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.obs_shape, envs.action_space)

        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()

        #Optimizer
        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(
                params=self.actor_critic.parameters(),
                lr=hparams['lr'],
                eps=hparams['eps'],
                alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=self.actor_critic.parameters(),
                                        lr=hparams['lr'],
                                        eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(params=self.actor_critic.parameters(),
                                       lr=hparams['lr'],
                                       momentum=hparams['mom'])
        else:
            print('no opt specified')

        # if envs.action_space.__class__.__name__ == "Discrete":
        #     action_shape = 1
        # else:
        #     action_shape = envs.action_space.shape[0]
        # self.action_shape = action_shape
        self.action_shape = 1
        # if __:
        #     self.deterministic_action = 0
        # else:
        #     self.deterministic_action = 0
        if hparams['gif_'] or hparams['ls_']:
            self.rollouts_list = RolloutStorage_list()

        self.hparams = hparams
Example 8
class a2c(object):
    def __init__(self, envs, hparams):

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']

        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']

        # Policy and Value network

        # if hparams['dropout'] == True:
        #     print ('CNNPolicy_dropout2')
        #     self.actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
        # elif len(envs.observation_space.shape) == 3:
        #     print ('CNNPolicy2')
        #     self.actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space)
        # else:
        #     self.actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)

        if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
            self.actor_critic = CNNPolicy_trajectory_action_mask(
                self.obs_shape[0], envs.action_space)
        else:
            self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)

        # #for batch norm
        # self.actor_critic.train() #self.actor_critic.eval()

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.obs_shape, envs.action_space)

        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()

        #Optimizer
        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(
                params=self.actor_critic.parameters(),
                lr=hparams['lr'],
                eps=hparams['eps'],
                alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=self.actor_critic.parameters(),
                                        lr=hparams['lr'],
                                        eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(params=self.actor_critic.parameters(),
                                       lr=hparams['lr'],
                                       momentum=hparams['mom'])
        else:
            print('no opt specified')

        # if envs.action_space.__class__.__name__ == "Discrete":
        #     action_shape = 1
        # else:
        #     action_shape = envs.action_space.shape[0]
        # self.action_shape = action_shape
        self.action_shape = 1
        # if __:
        #     self.deterministic_action = 0
        # else:
        #     self.deterministic_action = 0
        if hparams['gif_'] or hparams['ls_']:
            self.rollouts_list = RolloutStorage_list()

        self.hparams = hparams

    def act(self, current_state):

        # value, action = self.actor_critic.act(current_state)
        # [] [] [P,1] [P]

        # print ('aaa')
        # print (self.actor_critic.act(current_state))
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(
            current_state)
        # print ('lll')

        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):

        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy, done=None):

        self.rollouts.insert(step, current_state, action, value, reward, masks,
                             action_log_probs, dist_entropy)

        if 'traj_action_mask' in self.hparams and self.hparams[
                'traj_action_mask']:
            self.actor_critic.reset_mask(done)

    def update(self):

        next_value = self.actor_critic(
            Variable(self.rollouts.states[-1], volatile=True))[0].data

        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                      self.tau)

        # values, action_log_probs, dist_entropy = self.actor_critic.evaluate_actions(
        #                                             Variable(self.rollouts.states[:-1].view(-1, *self.obs_shape)),
        #                                             Variable(self.rollouts.actions.view(-1, self.action_shape)))

        values = torch.cat(self.rollouts.value_preds,
                           0).view(self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(
            self.num_steps, self.num_processes, 1)

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        self.optimizer.zero_grad()
        cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean(
        ) * self.entropy_coef
        cost.backward()

        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)

        self.optimizer.step()
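
For context, a driver script would typically alternate between collecting one num_steps-long rollout through act()/insert_data() and calling update(). The outline below is only a hedged sketch: envs (a vectorized environment returning numpy arrays), hparams, num_updates, and the exact tensor handling are assumptions; the agent methods, and the need to keep value, action_log_probs and dist_entropy as Variables (update() backpropagates through them), come from the class above.

import numpy as np
import torch
from torch.autograd import Variable

agent = a2c(envs, hparams)              # envs and hparams assumed to exist

current_state = torch.from_numpy(envs.reset()).float()
if hparams['cuda']:
    current_state = current_state.cuda()
agent.insert_first_state(current_state)

for j in range(num_updates):            # num_updates: assumed training budget
    for step in range(hparams['num_steps']):
        # Keep Variables (not .data) so update() can backprop through the
        # stored values, log-probs and entropies.
        value, action, action_log_probs, dist_entropy = agent.act(
            Variable(current_state))

        obs, reward, done, _ = envs.step(action.data.squeeze(1).cpu().numpy())

        reward = torch.from_numpy(np.expand_dims(reward, 1)).float()
        masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done])
        current_state = torch.from_numpy(obs).float()
        if hparams['cuda']:
            current_state = current_state.cuda()
            reward, masks = reward.cuda(), masks.cuda()

        agent.insert_data(step, current_state, action.data, value,
                          reward, masks, action_log_probs, dist_entropy)

    agent.update()                      # one optimizer step per rollout
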
Example 9
    def __init__(self, hparams):

        self.obs_shape = hparams['obs_shape']
        self.n_actions = hparams['n_actions']

        self.actor_critic = CNNPolicy(self.obs_shape[0], self.n_actions).cuda()
Example 10
    # load experiment dict
    print ('load experiment dict')
    dict_location = exp_dir + '/' +env_name+ 'NoFrameskip-v4/A2C/seed0/model_dict.json'
    with open(dict_location, 'r') as outfile:
        exp_dict = json.load(outfile)







    # Init policy, not agent
    print ('init policy')
    policy = CNNPolicy(2*3, 18)   #frames*channels, action size

    #load params
    # param_file = exp_dir + '/' +env_name+ 'NoFrameskip-v4/A2C/seed0/model_params3/model_params2000000.pt'    
    param_file = exp_dir + '/' +env_name+ 'NoFrameskip-v4/A2C/seed0/model_params3/model_params3999840.pt'    
    param_dict = torch.load(param_file)

    policy.load_state_dict(param_dict)
    # policy = torch.load(param_file).cuda()
    print ('loaded params', param_file)
    policy.cuda()




Example 11
class a2c(object):
    def __init__(self, hparams):

        self.obs_shape = hparams['obs_shape']
        self.n_actions = hparams['n_actions']

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']

        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']

        self.actor_critic = CNNPolicy(self.obs_shape[0],
                                      self.n_actions)  #.cuda()

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.obs_shape, self.n_actions)

        # if self.cuda:
        self.actor_critic.cuda()
        self.rollouts.cuda()

        self.optimizer = optim.Adam(params=self.actor_critic.parameters(),
                                    lr=hparams['lr'],
                                    eps=hparams['eps'])

        self.hparams = hparams

    def act(self, current_state):

        # value, action = self.actor_critic.act(current_state)
        # [] [] [P,1] [P]
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(
            current_state)

        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):

        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy):  #, done):

        self.rollouts.insert(step, current_state, action, value, reward, masks,
                             action_log_probs, dist_entropy)

        # if 'traj_action_mask' in self.hparams and self.hparams['traj_action_mask']:
        #     self.actor_critic.reset_mask(done)

    def update(self):

        next_value = self.actor_critic(
            Variable(self.rollouts.states[-1], volatile=True))[0].data

        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                      self.tau)

        # values, action_log_probs, dist_entropy = self.actor_critic.evaluate_actions(
        #                                             Variable(self.rollouts.states[:-1].view(-1, *self.obs_shape)),
        #                                             Variable(self.rollouts.actions.view(-1, self.action_shape)))

        values = torch.cat(self.rollouts.value_preds,
                           0).view(self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(
            self.num_steps, self.num_processes, 1)

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        self.optimizer.zero_grad()
        cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean(
        ) * self.entropy_coef
        cost.backward()

        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)

        self.optimizer.step()

    def no_update(self):
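        # Identical to update() except that no optimizer step is taken: the
        # loss is built, gradients are computed and clipped, then cleared.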

        next_value = self.actor_critic(
            Variable(self.rollouts.states[-1], volatile=True))[0].data

        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                      self.tau)

        # values, action_log_probs, dist_entropy = self.actor_critic.evaluate_actions(
        #                                             Variable(self.rollouts.states[:-1].view(-1, *self.obs_shape)),
        #                                             Variable(self.rollouts.actions.view(-1, self.action_shape)))

        values = torch.cat(self.rollouts.value_preds,
                           0).view(self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(
            self.num_steps, self.num_processes, 1)

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        self.optimizer.zero_grad()
        cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean(
        ) * self.entropy_coef
        cost.backward()

        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)

        # self.optimizer.step()

        self.optimizer.zero_grad()
Example 12
    def __init__(self, hparams):

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']

        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']

        self.action_size = hparams['action_size']




        # Policy and Value network

        # if hparams['dropout'] == True:
        #     print ('CNNPolicy_dropout2')
        #     self.actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
        # elif len(envs.observation_space.shape) == 3:
        #     print ('CNNPolicy2')
        #     self.actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space)
        # else:
        #     self.actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)

        # if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
        #     self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0], envs.action_space)
        # else:

        # self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)
        self.actor_critic = CNNPolicy(self.obs_shape[0], self.action_size)

        # #for batch norm
        # self.actor_critic.train() #self.actor_critic.eval()

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, self.action_size)


        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()


        #Optimizer
        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(params=self.actor_critic.parameters(), lr=hparams['lr'], momentum=hparams['mom'])
        else:
            print ('no opt specified')



        # if envs.action_space.__class__.__name__ == "Discrete":
        #     action_shape = 1
        # else:
        #     action_shape = envs.action_space.shape[0]
        # self.action_shape = action_shape

        # self.action_shape = 1


        # # if __:
        # #     self.deterministic_action = 0
        # # else:
        # #     self.deterministic_action = 0
        # if hparams['gif_'] or hparams['ls_']:
        #     self.rollouts_list = RolloutStorage_list()

        self.hparams = hparams
Example 13
class a2c(object):




    def __init__(self, hparams):

        self.obs_shape = hparams['obs_shape']
        self.n_actions = hparams['n_actions']

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']

        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']





        self.actor_critic = CNNPolicy(self.obs_shape[0], self.n_actions) #.cuda()

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, self.n_actions)

        # if self.cuda:
        self.actor_critic.cuda()
        self.rollouts.cuda()

        self.optimizer = optim.Adam(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])

        self.hparams = hparams













    def act(self, current_state):

        # value, action = self.actor_critic.act(current_state)
        # [] [] [P,1] [P]
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(current_state)

        return value, action, action_log_probs, dist_entropy


    def insert_first_state(self, current_state):

        self.rollouts.states[0].copy_(current_state)


    def insert_data(self, step, current_state, action, value, reward, masks, action_log_probs, dist_entropy):#, done):

        self.rollouts.insert(step, current_state, action, value, reward, masks, action_log_probs, dist_entropy)

        # if 'traj_action_mask' in self.hparams and self.hparams['traj_action_mask']:
        #     self.actor_critic.reset_mask(done)


    def update(self):
        

        next_value = self.actor_critic(Variable(self.rollouts.states[-1], volatile=True))[0].data

        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma, self.tau)

        # values, action_log_probs, dist_entropy = self.actor_critic.evaluate_actions(
        #                                             Variable(self.rollouts.states[:-1].view(-1, *self.obs_shape)), 
        #                                             Variable(self.rollouts.actions.view(-1, self.action_shape)))


        values = torch.cat(self.rollouts.value_preds, 0).view(self.num_steps, self.num_processes, 1) 
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(self.num_steps, self.num_processes, 1)


        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        self.optimizer.zero_grad()
        cost = action_loss + value_loss*self.value_loss_coef - dist_entropy.mean()*self.entropy_coef
        cost.backward()

        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)

        self.optimizer.step()








    def no_update(self):
        

        next_value = self.actor_critic(Variable(self.rollouts.states[-1], volatile=True))[0].data

        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma, self.tau)

        # values, action_log_probs, dist_entropy = self.actor_critic.evaluate_actions(
        #                                             Variable(self.rollouts.states[:-1].view(-1, *self.obs_shape)), 
        #                                             Variable(self.rollouts.actions.view(-1, self.action_shape)))


        values = torch.cat(self.rollouts.value_preds, 0).view(self.num_steps, self.num_processes, 1) 
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(self.num_steps, self.num_processes, 1)


        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        self.optimizer.zero_grad()
        cost = action_loss + value_loss*self.value_loss_coef - dist_entropy.mean()*self.entropy_coef
        cost.backward()

        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)

        # self.optimizer.step()



        self.optimizer.zero_grad()
Example 14
state_dataset = []
for i in range(len(dataset)):
    for t in range(len(dataset[i])):
        state_dataset.append(dataset[i][t][1])  #  /255.)

print(len(state_dataset))

print('\nInit Policies')
# agent = a2c(model_dict)
# param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'
# load_policy = 1
policies = []
policies_dir = home + '/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'
for f in os.listdir(policies_dir):
    print(f)
    policy = CNNPolicy(2, 4)  #.cuda()
    param_file = home + '/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/' + f + '/model_params3/model_params9999360.pt'
    param_dict = torch.load(param_file)

    policy.load_state_dict(param_dict)
    # policy = torch.load(param_file).cuda()
    print('loaded params', param_file)
    policy.cuda()

    policies.append(policy)

    #just one for now
    break

policy = policies[0]
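
With a policy loaded, the stored states can be pushed through it in batches, e.g. to inspect which actions it would pick on the collected data. A minimal sketch, assuming each entry of state_dataset is a [2, 84, 84] frame stack (as the dataset inspection in the next examples suggests); whether the frames still need the commented-out /255. scaling depends on how the dataset was saved.

import numpy as np
import torch
from torch.autograd import Variable

batch = np.stack(state_dataset[:32]).astype('float32')   # scaling intentionally left as-is
batch = Variable(torch.from_numpy(batch).cuda())

# act() returns value, sampled action, log-prob and entropy, as in the a2c class
value, action, action_log_probs, dist_entropy = policy.act(batch)
print(action.data.cpu().numpy().ravel())
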
Example 15
print(len(dataset))
print(len(dataset[ii][0]))  # single timepoint
print(dataset[ii][0][0].shape)  #action [1]           a_t+1
print(dataset[ii][0][1].shape)  #state [2,84,84]   s_t

state_dataset = []
for i in range(len(dataset)):
    for t in range(len(dataset[i])):
        state_dataset.append(dataset[i][t][1])

print(len(state_dataset))

print('Init Policy')

policy = CNNPolicy(2, 4)  #.cuda()
# agent = a2c(model_dict)
# param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'

load_policy = 1

if load_policy:
    param_file = home + '/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params3999840.pt'

    param_dict = torch.load(param_file)

    # print (param_dict.keys())
    # for key in param_dict.keys():
    #     print (param_dict[key].size())

    # print (policy.state_dict().keys())
Example 16
# dataset: trajectories: timesteps: (action,state) state: [2,84,84]

print(len(dataset))
print(len(dataset[ii][0]))  # single timepoint
print(dataset[ii][0][0].shape)  #action [1]           a_t+1
print(dataset[ii][0][1].shape)  #state [2,84,84]   s_t

state_dataset = []
for i in range(len(dataset)):
    for t in range(len(dataset[i])):
        state_dataset.append(dataset[i][t][1])  #  /255.)

print(len(state_dataset))

print('Init Expert Policy')
expert_policy = CNNPolicy(2, 4)  #.cuda()
# agent = a2c(model_dict)
# param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'
load_policy = 1

if load_policy:
    # param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params3999840.pt'
    param_file = home + '/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params9999360.pt'
    param_dict = torch.load(param_file)

    # print (param_dict.keys())
    # for key in param_dict.keys():
    #     print (param_dict[key].size())

    # print (policy.state_dict().keys())
    # for key in policy.state_dict().keys():
Example 17


print ('\nInit Policies')


# agent = a2c(model_dict)
# param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'

# load_policy = 1

policies = []
policies_dir = home+'/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'
for f in os.listdir(policies_dir):
    print (f)
    policy = CNNPolicy(2, 4) #.cuda()
    param_file = home+'/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'+f+'/model_params3/model_params9999360.pt'    
    param_dict = torch.load(param_file)

    policy.load_state_dict(param_dict)
    # policy = torch.load(param_file).cuda()
    print ('loaded params', param_file)
    policy.cuda()


    policies.append(policy)

    #just one for now
    break

Example 18
state_dataset = []
for i in range(len(dataset)):
    for t in range(len(dataset[i])):
        state_dataset.append(dataset[i][t][1])  #  /255.)

print(len(state_dataset))

print('\nInit Policies')
# agent = a2c(model_dict)
# param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'
# load_policy = 1
policies = []
policies_dir = home + '/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'
for f in os.listdir(policies_dir):
    print(f)
    policy = CNNPolicy(2, 4)  #.cuda()
    param_file = home + '/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/' + f + '/model_params3/model_params9999360.pt'
    param_dict = torch.load(param_file)

    policy.load_state_dict(param_dict)
    # policy = torch.load(param_file).cuda()
    print('loaded params', param_file)
    policy.cuda()

    policies.append(policy)

    #just one for now
    break

policy = policies[0]
Example 19
class a2c(object):




    def __init__(self, hparams):

        self.obs_shape = hparams['obs_shape']
        self.n_actions = hparams['n_actions']

        self.actor_critic = CNNPolicy(self.obs_shape[0], self.n_actions).cuda()



        # self.use_gae = hparams['use_gae']
        # self.gamma = hparams['gamma']
        # self.tau = hparams['tau']

        
        # self.num_steps = hparams['num_steps']
        # self.num_processes = hparams['num_processes']
        # self.value_loss_coef = hparams['value_loss_coef']
        # self.entropy_coef = hparams['entropy_coef']
        # self.cuda = hparams['cuda']
        # self.opt = hparams['opt']
        # self.grad_clip = hparams['grad_clip']



        # # Policy and Value network

        # # if hparams['dropout'] == True:
        # #     print ('CNNPolicy_dropout2')
        # #     self.actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
        # # elif len(envs.observation_space.shape) == 3:
        # #     print ('CNNPolicy2')
        # #     self.actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space)
        # # else:
        # #     self.actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)

        # if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
        #     self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0], hparams['n_actions'])
        # else:
        #     self.actor_critic = CNNPolicy(self.obs_shape[0], hparams['n_actions'])



        # # Storing rollouts
        # self.rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, hparams['n_actions'])


        # if self.cuda:
        #     self.actor_critic.cuda()
        #     self.rollouts.cuda()


        # #Optimizer
        # if self.opt == 'rms':
        #     self.optimizer = optim.RMSprop(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
        # elif self.opt == 'adam':
        #     self.optimizer = optim.Adam(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])
        # elif self.opt == 'sgd':
        #     self.optimizer = optim.SGD(params=self.actor_critic.parameters(), lr=hparams['lr'], momentum=hparams['mom'])
        # else:
        #     print ('no opt specified')



        # # if envs.action_space.__class__.__name__ == "Discrete":
        # #     action_shape = 1
        # # else:
        # #     action_shape = envs.action_space.shape[0]
        # # self.action_shape = action_shape
        # self.action_shape = 1
        # # if __:
        # #     self.deterministic_action = 0
        # # else:
        # #     self.deterministic_action = 0
        # if hparams['gif_'] or hparams['ls_']:
        #     self.rollouts_list = RolloutStorage_list()

        # self.hparams = hparams







    # def __init__(self, hparams):

    #     self.use_gae = hparams['use_gae']
    #     self.gamma = hparams['gamma']
    #     self.tau = hparams['tau']

    #     self.obs_shape = hparams['obs_shape']
    #     self.num_steps = hparams['num_steps']
    #     self.num_processes = hparams['num_processes']
    #     self.value_loss_coef = hparams['value_loss_coef']
    #     self.entropy_coef = hparams['entropy_coef']
    #     self.cuda = hparams['cuda']
    #     self.opt = hparams['opt']
    #     self.grad_clip = hparams['grad_clip']



    #     # Policy and Value network

    #     # if hparams['dropout'] == True:
    #     #     print ('CNNPolicy_dropout2')
    #     #     self.actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
    #     # elif len(envs.observation_space.shape) == 3:
    #     #     print ('CNNPolicy2')
    #     #     self.actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space)
    #     # else:
    #     #     self.actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)

    #     if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
    #         self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0], hparams['n_actions'])
    #     else:
    #         self.actor_critic = CNNPolicy(self.obs_shape[0], hparams['n_actions'])



    #     # Storing rollouts
    #     self.rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, hparams['n_actions'])


    #     if self.cuda:
    #         self.actor_critic.cuda()
    #         self.rollouts.cuda()


    #     #Optimizer
    #     if self.opt == 'rms':
    #         self.optimizer = optim.RMSprop(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
    #     elif self.opt == 'adam':
    #         self.optimizer = optim.Adam(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])
    #     elif self.opt == 'sgd':
    #         self.optimizer = optim.SGD(params=self.actor_critic.parameters(), lr=hparams['lr'], momentum=hparams['mom'])
    #     else:
    #         print ('no opt specified')



    #     # if envs.action_space.__class__.__name__ == "Discrete":
    #     #     action_shape = 1
    #     # else:
    #     #     action_shape = envs.action_space.shape[0]
    #     # self.action_shape = action_shape
    #     self.action_shape = 1
    #     # if __:
    #     #     self.deterministic_action = 0
    #     # else:
    #     #     self.deterministic_action = 0
    #     if hparams['gif_'] or hparams['ls_']:
    #         self.rollouts_list = RolloutStorage_list()

    #     self.hparams = hparams






    # def __init__(self, envs, hparams):

    #     self.use_gae = hparams['use_gae']
    #     self.gamma = hparams['gamma']
    #     self.tau = hparams['tau']

    #     self.obs_shape = hparams['obs_shape']
    #     self.num_steps = hparams['num_steps']
    #     self.num_processes = hparams['num_processes']
    #     self.value_loss_coef = hparams['value_loss_coef']
    #     self.entropy_coef = hparams['entropy_coef']
    #     self.cuda = hparams['cuda']
    #     self.opt = hparams['opt']
    #     self.grad_clip = hparams['grad_clip']



    #     # Policy and Value network

    #     # if hparams['dropout'] == True:
    #     #     print ('CNNPolicy_dropout2')
    #     #     self.actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
    #     # elif len(envs.observation_space.shape) == 3:
    #     #     print ('CNNPolicy2')
    #     #     self.actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space)
    #     # else:
    #     #     self.actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)

    #     if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
    #         self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0], envs.action_space)
    #     else:
    #         self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)



    #     # Storing rollouts
    #     self.rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space)


    #     if self.cuda:
    #         self.actor_critic.cuda()
    #         self.rollouts.cuda()


    #     #Optimizer
    #     if self.opt == 'rms':
    #         self.optimizer = optim.RMSprop(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
    #     elif self.opt == 'adam':
    #         self.optimizer = optim.Adam(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])
    #     elif self.opt == 'sgd':
    #         self.optimizer = optim.SGD(params=self.actor_critic.parameters(), lr=hparams['lr'], momentum=hparams['mom'])
    #     else:
    #         print ('no opt specified')



    #     # if envs.action_space.__class__.__name__ == "Discrete":
    #     #     action_shape = 1
    #     # else:
    #     #     action_shape = envs.action_space.shape[0]
    #     # self.action_shape = action_shape
    #     self.action_shape = 1
    #     # if __:
    #     #     self.deterministic_action = 0
    #     # else:
    #     #     self.deterministic_action = 0
    #     if hparams['gif_'] or hparams['ls_']:
    #         self.rollouts_list = RolloutStorage_list()

    #     self.hparams = hparams



    def act(self, current_state):

        # value, action = self.actor_critic.act(current_state)
        # [] [] [P,1] [P]
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(current_state)

        return value, action, action_log_probs, dist_entropy


    def insert_first_state(self, current_state):

        self.rollouts.states[0].copy_(current_state)


    def insert_data(self, step, current_state, action, value, reward, masks, action_log_probs, dist_entropy, done=None):

        self.rollouts.insert(step, current_state, action, value, reward, masks, action_log_probs, dist_entropy)

        if 'traj_action_mask' in self.hparams and self.hparams['traj_action_mask']:
            self.actor_critic.reset_mask(done)


    def update(self):
        

        next_value = self.actor_critic(Variable(self.rollouts.states[-1], volatile=True))[0].data

        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma, self.tau)

        # values, action_log_probs, dist_entropy = self.actor_critic.evaluate_actions(
        #                                             Variable(self.rollouts.states[:-1].view(-1, *self.obs_shape)), 
        #                                             Variable(self.rollouts.actions.view(-1, self.action_shape)))


        values = torch.cat(self.rollouts.value_preds, 0).view(self.num_steps, self.num_processes, 1) 
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(self.num_steps, self.num_processes, 1)


        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        self.optimizer.zero_grad()
        cost = action_loss + value_loss*self.value_loss_coef - dist_entropy.mean()*self.entropy_coef
        cost.backward()

        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)

        self.optimizer.step()
Example 20
class a2c(object):
    def __init__(self, envs, hparams):

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']

        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']

        self.next_state_pred_ = hparams['next_state_pred_']

        # Policy and Value network

        # if hparams['dropout'] == True:
        #     print ('CNNPolicy_dropout2')
        #     self.actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
        # elif len(envs.observation_space.shape) == 3:
        #     print ('CNNPolicy2')
        #     self.actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space)
        # else:
        #     self.actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)

        if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
            self.actor_critic = CNNPolicy_trajectory_action_mask(
                self.obs_shape[0], envs.action_space)
        else:
            self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.obs_shape, envs.action_space)

        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()

        #Optimizer
        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(
                params=self.actor_critic.parameters(),
                lr=hparams['lr'],
                eps=hparams['eps'],
                alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=self.actor_critic.parameters(),
                                        lr=hparams['lr'],
                                        eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(params=self.actor_critic.parameters(),
                                       lr=hparams['lr'],
                                       momentum=hparams['mom'])
        else:
            print('no opt specified')

        # if envs.action_space.__class__.__name__ == "Discrete":
        #     action_shape = 1
        # else:
        #     action_shape = envs.action_space.shape[0]
        # self.action_shape = action_shape
        self.action_shape = 1
        # if __:
        #     self.deterministic_action = 0
        # else:
        #     self.deterministic_action = 0
        if hparams['gif_'] or hparams['ls_'] or hparams['vae_']:
            self.rollouts_list = RolloutStorage_list()

        self.hparams = hparams

    def act(self, current_state):

        # value, action = self.actor_critic.act(current_state)
        # [] [] [P,1] [P]
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(
            current_state)

        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):

        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy,
                    next_state_pred, done=None):

        self.rollouts.insert(step, current_state, action, value, reward, masks,
                             action_log_probs, dist_entropy)

        # self.rollouts.insert_state_pred(next_state_pred)

        if 'traj_action_mask' in self.hparams and self.hparams[
                'traj_action_mask']:
            self.actor_critic.reset_mask(done)

    def update(self):

        next_value = self.actor_critic(
            Variable(self.rollouts.states[-1], volatile=True))[0].data

        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                      self.tau)

        # values, action_log_probs, dist_entropy = self.actor_critic.evaluate_actions(
        #                                             Variable(self.rollouts.states[:-1].view(-1, *self.obs_shape)),
        #                                             Variable(self.rollouts.actions.view(-1, self.action_shape)))

        values = torch.cat(self.rollouts.value_preds,
                           0).view(self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(
            self.num_steps, self.num_processes, 1)

        # print (len(self.rollouts.state_preds))

        if self.next_state_pred_:
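            # Auxiliary objective: compare the stored next-frame predictions
            # against the actual next frames; the squared error (scaled by
            # 1e-4) is folded into the cost below.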

            state_preds = torch.cat(self.rollouts.state_preds).view(
                self.num_steps, self.num_processes, 1, 84, 84)

            real_states = self.rollouts.states[1:]  #[Steps, P, stack, 84,84]
            real_states = real_states[:, :, -1].contiguous().view(
                self.num_steps, self.num_processes, 1, 84,
                84)  #[Steps, P, 1, 84,84]

            self.state_pred_error = (state_preds -
                                     Variable(real_states)).pow(2).mean()
            # self.state_pred_error = Variable(torch.zeros(1)).mean().cuda()

            state_pred_error_value = self.state_pred_error.detach()

        # print (real_states.size())
        # fafd
        # print (self.num_steps)
        # print (state_preds.size())

        # fada

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []
        self.rollouts.state_preds = []

        # print (state_preds)
        # print (real_states)

        # print (state_pred_error)

        advantages = Variable(self.rollouts.returns[:-1]) - values

        # advantages = values - Variable(self.rollouts.returns[:-1])

        value_loss = advantages.pow(2).mean()

        if self.next_state_pred_:
            action_loss = -(
                (Variable(advantages.data) + state_pred_error_value * .0001) *
                action_log_probs).mean()
            cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean(
            ) * self.entropy_coef + .0001 * self.state_pred_error
            # fasdfa

        else:

            # adv = torch.clamp(Variable(advantages.data), min= -10, max=10)
            # action_loss = - (adv* action_log_probs).mean() #could just do detach instead of data
            # action_loss = (Variable(self.rollouts.returns[:-1]).detach() * action_log_probs).mean()

            action_loss = -(advantages.detach() * action_log_probs).mean()
            cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean(
            ) * self.entropy_coef  #*10.

        self.optimizer.zero_grad()
        cost.backward()

        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)

        self.optimizer.step()
print(len(state_dataset))

# fdsfds

print('\nInit Policies')
# agent = a2c(model_dict)
# param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'
# load_policy = 1
policies = []
# policies_dir = home+'/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'
policies_dir = home + '/Documents/tmp/RoadRunner/RoadRunnerNoFrameskip-v4/A2C/'
for f in os.listdir(policies_dir):
    print(f)
    # policy = CNNPolicy(2, 4) #.cuda()
    policy = CNNPolicy(2, 18)  #.cuda()   # num-frames, num-actions

    # param_file = home+'/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'+f+'/model_params3/model_params9999360.pt'
    param_file = home + '/Documents/tmp/RoadRunner/RoadRunnerNoFrameskip-v4/A2C/' + f + '/model_params3/model_params9999360.pt'
    param_dict = torch.load(param_file)

    policy.load_state_dict(param_dict)
    # policy = torch.load(param_file).cuda()
    print('loaded params', param_file)
    policy.cuda()

    policies.append(policy)

    #just one for now
    break
Example 22
print (len(dataset[ii][0])) # single timepoint
print (dataset[ii][0][0].shape)  #action [1]           a_t+1
print (dataset[ii][0][1].shape)     #state [2,84,84]   s_t


state_dataset = []
for i in range(len(dataset)):
    for t in range(len(dataset[i])):
        state_dataset.append(dataset[i][t][1]) #  /255.)

print (len(state_dataset))



print ('Init Expert Policy')
expert_policy = CNNPolicy(2, 4) #.cuda()
# agent = a2c(model_dict)
# param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'
load_policy = 1

if load_policy:
    # param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params3999840.pt'
    param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params9999360.pt'    
    param_dict = torch.load(param_file)

    # print (param_dict.keys())
    # for key in param_dict.keys():
    #     print (param_dict[key].size())

    # print (policy.state_dict().keys())
    # for key in policy.state_dict().keys():
Example 23
        state_dataset.append(dataset[i][t][1])  #  /255.)

print(len(state_dataset))

print('\nInit Policies')

# agent = a2c(model_dict)
# param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'

# load_policy = 1

policies = []
policies_dir = home + '/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'
for f in os.listdir(policies_dir):
    print(f)
    policy = CNNPolicy(2, 4)  #.cuda()
    param_file = home + '/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/' + f + '/model_params3/model_params9999360.pt'
    param_dict = torch.load(param_file)

    policy.load_state_dict(param_dict)
    # policy = torch.load(param_file).cuda()
    print('loaded params', param_file)
    policy.cuda()

    policies.append(policy)

    #just one for now
    break

# if load_policy:
# param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params3999840.pt'
Ejemplo n.º 24
0

print ('\nInit Policies')
# agent = a2c(model_dict)
# param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'
# load_policy = 1
policies = []
policies_dir = home+'/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'
for f in os.listdir(policies_dir):
    print (f)
    policy = CNNPolicy(2, 4) #.cuda()
    param_file = home+'/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'+f+'/model_params3/model_params9999360.pt'    
    param_dict = torch.load(param_file)

    policy.load_state_dict(param_dict)
    # policy = torch.load(param_file).cuda()
    print ('loaded params', param_file)
    policy.cuda()

    policies.append(policy)

    #just one for now
    break

policy = policies[0]
Ejemplo n.º 25
0
class a2c(object):
    def __init__(self, envs, hparams):

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']

        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']

        # self.next_state_pred_ = hparams['next_state_pred_']

        # Policy and Value network
        # if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
        #     self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0], envs.action_space)
        # else:
        self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.obs_shape, envs.action_space)

        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()

        #Optimizer
        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(
                params=self.actor_critic.parameters(),
                lr=hparams['lr'],
                eps=hparams['eps'],
                alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=self.actor_critic.parameters(),
                                        lr=hparams['lr'],
                                        eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(params=self.actor_critic.parameters(),
                                       lr=hparams['lr'],
                                       momentum=hparams['mom'])
        else:
            print('no opt specified')

        self.action_shape = 1

        if hparams['gif_'] or hparams['ls_'] or hparams['vae_'] or hparams[
                'grad_var_']:
            self.rollouts_list = RolloutStorage_list()

        self.hparams = hparams

    def act(self, current_state):

        # value, action = self.actor_critic.act(current_state)
        # [] [] [P,1] [P]
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(
            current_state)

        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):

        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy,
                    next_state_pred, done=None):  # 'done' is only used by reset_mask below

        self.rollouts.insert(step, current_state, action, value, reward, masks,
                             action_log_probs, dist_entropy)
        # self.rollouts.insert_state_pred(next_state_pred)

        if 'traj_action_mask' in self.hparams and self.hparams[
                'traj_action_mask']:
            self.actor_critic.reset_mask(done)

    def update(self):

        next_value = self.actor_critic(
            Variable(self.rollouts.states[-1], volatile=True))[0].data

        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                      self.tau)

        values = torch.cat(self.rollouts.value_preds,
                           0).view(self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(
            self.num_steps, self.num_processes, 1)

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []
        self.rollouts.state_preds = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

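        # Policy-gradient term: advantages are detached so this term only updates
        # the policy head; the value head is trained by value_loss above.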
        action_loss = -(advantages.detach() * action_log_probs).mean()
        cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean(
        ) * self.entropy_coef  #*10.

        self.optimizer.zero_grad()
        cost.backward()

        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)

        self.optimizer.step()

    # def update2(self, discrim_error):
    #     # discrim_error: [S,P]

    #     # next_value = self.actor_critic(Variable(self.rollouts.states[-1], volatile=True))[0].data

    #     # next_value, _, _, _ = self.actor_critic.act(Variable(self.rollouts.states[-1], volatile=True), context_onehot)
    #     # next_value = next_value.data
    #     # self.rollouts.compute_returns(next_value, self.use_gae, self.gamma, self.tau)

    #     # print (torch.mean(discrim_error, dim=0))

    #     # print (discrim_error)

    #     discrim_error_unmodified = discrim_error.data.clone()
    #     discrim_error = discrim_error.data
    #     # self.returns[-1] = next_value
    #     divide_by = torch.ones(self.num_processes).cuda()
    #     for step in reversed(range(discrim_error.size(0)-1)):
    #         divide_by += 1
    #         ttmp = discrim_error_unmodified[step + 1] * self.gamma * torch.squeeze(self.rollouts.masks[step+1])
    #         discrim_error_unmodified[step] = ttmp + discrim_error_unmodified[step]
    #         discrim_error[step] = discrim_error_unmodified[step] / divide_by
    #         divide_by = divide_by * torch.squeeze(self.rollouts.masks[step+1])
    #     discrim_error = Variable(discrim_error.view(self.num_steps,self.num_processes,1))

    #     # discrim_error = discrim_error.view(self.num_processes*self.num_steps, 1).detach()

    #     # values = torch.cat(self.rollouts.value_preds, 0).view(self.num_steps, self.num_processes, 1) #[S,P,1]
    #     action_log_probs = torch.cat(self.rollouts.action_log_probs).view(self.num_steps, self.num_processes, 1)#[S,P,1]
    #     # dist_entropy = torch.cat(self.rollouts.dist_entropy).view(self.num_steps, self.num_processes, 1)

    #     self.rollouts.value_preds = []
    #     self.rollouts.action_log_probs = []
    #     self.rollouts.dist_entropy = []
    #     self.rollouts.state_preds = []

    #     # advantages = Variable(self.rollouts.returns[:-1]) - values
    #     # print (values)
    #     # print (discrim_error_reverse.size())  #[S,P]

    #     # discrim_error_reverse = discrim_error_reverse.view(self.num_steps, self.num_processes, 1)
    #     # val_to_maximize = (-discrim_error  + discrim_error_reverse.detach())/2. - action_log_probs.detach()

    #     val_to_maximize = -discrim_error - action_log_probs.detach()

    #     baseline = torch.mean(val_to_maximize)

    #     advantages = val_to_maximize - baseline  #- values #(-.7)#values
    #     # value_loss = advantages.pow(2).mean()

    #     # action_loss = -(advantages.detach() * action_log_probs).mean()

    #     action_loss = -(advantages.detach() * action_log_probs).mean()

    #     # print (grad_sum)
    #     # cost = action_loss - dist_entropy.mean()*self.entropy_coef # + value_loss*self.value_loss_coef # - grad_sum*100000.
    #     cost = action_loss #- dist_entropy.mean()*self.entropy_coef # + value_loss*self.value_loss_coef # - grad_sum*100000.
    #     # cost = value_loss*self.value_loss_coef - dist_entropy.mean()*self.entropy_coef - grad_sum*500.
    #     # cost =- grad_sum

    #     self.optimizer.zero_grad()
    #     cost.backward()

    #     nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)

    #     self.optimizer.step()

    # #with reverse
    # def update2(self, discrim_error, discrim_error_reverse):
    #     # discrim_error: [S,P]

    #     # next_value = self.actor_critic(Variable(self.rollouts.states[-1], volatile=True))[0].data

    #     # next_value, _, _, _ = self.actor_critic.act(Variable(self.rollouts.states[-1], volatile=True), context_onehot)
    #     # next_value = next_value.data
    #     # self.rollouts.compute_returns(next_value, self.use_gae, self.gamma, self.tau)

    #     # print (torch.mean(discrim_error, dim=0))

    #     # print (discrim_error)

    #     discrim_error_unmodified = discrim_error.data.clone()
    #     discrim_error = discrim_error.data
    #     # self.returns[-1] = next_value
    #     divide_by = torch.ones(self.num_processes).cuda()
    #     for step in reversed(range(discrim_error.size(0)-1)):
    #         divide_by += 1
    #         ttmp = discrim_error_unmodified[step + 1] * self.gamma * torch.squeeze(self.rollouts.masks[step+1])
    #         discrim_error_unmodified[step] = ttmp + discrim_error_unmodified[step]
    #         discrim_error[step] = discrim_error_unmodified[step] / divide_by
    #         divide_by = divide_by * torch.squeeze(self.rollouts.masks[step+1])
    #     discrim_error = Variable(discrim_error.view(self.num_steps,self.num_processes,1))

    #     # discrim_error = discrim_error.view(self.num_processes*self.num_steps, 1).detach()

    #     # values = torch.cat(self.rollouts.value_preds, 0).view(self.num_steps, self.num_processes, 1) #[S,P,1]
    #     action_log_probs = torch.cat(self.rollouts.action_log_probs).view(self.num_steps, self.num_processes, 1)#[S,P,1]
    #     # dist_entropy = torch.cat(self.rollouts.dist_entropy).view(self.num_steps, self.num_processes, 1)

    #     self.rollouts.value_preds = []
    #     self.rollouts.action_log_probs = []
    #     self.rollouts.dist_entropy = []
    #     self.rollouts.state_preds = []

    #     # advantages = Variable(self.rollouts.returns[:-1]) - values
    #     # print (values)
    #     # print (discrim_error_reverse.size())  #[S,P]

    #     discrim_error_reverse = discrim_error_reverse.view(self.num_steps, self.num_processes, 1)

    #     val_to_maximize = (-discrim_error  + discrim_error_reverse.detach())/2. - action_log_probs.detach()

    #     baseline = torch.mean(val_to_maximize)

    #     advantages = val_to_maximize - baseline  #- values #(-.7)#values
    #     # value_loss = advantages.pow(2).mean()

    #     # action_loss = -(advantages.detach() * action_log_probs).mean()

    #     action_loss = -(advantages.detach() * action_log_probs).mean()

    #     # print (grad_sum)
    #     # cost = action_loss - dist_entropy.mean()*self.entropy_coef # + value_loss*self.value_loss_coef # - grad_sum*100000.
    #     cost = action_loss #- dist_entropy.mean()*self.entropy_coef # + value_loss*self.value_loss_coef # - grad_sum*100000.
    #     # cost = value_loss*self.value_loss_coef - dist_entropy.mean()*self.entropy_coef - grad_sum*500.
    #     # cost =- grad_sum

    #     self.optimizer.zero_grad()
    #     cost.backward()

    #     nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)

    #     self.optimizer.step()

    #avg empowerment rather than avg error
    def update2(self, discrim_error, discrim_error_reverse):
        # discrim_error: [S,P]

        # next_value = self.actor_critic(Variable(self.rollouts.states[-1], volatile=True))[0].data

        # next_value, _, _, _ = self.actor_critic.act(Variable(self.rollouts.states[-1], volatile=True), context_onehot)
        # next_value = next_value.data
        # self.rollouts.compute_returns(next_value, self.use_gae, self.gamma, self.tau)

        # print (torch.mean(discrim_error, dim=0))

        # print (discrim_error)
        discrim_error_reverse = discrim_error_reverse.view(
            self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)  #[S,P,1]
        discrim_error = discrim_error.view(self.num_steps, self.num_processes,
                                           1)

        # val_to_maximize = (-discrim_error  + discrim_error_reverse)/2. - action_log_probs.detach() #[S,P,1]

        val_to_maximize = -discrim_error - action_log_probs.detach()  #[S,P,1]

        val_to_maximize = val_to_maximize.view(self.num_steps,
                                               self.num_processes)  #[S,P]

        discrim_error_unmodified = val_to_maximize.data.clone()
        discrim_error = val_to_maximize.data

        # self.returns[-1] = next_value
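        # Backward pass over time: accumulate the discounted sum of future per-step
        # values (masked at episode boundaries) and divide by the number of valid
        # future steps, giving a running discounted average rather than a plain
        # return; divide_by counts those steps per process and resets with the mask.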
        divide_by = torch.ones(self.num_processes).cuda()
        for step in reversed(range(discrim_error.size(0) - 1)):
            divide_by += 1
            ttmp = discrim_error_unmodified[step +
                                            1] * self.gamma * torch.squeeze(
                                                self.rollouts.masks[step + 1])
            discrim_error_unmodified[
                step] = ttmp + discrim_error_unmodified[step]
            discrim_error[step] = discrim_error_unmodified[step] / divide_by
            divide_by = divide_by * torch.squeeze(
                self.rollouts.masks[step + 1])
        val_to_maximize = Variable(
            discrim_error.view(self.num_steps, self.num_processes, 1))

        # discrim_error = discrim_error.view(self.num_processes*self.num_steps, 1).detach()

        # values = torch.cat(self.rollouts.value_preds, 0).view(self.num_steps, self.num_processes, 1) #[S,P,1]
        # dist_entropy = torch.cat(self.rollouts.dist_entropy).view(self.num_steps, self.num_processes, 1)

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []
        self.rollouts.state_preds = []

        # advantages = Variable(self.rollouts.returns[:-1]) - values
        # print (values)
        # print (discrim_error_reverse.size())  #[S,P]

        # val_to_maximize = (-discrim_error  + discrim_error_reverse.detach())/2. - action_log_probs.detach()
        # val_to_maximize = discrim_error

        baseline = torch.mean(val_to_maximize)

        advantages = val_to_maximize - baseline  #- values #(-.7)#values
        # value_loss = advantages.pow(2).mean()

        # action_loss = -(advantages.detach() * action_log_probs).mean()

        action_loss = -(advantages.detach() * action_log_probs).mean()

        # print (grad_sum)
        # cost = action_loss - dist_entropy.mean()*self.entropy_coef # + value_loss*self.value_loss_coef # - grad_sum*100000.
        cost = action_loss  #- dist_entropy.mean()*self.entropy_coef # + value_loss*self.value_loss_coef # - grad_sum*100000.
        # cost = value_loss*self.value_loss_coef - dist_entropy.mean()*self.entropy_coef - grad_sum*500.
        # cost =- grad_sum

        self.optimizer.zero_grad()
        cost.backward()

        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)

        self.optimizer.step()
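
For intuition, a toy CPU-only check of the backward averaging used in update2 above (added for illustration; gamma = 0.9, a single process, no episode resets):

import torch

gamma = 0.9
vals = torch.FloatTensor([1., 2., 3.])  # per-step values, S = 3, P = 1
acc = vals.clone()                      # running discounted sums
avg = vals.clone()                      # discounted averages, as in update2
divide_by = 1.
for step in reversed(range(vals.size(0) - 1)):
    divide_by += 1
    acc[step] = acc[step] + gamma * acc[step + 1]
    avg[step] = acc[step] / divide_by
# avg[1] = (2 + 0.9*3) / 2 = 2.35 ; avg[0] = (1 + 0.9*(2 + 0.9*3)) / 3 ≈ 1.74
print(avg)  # the last step is left unchanged, matching the loop in update2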
Ejemplo n.º 26
0
#     param_file = home+'/Documents/tmp/RoadRunner/RoadRunnerNoFrameskip-v4/A2C/'+f+'/model_params3/model_params9999360.pt'
#     param_dict = torch.load(param_file)

#     policy.load_state_dict(param_dict)
#     # policy = torch.load(param_file).cuda()
#     print ('loaded params', param_file)
#     policy.cuda()

#     policies.append(policy)

#     #just one for now
#     break

# policy = policies[0]

policy = CNNPolicy(2 * 3, 18)  #.cuda()   #num-frames* channels, num-actions

# param_file = home+'/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'+f+'/model_params3/model_params9999360.pt'
# param_file = home+'/Documents/tmp/RoadRunner/RoadRunnerNoFrameskip-v4/A2C/'+f+'/model_params3/model_params9999360.pt'
param_file = home + '/Documents/tmp/' + exp_name + '/' + env_name + 'NoFrameskip-v4/A2C/seed0/model_params3/model_params3999840.pt'

param_dict = torch.load(param_file)
policy.load_state_dict(param_dict)
# policy = torch.load(param_file).cuda()
print('loaded params', param_file)
policy.cuda()


class MASK_PREDICTOR(nn.Module):
    def __init__(self):
        super(MASK_PREDICTOR, self).__init__()