Ejemplo n.º 1
0
    def initialise_policy(self):
        """Build the policy network and wrap it in the configured trainer.

        Returns an A2C or PPO trainer (per ``args.policy``) around a freshly
        constructed ``Policy`` network moved to the global ``device``.
        Raises NotImplementedError for any other ``args.policy`` value.
        """
        # Network configuration, grouped by concern.
        actor = Policy(
            args=self.args,
            # which inputs the policy conditions on
            pass_state_to_policy=self.args.pass_state_to_policy,
            pass_latent_to_policy=self.args.pass_latent_to_policy,
            pass_belief_to_policy=self.args.pass_belief_to_policy,
            pass_task_to_policy=self.args.pass_task_to_policy,
            # input dimensionalities (latent doubled: mean + variance halves)
            dim_state=self.args.state_dim,
            dim_latent=self.args.latent_dim * 2,
            dim_belief=self.args.belief_dim,
            dim_task=self.args.task_dim,
            # architecture
            hidden_layers=self.args.policy_layers,
            activation_function=self.args.policy_activation_function,
            policy_initialisation=self.args.policy_initialisation,
            # output head
            action_space=self.envs.action_space,
            init_std=self.args.policy_init_std,
        ).to(device)

        # Keyword arguments shared by both trainer variants.
        shared_kwargs = dict(
            policy_optimiser=self.args.policy_optimiser,
            policy_anneal_lr=self.args.policy_anneal_lr,
            train_steps=self.num_updates,
            optimiser_vae=self.vae.optimiser_vae,
            lr=self.args.lr_policy,
            eps=self.args.policy_eps,
        )

        algo = self.args.policy
        if algo == 'a2c':
            trainer = A2C(
                self.args,
                actor,
                self.args.policy_value_loss_coef,
                self.args.policy_entropy_coef,
                **shared_kwargs,
            )
        elif algo == 'ppo':
            trainer = PPO(
                self.args,
                actor,
                self.args.policy_value_loss_coef,
                self.args.policy_entropy_coef,
                ppo_epoch=self.args.ppo_num_epochs,
                num_mini_batch=self.args.ppo_num_minibatch,
                use_huber_loss=self.args.ppo_use_huberloss,
                use_clipped_value_loss=self.args.ppo_use_clipped_value_loss,
                clip_param=self.args.ppo_clip_param,
                **shared_kwargs,
            )
        else:
            raise NotImplementedError

        return trainer
Ejemplo n.º 2
0
    def validate(self, data: Dict):
        """Validate role data, ensuring every referenced policy id exists.

        Raises ValidationError when any id in ``data['policies']`` does not
        resolve to a stored Policy.
        """
        super(Role, self).validate(data)

        # Treat a missing/None 'policies' entry as an empty list so the check
        # below stays well-defined instead of crashing with
        # ``TypeError: object of type 'NoneType' has no len()``.
        policies = data.get('policies') or []

        policy_objs = Policy.get(filters={'id': policies})

        # NOTE(review): duplicate ids in `policies` would also trigger this
        # mismatch — confirm duplicates are rejected upstream.
        if len(policies) != len(policy_objs):
            raise ValidationError(message=f'Failed to add policies to role.\n'
                                  f'Few policy id seems to be invalid.')
Ejemplo n.º 3
0
def load_policy_model(args, environment, device, folder=None):
    """Restore the best policy checkpoint and return it in eval mode.

    Loads ``best_model.ckpt`` from *folder* (default
    ``./checkpoint/policy``), builds a ``Policy`` matching *args* and
    *environment*, and moves it to *device*.
    """
    checkpoint_dir = './checkpoint/policy' if folder is None else folder

    model = Policy(environment['action'],
                   net=args.encoder,
                   pretrained=args.pretrained,
                   input=environment['input_size'])
    state_dict = torch.load(f'{checkpoint_dir}/best_model.ckpt')
    model.load_state_dict(state_dict)

    # Module.to / Module.eval both return the module itself.
    return model.to(device).eval()
# Script fragment: environment / actor-critic setup (`args`, ZFilter, Policy,
# DiscretePolicy, Value, Agent, device come from earlier in the original file).
env = gym.make(args.env_name)
state_dim = env.observation_space.shape[0]
# Discrete action spaces report an empty shape tuple.
is_disc_action = len(env.action_space.shape) == 0
# Running observation normaliser, clipped at +/-5 standard deviations.
running_state = ZFilter((state_dim, ), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)
"""define actor and critic"""
if args.model_path is None:
    # Fresh networks when no checkpoint is supplied.
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        policy_net = Policy(state_dim,
                            env.action_space.shape[0],
                            log_std=args.log_std)
    value_net = Value(state_dim)
else:
    # Resume from a pickled (policy, value, normaliser) triple.
    # NOTE(review): pickle.load on an arbitrary path is unsafe on untrusted
    # input, and the file handle is never closed — confirm acceptable here.
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=4e-4)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=8e-4)
"""create agent"""
# NOTE: this Agent(...) call is truncated in this excerpt; its remaining
# arguments follow in the original file.
agent = Agent(env,
              policy_net,
              device,
              running_state=running_state,
Ejemplo n.º 5
0
def main():
    """Seed the rights-control database from a JSON input file.

    Parses the JSON file named on the command line (preserving key order) and
    inserts translations, rights, policies (plus their policy→right links),
    and roles (plus their role→policy links), committing after each section.
    """
    args = parse_args()

    # Initialize
    dbmng = DBManager(args.db)

    # Only one session to access to database
    _session = dbmng.get_session()

    # Decode with an OrderedDict hook so insertion order follows the file.
    decoder = json.JSONDecoder(object_pairs_hook=collections.OrderedDict)
    # Use a context manager so the input file is closed deterministically
    # (the original leaked the handle).
    with open(args.input_file, 'r') as input_f:
        input_j = decoder.decode(input_f.read())

    # One shared timestamp for every record created by this run.
    nowtime = datetime.datetime.now()

    for key in input_j.keys():
        if key == 'translations':
            for dic in input_j[key]:
                rec = Translation(id=dic['id'],
                                  locale=dic['locale'],
                                  t=dic['t'],
                                  created_at=nowtime,
                                  created_by='rightctl_initializer')
                _session.add(rec)
            # Commit once per section. (The original used no-break for/else
            # clauses, whose else branch always ran — a plain post-loop commit
            # is equivalent and far less misleading.)
            _session.commit()

        elif key == 'rights':
            for dic in input_j[key]:
                rec = Right(action=dic['action'],
                            enable_flag=True,
                            created_at=nowtime,
                            created_by='rightctl_initializer')
                _session.add(rec)
            _session.commit()

        elif key == 'policies':
            for dic in input_j[key]:
                rec = Policy(id=dic['id'],
                             policy_tid=dic['policy_tid'],
                             created_at=nowtime,
                             created_by='rightctl_initializer')
                _session.add(rec)
                # Commit the policy first so rec.id is usable in link rows.
                _session.commit()
                for right_s in dic['rights']:
                    right = _session.query(Right).filter(
                        Right.action == right_s).one()
                    child = PolicyHasRight(policy_id=rec.id, right_id=right.id)
                    _session.add(child)
                _session.commit()
            _session.commit()

        elif key == 'roles':
            for dic in input_j[key]:
                rec = Role(id=dic['id'],
                           role_tid=dic['role_tid'],
                           created_at=nowtime,
                           created_by='rightctl_initializer')
                _session.add(rec)
                # Commit the role first so rec.id is usable in link rows.
                _session.commit()
                for policy_s in dic['policies']:
                    policy = _session.query(Policy).filter(
                        Policy.id == policy_s).one()
                    child = RoleHasPolicy(role_id=rec.id, policy_id=policy.id)
                    _session.add(child)
                _session.commit()
            _session.commit()

        # Unknown top-level keys are silently ignored (the original ended with
        # a dead `else: None`).
Ejemplo n.º 6
0
# NOTE(review): this excerpt opens with the closing parenthesis of a call that
# begins above the visible chunk.
)
get_policy_dataset = environment['policy_dataset']
# Build train/validation splits for the policy from expert demonstrations.
policy_train, policy_validation = get_policy_dataset(
    args.expert_path,
    args.policy_batch_size,
    maze_size=args.maze_size,
    maze_type=args.maze_type,
)

# Model and action size
print('\nCreating Models')
action_dimension = environment['action']
# Doubled input size — presumably the IDM consumes a pair of stacked
# observations (TODO confirm); None passes through untouched.
inputs = environment['input_size'] * 2 if environment[
    'input_size'] is not None else None
policy_model = Policy(action_dimension,
                      net=args.encoder,
                      pretrained=args.pretrained,
                      input=environment['input_size'])
idm_model = IDM(action_dimension,
                net=args.encoder,
                pretrained=args.pretrained,
                input=inputs)

policy_model.to(device)
idm_model.to(device)

# Optimizer and loss
print('\nCreating Optimizer and Loss')
print(f'IDM learning rate: {args.lr}\nPolicy learning rate: {args.policy_lr}')
idm_lr = args.lr
idm_criterion = nn.CrossEntropyLoss()
idm_optimizer = optim.Adam(idm_model.parameters(), lr=idm_lr)
Ejemplo n.º 7
0
    def __init__(self,
                 env_def,
                 processes=1,
                 dir='.',
                 version=0,
                 lr=2e-4,
                 architecture='base',
                 dropout=0,
                 reconstruct=None,
                 r_weight=.05):
        """Initialise environments, the actor-critic, the trainer
        (A2C / PPO / ACKTR), optional reconstruction loss, GAIL hooks, and
        rollout storage.

        Reads several attributes (num_mini_batch, seed, no_cuda, algo, eps,
        alpha, max_grad_norm, num_steps, gail_batch_size, ...) that are
        presumably class-level defaults on the enclosing class — not visible
        in this excerpt.

        Args:
            env_def: environment definition (must expose .model_shape).
            processes: number of CPU worker processes.
            dir: base directory; checkpoints live under dir + '/trained_models/'.
                NOTE: shadows the `dir` builtin.
            version: if > 0, resume from a saved checkpoint.
            lr: learning rate for the policy optimiser.
            architecture: policy base architecture name.
            dropout: dropout rate forwarded to the policy base.
            reconstruct: optional pretrained reconstruction model; enables an
                auxiliary reconstruction loss when not None.
            r_weight: weight of the reconstruction loss term.
        """
        self.env_def = env_def
        self.num_processes = processes  #cpu processes
        self.lr = lr
        self.version = version
        self.save_dir = dir + '/trained_models/'

        #Setup
        pathlib.Path(self.save_dir).mkdir(parents=True, exist_ok=True)
        # Cannot have more minibatches than worker processes.
        if (self.num_mini_batch > processes):
            self.num_mini_batch = processes

        self.writer = SummaryWriter()
        self.total_steps = 0

        #State: seed all RNGs for reproducibility.
        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

        if not self.no_cuda and torch.cuda.is_available(
        ) and self.cuda_deterministic:
            # Trade kernel-selection speed for deterministic cuDNN behaviour.
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True

        utils.cleanup_log_dir(self.log_dir)
        utils.cleanup_log_dir(self.eval_log_dir)

        torch.set_num_threads(1)

        self.level_path = None
        self.envs = None
        self.num_envs = -1
        self.set_envs(num_envs=1)

        if (version > 0):
            # NOTE(review): `path` is not defined in this scope — this branch
            # raises NameError; presumably self.save_dir was intended.
            self.actor_critic = self.load(path, version)
        else:
            self.actor_critic = Policy(
                self.envs.observation_space.shape,
                self.envs.action_space,
                base_kwargs={
                    'recurrent': self.recurrent_policy,
                    'shapes': list(reversed(self.env_def.model_shape)),
                    'dropout': dropout
                },
                model=architecture)
        self.actor_critic.to(self.device)

        #Reconstruction: optional auxiliary decoder loss on policy features.
        self.reconstruct = reconstruct is not None
        if (self.reconstruct):
            #layers = self.envs.observation_space.shape[0]
            #shapes = list(self.env_def.model_shape)
            #self.r_model = Decoder(layers, shapes=shapes).to(self.device)
            reconstruct.to(self.device)
            self.r_model = lambda x: reconstruct.adapter(reconstruct(x))
            #self.r_model = lambda x: reconstruct.adapter(reconstruct(x)).clamp(min=1e-6).log()
            #self.r_loss = nn.L1Loss() #nn.NLLLoss() #nn.MSELoss()
            # Weighted cross-entropy; predictions clamped away from 0/1 to
            # keep the log finite.
            self.r_loss = lambda pred, true: -r_weight * (true * torch.log(
                pred.clamp(min=1e-7, max=1 - 1e-7))).sum(dim=1).mean()
            self.r_optimizer = reconstruct.optimizer  #optim.Adam(reconstruct.parameters(), lr = .0001)

        if self.algo == 'a2c':
            self.agent = A2C_ACKTR(self.actor_critic,
                                   self.value_loss_coef,
                                   self.entropy_coef,
                                   lr=self.lr,
                                   eps=self.eps,
                                   alpha=self.alpha,
                                   max_grad_norm=self.max_grad_norm)
        elif self.algo == 'ppo':
            self.agent = PPO(self.actor_critic,
                             self.clip_param,
                             self.ppo_epoch,
                             self.num_mini_batch,
                             self.value_loss_coef,
                             self.entropy_coef,
                             lr=self.lr,
                             eps=self.eps,
                             max_grad_norm=self.max_grad_norm,
                             use_clipped_value_loss=False)
        elif self.algo == 'acktr':
            self.agent = algo.A2C_ACKTR(self.actor_critic,
                                        self.value_loss_coef,
                                        self.entropy_coef,
                                        acktr=True)

        self.gail = False
        self.gail_experts_dir = './gail_experts'
        if self.gail:
            # Dead code while self.gail is hard-coded False just above.
            assert len(self.envs.observation_space.shape) == 1
            self.gail_discr = gail.Discriminator(
                self.envs.observation_space.shape[0] +
                self.envs.action_space.shape[0], 100, self.device)
            # NOTE(review): `env_name` is undefined here — would raise
            # NameError if this branch were ever enabled.
            file_name = os.path.join(
                self.gail_experts_dir,
                "trajs_{}.pt".format(env_name.split('-')[0].lower()))

            self.gail_train_loader = torch.utils.data.DataLoader(
                gail.ExpertDataset(file_name,
                                   num_trajectories=4,
                                   subsample_frequency=20),
                batch_size=self.gail_batch_size,
                shuffle=True,
                drop_last=True)

        self.rollouts = RolloutStorage(
            self.num_steps, self.num_processes,
            self.envs.observation_space.shape, self.envs.action_space,
            self.actor_critic.recurrent_hidden_state_size)
Ejemplo n.º 8
0
class Agent:
    """Actor-critic RL trainer (A2C / PPO / ACKTR) with optional feature
    reconstruction and GAIL hooks.

    Hyper-parameters are class attributes so they are shared across instances
    and tweakable in one place.
    """
    #algorithm
    algo = 'a2c'  #a2c, ppo, acktr
    use_gae = False  #generalized advantage estimation
    gae_lambda = 0.95
    entropy_coef = 0.01  #weight maximizing action entropy loss
    value_loss_coef = 0.1  #.5 #weight value function loss
    max_grad_norm = 0.5  #max norm of gradients

    #ppo hyperparameters
    clip_param = 0.2  #ppo clip
    num_steps = 5  #steps before an update
    ppo_epoch = 4
    num_mini_batch = 32

    seed = 1
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    cuda_deterministic = False
    no_cuda = False
    use_proper_time_limits = False
    use_linear_lr_decay = False

    #experiment setup
    log_interval = 1  #log per n updates
    log_dir = os.path.expanduser('/tmp/gym')
    eval_log_dir = log_dir + "_eval"
    save_interval = 100
    eval_interval = None
    recurrent_policy = True

    #optimization, RMSprop and TD
    eps = 1e-5  #epsilon
    alpha = 0.99
    gamma = 0.99  #discount factor

    #imitation learning with gail
    gail_batch_size = 128
    gail_epoch = 5

    def __init__(self,
                 env_def,
                 processes=1,
                 dir='.',
                 version=0,
                 lr=2e-4,
                 architecture='base',
                 dropout=0,
                 reconstruct=None,
                 r_weight=.05):
        """Initialise environments, the actor-critic, the trainer, optional
        reconstruction loss, GAIL hooks, and rollout storage.

        Args:
            env_def: environment definition (must expose .model_shape, .name).
            processes: number of CPU worker processes.
            dir: base directory; checkpoints live under dir + '/trained_models/'.
            version: if > 0, resume from a saved checkpoint in save_dir.
            lr: policy optimiser learning rate.
            architecture: policy base architecture name.
            dropout: dropout rate forwarded to the policy base.
            reconstruct: optional pretrained reconstruction model; enables an
                auxiliary reconstruction loss when not None.
            r_weight: weight of the reconstruction loss term.
        """
        self.env_def = env_def
        self.num_processes = processes  #cpu processes
        self.lr = lr
        self.version = version
        self.save_dir = dir + '/trained_models/'

        #Setup
        pathlib.Path(self.save_dir).mkdir(parents=True, exist_ok=True)
        # Cannot have more minibatches than worker processes.
        if (self.num_mini_batch > processes):
            self.num_mini_batch = processes

        self.writer = SummaryWriter()
        self.total_steps = 0

        #State: seed all RNGs for reproducibility.
        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

        if not self.no_cuda and torch.cuda.is_available(
        ) and self.cuda_deterministic:
            # Trade kernel-selection speed for deterministic cuDNN behaviour.
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True

        utils.cleanup_log_dir(self.log_dir)
        utils.cleanup_log_dir(self.eval_log_dir)

        torch.set_num_threads(1)

        self.level_path = None
        self.envs = None
        self.num_envs = -1
        self.set_envs(num_envs=1)

        if (version > 0):
            # Fix: the original referenced an undefined `path` (NameError);
            # checkpoints are written under save_dir (see save()).
            self.actor_critic = self.load(self.save_dir, version)
        else:
            self.actor_critic = Policy(
                self.envs.observation_space.shape,
                self.envs.action_space,
                base_kwargs={
                    'recurrent': self.recurrent_policy,
                    'shapes': list(reversed(self.env_def.model_shape)),
                    'dropout': dropout
                },
                model=architecture)
        self.actor_critic.to(self.device)

        #Reconstruction: optional auxiliary decoder loss on policy features.
        self.reconstruct = reconstruct is not None
        if (self.reconstruct):
            reconstruct.to(self.device)
            self.r_model = lambda x: reconstruct.adapter(reconstruct(x))
            # Weighted cross-entropy; predictions clamped away from 0/1 to
            # keep the log finite.
            self.r_loss = lambda pred, true: -r_weight * (true * torch.log(
                pred.clamp(min=1e-7, max=1 - 1e-7))).sum(dim=1).mean()
            self.r_optimizer = reconstruct.optimizer

        if self.algo == 'a2c':
            self.agent = A2C_ACKTR(self.actor_critic,
                                   self.value_loss_coef,
                                   self.entropy_coef,
                                   lr=self.lr,
                                   eps=self.eps,
                                   alpha=self.alpha,
                                   max_grad_norm=self.max_grad_norm)
        elif self.algo == 'ppo':
            self.agent = PPO(self.actor_critic,
                             self.clip_param,
                             self.ppo_epoch,
                             self.num_mini_batch,
                             self.value_loss_coef,
                             self.entropy_coef,
                             lr=self.lr,
                             eps=self.eps,
                             max_grad_norm=self.max_grad_norm,
                             use_clipped_value_loss=False)
        elif self.algo == 'acktr':
            self.agent = algo.A2C_ACKTR(self.actor_critic,
                                        self.value_loss_coef,
                                        self.entropy_coef,
                                        acktr=True)

        self.gail = False
        self.gail_experts_dir = './gail_experts'
        if self.gail:
            # Dead code while self.gail is hard-coded False just above.
            assert len(self.envs.observation_space.shape) == 1
            self.gail_discr = gail.Discriminator(
                self.envs.observation_space.shape[0] +
                self.envs.action_space.shape[0], 100, self.device)
            # Fix: the original referenced an undefined `env_name`.
            file_name = os.path.join(
                self.gail_experts_dir,
                "trajs_{}.pt".format(self.env_def.name.split('-')[0].lower()))

            self.gail_train_loader = torch.utils.data.DataLoader(
                gail.ExpertDataset(file_name,
                                   num_trajectories=4,
                                   subsample_frequency=20),
                batch_size=self.gail_batch_size,
                shuffle=True,
                drop_last=True)

        self.rollouts = RolloutStorage(
            self.num_steps, self.num_processes,
            self.envs.observation_space.shape, self.envs.action_space,
            self.actor_critic.recurrent_hidden_state_size)

    def load(self, path, version):
        """Load the actor-critic checkpoint `agent_<version>.tar` from `path`,
        store it on self, and return it."""
        policy = torch.load(os.path.join(path, "agent_{}.tar".format(version)))
        self.actor_critic = policy
        # Fix: the original returned None, so `self.actor_critic =
        # self.load(...)` in __init__ silently overwrote the loaded policy.
        return policy

    def save(self, path, version):
        """Serialise the whole actor-critic module to `agent_<version>.tar`."""
        torch.save(self.actor_critic,
                   os.path.join(path, "agent_{}.tar".format(version)))

    def report(self, version, total_num_steps, FPS, rewards):
        """Append one row of reward statistics to the results CSV,
        writing a header first if the file is new."""
        file_path = os.path.join(self.save_dir, "actor_critic_results.csv")
        add_header = not os.path.exists(file_path)
        # NaNs when no episode finished yet (renamed locals so the builtins
        # min/max are no longer shadowed).
        if (len(rewards) > 0):
            r_mean, r_median = np.mean(rewards), np.median(rewards)
            r_min, r_max = np.min(rewards), np.max(rewards)
        else:
            r_mean = r_median = r_min = r_max = np.nan
        with open(file_path, 'a+') as results:
            writer = csv.writer(results)
            if (add_header):
                header = [
                    'update', 'total_steps', 'FPS', 'mean_reward',
                    'median_reward', 'min_reward', 'max_reward'
                ]
                writer.writerow(header)
            writer.writerow(
                (version, total_num_steps, FPS, r_mean, r_median, r_min,
                 r_max))

    def set_envs(self, level_path=None, num_envs=None):
        """(Re)create the vectorised envs when the level or env count changes;
        otherwise keep the current ones."""
        num_envs = num_envs if num_envs else self.num_processes
        if (level_path != self.level_path or self.envs is None
                or num_envs != self.num_envs):
            if (self.envs is not None):
                self.envs.close()
            self.level_path = level_path
            self.envs = make_vec_envs(self.env_def, level_path, self.seed,
                                      num_envs, self.gamma, self.log_dir,
                                      self.device, True)
        self.num_envs = num_envs

    def update_reconstruction(self, rollouts):
        """One optimisation step of the auxiliary reconstruction loss over all
        observations in `rollouts`; returns the loss tensor."""
        s, p, l, w, h = list(rollouts.obs.size())
        x = rollouts.obs.view(-1, l, w, h)
        hidden = rollouts.recurrent_hidden_states.view(s * p, -1)
        mask = rollouts.masks.view(s * p, -1)
        # Reconstruction target is the observation itself.
        y = x

        self.r_optimizer.zero_grad()
        self.agent.optimizer.zero_grad()
        _, predictions, _ = self.actor_critic.base(x, hidden, mask)
        reconstructions = self.r_model(predictions)
        loss = self.r_loss(reconstructions, y)
        loss.backward()
        self.r_optimizer.step()
        self.agent.optimizer.step()
        return loss

    def update_reconstruct_next(self, rollouts):
        """Train the reconstruction head to predict the *next* observation
        from the current one; returns the loss tensor."""
        # Mask out transitions that straddle an episode boundary.
        mask = rollouts.masks.unfold(0, 2, 1).min(-1)[0]
        mask = mask.view(-1)
        mask = torch.nonzero(mask).squeeze()

        # Consecutive observation pairs: x = frame t, y = frame t+1.
        l, w, h = list(rollouts.obs.size())[2:]
        img_pairs = rollouts.obs.unfold(0, 2, 1)
        img_pairs = img_pairs.view(-1, l, w, h, 2)
        img_pairs = img_pairs[mask]
        x = img_pairs[:, :, :, :, 0]
        y = img_pairs[:, :, :, :, 1]

        # Hidden states aligned with the first frame of each pair.
        hidden_size = rollouts.recurrent_hidden_states.size(2)
        hidden = rollouts.recurrent_hidden_states[:-1].view(-1, hidden_size)
        hidden = hidden[mask]

        # Update model (all surviving pairs are valid, hence all-ones mask).
        self.r_optimizer.zero_grad()
        mask = torch.ones_like(mask).float().unsqueeze(1)
        _, predictions, _ = self.actor_critic.base(x, hidden, mask)
        reconstructions = self.r_model(predictions)
        loss = self.r_loss(reconstructions, y)
        loss.backward()
        self.r_optimizer.step()
        print(loss.item())
        return loss

    def play(self, env, runs=1, visual=False):
        """Play `runs` games and return the mean score.

        NOTE(review): the passed-in `env` is immediately replaced by a fresh
        GridGame() — confirm that is intended.
        """
        env = GridGame()
        reward_mean = 0
        for i in range(runs):
            score = self.play_game(env, visual)
            reward_mean += score / runs
        # Fix: the original returned the undefined name `score_mean`.
        return reward_mean

    def play_game(self, level, visual=False):
        """Roll out the current policy deterministically in evaluation envs.

        `visual` was added (default False, currently unused) so play()'s
        two-argument call no longer raises TypeError. `level` is accepted for
        interface compatibility but not used by the body.
        """
        # Fixes below: the original referenced the undefined globals
        # `env_name`, `eval_log_dir`, `device` and `actor_critic`; the
        # corresponding self attributes clearly exist.
        env_name = self.env_def.name
        eval_envs = make_vec_envs(env_name, self.seed + self.num_processes,
                                  self.num_processes, None, self.eval_log_dir,
                                  self.device, True)

        vec_norm = utils.get_vec_normalize(eval_envs)
        if vec_norm is not None:
            vec_norm.eval()
            # Sync observation normalisation with the training envs (the
            # original assigned an undefined `ob_rms`).
            vec_norm.ob_rms = getattr(utils.get_vec_normalize(self.envs),
                                      'ob_rms', None)

        eval_episode_rewards = []

        obs = eval_envs.reset()
        eval_recurrent_hidden_states = torch.zeros(
            self.num_processes,
            self.actor_critic.recurrent_hidden_state_size).to(self.device)
        eval_masks = torch.zeros(self.num_processes, 1).to(self.device)

        while len(eval_episode_rewards) < 10:
            with torch.no_grad():
                _, action, _, eval_recurrent_hidden_states = \
                    self.actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

            # Observe reward and next obs
            obs, _, done, infos = eval_envs.step(action)
            eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                       for done_ in done],
                                      dtype=torch.float32).to(self.device)

            # Fix: `done` is per-environment, so plain truthiness is ambiguous
            # on arrays; report only when every env has finished.
            if all(done):
                print("Done!")
        eval_envs.close()

    def train_agent(self, num_env_steps):
        """Main training loop: collect rollouts, update the agent, log to
        TensorBoard, and periodically append results to the CSV report."""
        obs = self.envs.reset()
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)

        # Rolling windows over the last n finished episodes.
        n = 30
        episode_rewards = deque(maxlen=n)
        episode_values = deque(maxlen=n)
        episode_end_values = deque(maxlen=n)
        episode_end_probs = deque(maxlen=n)
        episode_lengths = deque(maxlen=n)
        compile_est = deque(maxlen=n)
        first_steps = [True for i in range(self.num_processes)]

        start = time.time()
        num_updates = int(
            num_env_steps) // self.num_steps // self.num_processes
        for j in range(num_updates):

            if self.use_linear_lr_decay:
                # decrease learning rate linearly
                utils.update_linear_schedule(
                    self.agent.optimizer, j, num_updates,
                    self.agent.optimizer.lr
                    if self.algo == "acktr" else self.lr)

            for step in range(self.num_steps):
                # Sample actions
                with torch.no_grad():
                    value, Q, action, action_prob, action_log_prob, recurrent_hidden_states = \
                        self.actor_critic.act(self.rollouts.obs[step],
                            self.rollouts.recurrent_hidden_states[step],
                            self.rollouts.masks[step])

                # Observe reward and next obs
                obs, reward, done, infos = self.envs.step(action)

                # Track value estimates at episode starts/ends (renamed the
                # loop variable, which previously shadowed `step`).
                for i, was_first in enumerate(first_steps):
                    if was_first:
                        episode_values.append(value[i].item())
                    elif (done[i]):
                        episode_end_values.append(Q[i].item())
                        episode_end_probs.append(action_log_prob[i].item())
                first_steps = done

                for worker, info in enumerate(infos):
                    if 'episode' in info.keys():
                        r = info['episode']['r']
                        l = info['episode']['l']
                        episode_rewards.append(r)
                        episode_lengths.append(l)
                        if (r < -1):
                            compile_est.append(value[worker].item())

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])
                bad_masks = torch.FloatTensor(
                    [[0.0] if 'bad_transition' in info.keys() else [1.0]
                     for info in infos])
                self.rollouts.insert(obs, recurrent_hidden_states, action,
                                     action_prob, action_log_prob, value, Q,
                                     reward, masks, bad_masks)

            with torch.no_grad():
                next_value = self.actor_critic.get_value(
                    self.rollouts.obs[-1],
                    self.rollouts.recurrent_hidden_states[-1],
                    self.rollouts.masks[-1]).detach()

            if self.gail:
                if j >= 10:
                    self.envs.venv.eval()

                gail_epoch = self.gail_epoch
                if j < 10:
                    gail_epoch = 100  # Warm up
                for _ in range(gail_epoch):
                    self.gail_discr.update(
                        self.gail_train_loader, self.rollouts,
                        utils.get_vec_normalize(self.envs)._obfilt)

                # Replace env rewards with the discriminator's reward signal.
                for step in range(self.num_steps):
                    self.rollouts.rewards[
                        step] = self.gail_discr.predict_reward(
                            self.rollouts.obs[step],
                            self.rollouts.actions[step], self.gamma,
                            self.rollouts.masks[step])

            self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                          self.gae_lambda,
                                          self.use_proper_time_limits)

            value_loss, action_loss, dist_entropy = self.agent.update(
                self.rollouts)
            if (self.reconstruct):
                recon_loss = self.update_reconstruction(self.rollouts)
                self.writer.add_scalar('generator/Reconstruction Loss',
                                       recon_loss.item(), self.total_steps)

            self.rollouts.after_update()

            #Tensorboard Reporting
            self.total_steps += self.num_processes * self.num_steps
            self.writer.add_scalar('value/Mean Reward',
                                   np.mean(episode_rewards), self.total_steps)
            self.writer.add_scalar('value/Episode Mean Length',
                                   np.mean(episode_lengths), self.total_steps)
            self.writer.add_scalar('policy/Action Loss', action_loss,
                                   self.total_steps)
            self.writer.add_scalar('value/Value Loss', value_loss,
                                   self.total_steps)
            self.writer.add_scalar('policy/Distribution Entropy', dist_entropy,
                                   self.total_steps)
            self.writer.add_scalar('value/Win Probability',
                                   np.mean(np.array(episode_rewards) > 0),
                                   self.total_steps)
            self.writer.add_scalar('value/Starting Value',
                                   np.mean(episode_values), self.total_steps)
            self.writer.add_scalar('value/Log Probs',
                                   np.mean(episode_end_probs),
                                   self.total_steps)
            if (len(compile_est) > 0):
                self.writer.add_scalar('value/Compile Estimate',
                                       np.mean(compile_est), self.total_steps)

            # save for every interval-th episode or for the last epoch
            total_num_steps = (j + 1) * self.num_processes * self.num_steps
            end = time.time()
            if (j % self.save_interval == 0
                    or j == num_updates - 1) and self.save_dir != "":
                self.version += 1
                self.report(self.version, total_num_steps,
                            int(total_num_steps / (end - start)),
                            episode_rewards)

            if j % self.log_interval == 0 and len(episode_rewards) > 1:
                # The trailing loss args exceed the format placeholders and
                # are ignored by str.format (kept for parity with original).
                print(
                    "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                    .format(j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            len(episode_rewards), np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards), np.max(episode_rewards),
                            dist_entropy, value_loss, action_loss))
Ejemplo n.º 9
0
    def initialise_policy(self):
        """Create the rollout storage, the policy network, and the trainer.

        The trainer (A2C or PPO, selected via ``args.policy``) is stored on
        ``self.policy`` and also receives the VAE optimiser.
        """
        # on-policy rollout buffer
        self.policy_storage = OnlineStorage(
            self.args,
            self.args.policy_num_steps,
            self.args.num_processes,
            self.args.obs_dim,
            self.args.act_space,
            hidden_size=self.args.aggregator_hidden_size,
            latent_dim=self.args.latent_dim,
            normalise_observations=self.args.norm_obs_for_policy,
            normalise_rewards=self.args.norm_rew_for_policy,
        )

        # Policy input size: the observation (if conditioning on state) plus
        # the latent. When embeddings are not sampled, both latent mean and
        # variance are fed in, doubling the latent contribution.
        input_dim = self.args.obs_dim * int(self.args.condition_policy_on_state)
        latent_factor = 2 if not self.args.sample_embeddings else 1
        input_dim += latent_factor * self.args.latent_dim

        # continuous action spaces expose bounds; discrete ones do not
        action_space = self.envs.action_space
        if hasattr(action_space, 'low'):
            action_low, action_high = action_space.low, action_space.high
        else:
            action_low = action_high = None

        # policy network
        policy_net = Policy(
            state_dim=input_dim,
            action_space=self.args.act_space,
            hidden_layers=self.args.policy_layers,
            activation_function=self.args.policy_activation_function,
            init_std=self.args.policy_init_std,
            normalise_actions=self.args.normalise_actions,
            action_low=action_low,
            action_high=action_high,
        ).to(device)

        # trainer arguments shared by both algorithms
        shared_kwargs = dict(
            optimiser_vae=self.vae.optimiser_vae,
            lr=self.args.lr_policy,
            eps=self.args.policy_eps,
        )
        if self.args.policy == 'a2c':
            self.policy = A2C(
                policy_net,
                self.args.policy_value_loss_coef,
                self.args.policy_entropy_coef,
                alpha=self.args.a2c_alpha,
                **shared_kwargs,
            )
        elif self.args.policy == 'ppo':
            self.policy = PPO(
                policy_net,
                self.args.policy_value_loss_coef,
                self.args.policy_entropy_coef,
                ppo_epoch=self.args.ppo_num_epochs,
                num_mini_batch=self.args.ppo_num_minibatch,
                use_huber_loss=self.args.ppo_use_huberloss,
                use_clipped_value_loss=self.args.ppo_use_clipped_value_loss,
                clip_param=self.args.ppo_clip_param,
                **shared_kwargs,
            )
        else:
            raise NotImplementedError
Ejemplo n.º 10
0
    def initialise_policy(self):
        """Build the policy network and wrap it in an A2C/PPO trainer.

        The latent is deliberately not passed to the policy here
        (``pass_latent_to_policy=False``, ``dim_latent=0``); use
        metalearner.py for the VAE-conditioned variant.

        Returns:
            The initialised A2C or PPO trainer.
        """
        action_space = self.envs.action_space
        # continuous action spaces expose bounds; discrete ones do not
        if hasattr(action_space, 'low'):
            action_low, action_high = action_space.low, action_space.high
        else:
            action_low = action_high = None

        # policy network
        policy_net = Policy(
            args=self.args,
            # which inputs the policy conditions on
            pass_state_to_policy=self.args.pass_state_to_policy,
            pass_latent_to_policy=False,  # use metalearner.py if you want to use the VAE
            pass_belief_to_policy=self.args.pass_belief_to_policy,
            pass_task_to_policy=self.args.pass_task_to_policy,
            dim_state=self.args.state_dim,
            dim_latent=0,
            dim_belief=self.args.belief_dim,
            dim_task=self.args.task_dim,
            # architecture
            hidden_layers=self.args.policy_layers,
            activation_function=self.args.policy_activation_function,
            policy_initialisation=self.args.policy_initialisation,
            # action head
            action_space=action_space,
            init_std=self.args.policy_init_std,
            norm_actions_of_policy=self.args.norm_actions_of_policy,
            action_low=action_low,
            action_high=action_high,
        ).to(device)

        # trainer arguments shared by both algorithms
        shared_kwargs = dict(
            policy_optimiser=self.args.policy_optimiser,
            policy_anneal_lr=self.args.policy_anneal_lr,
            train_steps=self.num_updates,
            lr=self.args.lr_policy,
            eps=self.args.policy_eps,
        )
        if self.args.policy == 'a2c':
            policy = A2C(
                self.args,
                policy_net,
                self.args.policy_value_loss_coef,
                self.args.policy_entropy_coef,
                **shared_kwargs,
            )
        elif self.args.policy == 'ppo':
            policy = PPO(
                self.args,
                policy_net,
                self.args.policy_value_loss_coef,
                self.args.policy_entropy_coef,
                ppo_epoch=self.args.ppo_num_epochs,
                num_mini_batch=self.args.ppo_num_minibatch,
                use_huber_loss=self.args.ppo_use_huberloss,
                use_clipped_value_loss=self.args.ppo_use_clipped_value_loss,
                clip_param=self.args.ppo_clip_param,
                **shared_kwargs,
            )
        else:
            raise NotImplementedError

        return policy
Ejemplo n.º 11
0
    os.makedirs(args.model_dir)
    except:
        # NOTE(review): bare ``except:`` catches everything (including
        # KeyboardInterrupt); presumably only "directory exists" is expected
        # here — prefer ``except OSError:``. The directory is wiped and
        # recreated on failure.
        shutil.rmtree(args.model_dir)
        os.makedirs(args.model_dir)

    # probe the environment once to discover the observation shape
    env = deepmind_lab.Lab('tests/empty_room_test', ['RGB_INTERLEAVED'],
                           config=CONFIG)
    env.reset()

    obs_shape = env.observations()['RGB_INTERLEAVED'].shape
    # reorder the interleaved (H, W, C) image to channels-first (C, H, W)
    obs_shape = (obs_shape[2], obs_shape[0], obs_shape[1])
    print('Observation Space: ', obs_shape)
    action_space = Discrete(9)
    env.close()

    # central actor-critic network handed to the learner
    actor_critic = Policy(obs_shape, action_space)
    actor_critic.to(args.device)

    learner = Learner(args, q_batch, actor_critic)

    # one actor per level, each with its own rollout storage and a fresh
    # copy of the policy network
    for i in range(len(LEVELS)):
        print('Build Actor {:d}'.format(i))
        rollouts = RolloutStorage(args.num_steps, 1, obs_shape, action_space,
                                  actor_critic.recurrent_hidden_state_size)
        actor_critic = Policy(obs_shape, action_space)
        actor_critic.to(args.device)

        actor_name = 'actor_' + str(i)
        actor = Actor(args, q_trace, learner, actor_critic, rollouts,
                      LEVELS[i], actor_name)
        actors.append(actor)
Ejemplo n.º 12
0
 def Policy(self):
     """Return a new module-level ``Policy`` instance wrapping this object.

     Inside the method body the name ``Policy`` resolves to the global
     class, not to this method, so the call does not recurse.
     """
     return Policy(self)
Ejemplo n.º 13
0
    def initialise_policy(self):
        """Set up rollout storage, the policy network, and the trainer.

        For '(Belief)Oracle' environments the observation concatenates the
        base state with a task description, so a task encoder is enabled and
        the task dimension is recovered by comparing observation sizes of the
        oracle and non-oracle environment variants.
        """
        # dimensionality of the raw environment state
        state_dim = self.envs.observation_space.shape[0]

        # TODO: this isn't ideal, find a nicer way to get the task dimension!
        # Check 'BeliefOracle' first: 'Oracle' is a substring of it.
        use_task_encoder = False
        task_dim = latent_dim = state_embedding_size = 0
        for oracle_tag in ('BeliefOracle', 'Oracle'):
            if oracle_tag in self.args.env_name:
                oracle_obs = gym.make(
                    self.args.env_name).observation_space.shape[0]
                base_obs = gym.make(
                    self.args.env_name.replace(
                        oracle_tag, '')).observation_space.shape[0]
                task_dim = oracle_obs - base_obs
                latent_dim = self.args.latent_dim
                state_embedding_size = self.args.state_embedding_size
                use_task_encoder = True
                break

        # on-policy rollout buffer
        self.policy_storage = OnlineStorage(
            self.args,
            self.args.policy_num_steps,
            self.args.num_processes,
            self.args.obs_dim,
            self.args.act_space,
            hidden_size=0,
            latent_dim=self.args.latent_dim,
            normalise_observations=self.args.norm_obs_for_policy,
            normalise_rewards=self.args.norm_rew_for_policy,
        )

        # continuous action spaces expose bounds; discrete ones do not
        action_space = self.envs.action_space
        if hasattr(action_space, 'low'):
            action_low, action_high = action_space.low, action_space.high
        else:
            action_low = action_high = None

        # policy network
        policy_net = Policy(
            # general
            state_dim=int(self.args.condition_policy_on_state) * state_dim,
            action_space=action_space,
            init_std=self.args.policy_init_std,
            hidden_layers=self.args.policy_layers,
            activation_function=self.args.policy_activation_function,
            use_task_encoder=use_task_encoder,
            # task encoding things (for oracle)
            task_dim=task_dim,
            latent_dim=latent_dim,
            state_embed_dim=state_embedding_size,
            # action bounds / normalisation
            normalise_actions=self.args.normalise_actions,
            action_low=action_low,
            action_high=action_high,
        ).to(device)

        # trainer arguments shared by both algorithms
        shared_kwargs = dict(
            lr=self.args.lr_policy,
            eps=self.args.policy_eps,
        )
        if self.args.policy == 'a2c':
            self.policy = A2C(
                policy_net,
                self.args.policy_value_loss_coef,
                self.args.policy_entropy_coef,
                alpha=self.args.a2c_alpha,
                **shared_kwargs,
            )
        elif self.args.policy == 'ppo':
            self.policy = PPO(
                policy_net,
                self.args.policy_value_loss_coef,
                self.args.policy_entropy_coef,
                ppo_epoch=self.args.ppo_num_epochs,
                num_mini_batch=self.args.ppo_num_minibatch,
                use_huber_loss=self.args.ppo_use_huberloss,
                use_clipped_value_loss=self.args.ppo_use_clipped_value_loss,
                clip_param=self.args.ppo_clip_param,
                **shared_kwargs,
            )
        else:
            raise NotImplementedError
seg.eval()
# optionally resume fine-tuned segmentation weights; non-strict loading
# tolerates missing/extra keys in the checkpoint
if opt_seg.ft:
    checkpoint = torch.load(opt_seg.ft_resume)
    seg.module.load_state_dict(checkpoint['state_dict'], strict=False)
    # self.logger.info("=> loaded checkpoint '{}' (epoch {})".format(args.ft_resume, checkpoint['epoch']))
####################################################

##### Policy #################################################
from models.policy import Policy

# policy-gradient hyperparameters
lr = 3e-4  # learning rate
gamma = 1  # discount factor
lambd_entropy = 0.3  # entropy-bonus weight
# hidden_dim = number of action choices
# policy = Policy(hidden_dim=2, rnn_type='lstm')
policy = Policy(hidden_dim=4, input_dim=23, rnn_type=None)
# presumably negative/positive reward magnitudes — TODO confirm against the
# reward computation further down
r_neg = 5
r_pos = 5

if evaluation == "policy":
    # checkpoint loading is currently disabled; the policy runs with its
    # initial weights in eval mode
    # checkpoint = torch.load("/home/chenwy/DynamicLightEnlighten/bdd100k_seg/policy_model/image.Lab.seg_0.75_msn.act4_adapt20.0.55.clip5.5_entropy0.3_lr3e3.800epoch_2019-04-02-03-48/model_best.pth.tar")
    # policy.load_state_dict(checkpoint['state_dict'])
    policy.eval()
else:
    # training branch: optimiser parameter groups (all variants commented out;
    # the branch continues beyond this excerpt)
    # params_list = [{'params': policy.vgg.parameters(), 'lr': lr},]
    # params_list.append({'params': policy.rnn.parameters(), 'lr': lr*10})
    # params_list = [{'params': policy.resnet.parameters(), 'lr': lr},]
    ##################################
    # params_list = [{'params': policy.fcn.pretrained.parameters(), 'lr': lr*10},
    #                 {'params': policy.fcn.head.parameters(), 'lr': lr*10}]
    ##################################
Ejemplo n.º 15
0
# wrap the segmentation net for multi-GPU execution and freeze it in eval mode
seg = DataParallelModel(seg).cuda()
seg.eval()
# optionally resume fine-tuned segmentation weights; non-strict loading
# tolerates missing/extra keys in the checkpoint
if opt_seg.ft:
    checkpoint = torch.load(opt_seg.ft_resume)
    seg.module.load_state_dict(checkpoint['state_dict'], strict=False)
    # self.logger.info("=> loaded checkpoint '{}' (epoch {})".format(args.ft_resume, checkpoint['epoch']))
####################################################

##### Policy #################################################
from models.policy import Policy

# policy-gradient hyperparameters
lr = 3e-4  # learning rate
gamma = 1  # discount factor
lambd_entropy = 0.3  # entropy-bonus weight
# hidden_dim presumably = number of action choices — confirm in models.policy
# policy = Policy(hidden_dim=2, rnn_type='lstm')
policy = Policy(hidden_dim=4, input_dim=23, rnn_type=None)
# presumably negative/positive reward magnitudes — TODO confirm against the
# reward computation further down
r_neg = 5
r_pos = 5

if evaluation:
    # evaluation: restore a pretrained policy checkpoint and freeze it
    checkpoint = torch.load(
        "/home/chenwy/DynamicLightEnlighten/bdd100k_seg/policy_model/image_lstm.vgg.avgpool.argmax_delta.clip1.2.action-mean0.975_entropy.0_gamma.1_lr1e4_update.5_2019-02-01-13-05/model_best.pth.tar"
    )
    policy.load_state_dict(checkpoint['state_dict'])
    policy.eval()
else:
    # training branch: optimiser parameter groups (all variants commented out;
    # the branch continues beyond this excerpt)
    # params_list = [{'params': policy.vgg.parameters(), 'lr': lr},]
    # params_list.append({'params': policy.rnn.parameters(), 'lr': lr*10})
    # params_list = [{'params': policy.resnet.parameters(), 'lr': lr},]
    ##################################
    # params_list = [{'params': policy.fcn.pretrained.parameters(), 'lr': lr*10},