Example #1
    def __init__(self, policy_model, input_shape, action_size, pixel_control=True,
                 RP=1.0, PC=1.0, VR=1.0, entropy_coeff=0.001, value_coeff=0.5,
                 lr=1e-3, lr_final=1e-4, decay_steps=50e6, grad_clip=0.5,
                 policy_args={}, optim=torch.optim.RMSprop, device='cuda', optim_args={}):
        super(UnrealA2C2, self).__init__()
        self.RP, self.PC, self.VR = RP, PC, VR
        self.lr = lr
        self.entropy_coeff, self.value_coeff = entropy_coeff, value_coeff
        self.pixel_control = pixel_control
        self.grad_clip = grad_clip
        self.action_size = action_size
        self.device = device

        try:
            iter(input_shape)
        except TypeError:
            input_shape = (input_shape,)  # wrap a scalar size into a 1-tuple
        
        self.policy = ActorCritic(policy_model, input_shape, action_size, entropy_coeff=entropy_coeff, value_coeff=value_coeff, 
                                  build_optimiser=False, device=device, **policy_args)

        if pixel_control:
            self.feat_map = torch.nn.Sequential(torch.nn.Linear(self.policy.dense_size, 32*8*8), torch.nn.ReLU()).to(device)
            self.deconv1 = torch.nn.Sequential(torch.nn.ConvTranspose2d(32, 32, kernel_size=[3,3], stride=[1,1]), torch.nn.ReLU()).to(device)
            self.deconv_advantage = torch.nn.ConvTranspose2d(32, action_size, kernel_size=[3,3], stride=[2,2]).to(device)
            self.deconv_value = torch.nn.ConvTranspose2d(32, 1, kernel_size=[3,3], stride=[2,2]).to(device)
                
        # reward prediction head (classifies the upcoming reward as negative, zero or positive)
        self.r1 = torch.nn.Sequential(torch.nn.Linear(self.policy.dense_size, 128), torch.nn.ReLU()).to(device)
        self.r2 = torch.nn.Linear(128, 3).to(device)

        self.optimiser = optim(self.parameters(), lr, **optim_args)
        self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1)
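The pixel-control branch above maps the policy's dense features back onto a spatial grid of auxiliary Q-values, one channel per action. A minimal standalone sketch of that forward pass is below; the sizes, the reshape to 32x8x8 and the dueling combination of the value and advantage deconvolutions are assumptions, since the snippet only constructs the layers.

import torch

dense_size, action_size, batch = 512, 6, 4   # hypothetical sizes

feat_map = torch.nn.Sequential(torch.nn.Linear(dense_size, 32 * 8 * 8), torch.nn.ReLU())
deconv1 = torch.nn.Sequential(torch.nn.ConvTranspose2d(32, 32, kernel_size=3, stride=1), torch.nn.ReLU())
deconv_advantage = torch.nn.ConvTranspose2d(32, action_size, kernel_size=3, stride=2)
deconv_value = torch.nn.ConvTranspose2d(32, 1, kernel_size=3, stride=2)

dense = torch.randn(batch, dense_size)                      # stand-in for the policy's dense features
spatial = feat_map(dense).reshape(batch, 32, 8, 8)          # lift features onto an 8x8 grid
hidden = deconv1(spatial)                                   # 8x8 -> 10x10
advantage = deconv_advantage(hidden)                        # 10x10 -> 21x21, one map per action
value = deconv_value(hidden)                                # 10x10 -> 21x21, single map
q_aux = value + advantage - advantage.mean(dim=1, keepdim=True)  # dueling aggregation (assumed)
print(q_aux.shape)  # torch.Size([4, 6, 21, 21])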
Example #2
    def __init__(self,
                 policy_model,
                 target_model,
                 input_size,
                 action_size,
                 entropy_coeff=0.001,
                 intr_coeff=0.5,
                 extr_coeff=1.0,
                 lr=1e-4,
                 lr_final=0,
                 decay_steps=1e5,
                 grad_clip=0.5,
                 policy_clip=0.1,
                 policy_args={},
                 RND_args={},
                 optim=torch.optim.Adam,
                 optim_args={},
                 device='cuda'):
        super(RND, self).__init__()
        self.intr_coeff = intr_coeff
        self.extr_coeff = extr_coeff
        self.entropy_coeff = entropy_coeff
        self.lr = lr
        self.grad_clip = grad_clip
        self.action_size = action_size
        self.device = device

        # only use the last frame of the frame-stack for the convolutional networks
        target_size = (1, input_size[1], input_size[2]) if len(input_size) == 3 else input_size

        self.policy = PPOIntrinsic(policy_model,
                                   input_size,
                                   action_size,
                                   lr,
                                   lr_final,
                                   decay_steps,
                                   grad_clip,
                                   entropy_coeff=entropy_coeff,
                                   policy_clip=policy_clip,
                                   extr_coeff=extr_coeff,
                                   intr_coeff=intr_coeff,
                                   device=device,
                                   build_optimiser=False,
                                   **policy_args)

        # randomly weighted and fixed neural network; acts as a random embedding for each state
        self.target_model = target_model(target_size,
                                         trainable=False).to(device)

        # learns to predict the target model's output
        # i.e. provides rewards based on how well a fixed random function can be predicted, thus behaving as a density estimate of explored areas
        self.predictor_model = target_model(target_size,
                                            trainable=True).to(device)

        self.optimiser = optim(self.parameters(), lr, **optim_args)
        self.scheduler = polynomial_sheduler(self.optimiser,
                                             lr_final,
                                             decay_steps,
                                             power=1)
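The target/predictor pair above is the core of RND: the intrinsic reward is the predictor's error against a fixed, randomly initialised target network. A minimal sketch with stand-in MLPs follows; the real target_model architecture and any observation or reward normalisation are not shown in the snippet, so every concrete choice here is illustrative.

import torch

obs_dim, feat_dim, batch = 16, 32, 8                       # hypothetical sizes

target = torch.nn.Linear(obs_dim, feat_dim)                # fixed random network
predictor = torch.nn.Sequential(torch.nn.Linear(obs_dim, 64), torch.nn.ReLU(), torch.nn.Linear(64, feat_dim))
for p in target.parameters():
    p.requires_grad_(False)                                # target stays frozen

obs = torch.randn(batch, obs_dim)
with torch.no_grad():
    target_feat = target(obs)
pred_feat = predictor(obs)

# per-state intrinsic reward = prediction error against the fixed random embedding
intr_reward = (pred_feat.detach() - target_feat).pow(2).mean(dim=-1)
# predictor loss, trained alongside the PPO objective
predictor_loss = torch.nn.functional.mse_loss(pred_feat, target_feat)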
Example #3
    def __init__(self,
                 model,
                 input_shape,
                 action_size,
                 lr=1e-3,
                 lr_final=0,
                 decay_steps=6e5,
                 grad_clip=0.5,
                 build_optimiser=True,
                 optim=torch.optim.Adam,
                 optim_args={},
                 device='cuda',
                 **model_args):
        super(ValueModel, self).__init__()
        self.lr = lr
        self.lr_final = lr_final
        self.action_size = action_size
        self.decay_steps = decay_steps
        self.grad_clip = grad_clip
        self.device = device

        self.model = model(input_shape, **model_args).to(self.device)
        dense_size = self.model.dense_size
        self.V = torch.nn.Linear(dense_size, 1).to(self.device)

        if build_optimiser:
            self.optimiser = optim(self.parameters(), lr, **optim_args)
            self.scheduler = polynomial_sheduler(self.optimiser,
                                                 lr_final,
                                                 decay_steps,
                                                 power=1)
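All of these snippets pair the optimiser with polynomial_sheduler(optimiser, lr_final, decay_steps, power=1) (spelled as in the source), whose definition is not shown. Below is a plausible stand-in that anneals the learning rate from its initial value towards lr_final over decay_steps using torch's LambdaLR; the real helper may differ.

import torch

def polynomial_sheduler(optimiser, lr_final, decay_steps, power=1):
    """Decay each param group's lr from its initial value towards lr_final over decay_steps."""
    lr_init = optimiser.param_groups[0]['lr']

    def factor(step):
        frac = min(step / decay_steps, 1.0)
        lr = (lr_init - lr_final) * (1.0 - frac) ** power + lr_final
        return lr / lr_init            # LambdaLR multiplies the base lr by this factor

    return torch.optim.lr_scheduler.LambdaLR(optimiser, lr_lambda=factor)

params = [torch.nn.Parameter(torch.zeros(1))]
opt = torch.optim.Adam(params, lr=1e-3)
sched = polynomial_sheduler(opt, lr_final=0, decay_steps=6e5, power=1)
opt.step()
sched.step()                           # advance the decay once per training update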
Example #4
    def __init__(self,
                 policy_model,
                 ICM_model,
                 input_size,
                 action_size,
                 forward_coeff,
                 policy_importance,
                 reward_scale,
                 entropy_coeff,
                 value_coeff=0.5,
                 lr=1e-3,
                 lr_final=1e-3,
                 decay_steps=6e5,
                 grad_clip=0.5,
                 policy_args={},
                 ICM_args={},
                 device='cuda'):
        super(Curiosity, self).__init__()
        self.reward_scale, self.forward_coeff, self.policy_importance, self.entropy_coeff = reward_scale, forward_coeff, policy_importance, entropy_coeff
        self.lr, self.lr_final, self.decay_steps = lr, lr_final, decay_steps
        self.grad_clip = grad_clip
        self.action_size = action_size
        self.device = device

        try:
            iterator = iter(input_size)
        except TypeError:
            input_size = (input_size, )

        self.ICM = ICM(ICM_model,
                       input_size,
                       action_size,
                       forward_coeff,
                       device=device,
                       **ICM_args)
        self.AC = ActorCritic(policy_model,
                              input_size,
                              action_size,
                              entropy_coeff,
                              value_coeff,
                              lr,
                              lr_final,
                              decay_steps,
                              grad_clip,
                              build_optimiser=False,
                              device=device,
                              **policy_args)

        self.optimiser = torch.optim.RMSprop(self.parameters(), lr=lr)
        self.scheduler = polynomial_sheduler(self.optimiser,
                                             lr_final,
                                             decay_steps,
                                             power=1)
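The ICM module wrapped above turns forward-model prediction error into a curiosity reward; the snippet only shows its construction. A minimal sketch of that idea with a hypothetical feature encoder and forward model follows (the real ICM_model, and its inverse-model loss, are not shown).

import torch
import torch.nn.functional as F

obs_dim, feat_dim, action_size, batch = 16, 32, 4, 8       # hypothetical sizes

encoder = torch.nn.Linear(obs_dim, feat_dim)                          # state -> features
forward_model = torch.nn.Linear(feat_dim + action_size, feat_dim)     # (features, action) -> next features

state = torch.randn(batch, obs_dim)
next_state = torch.randn(batch, obs_dim)
actions = torch.randint(0, action_size, (batch,))

phi, phi_next = encoder(state), encoder(next_state)
action_onehot = F.one_hot(actions, action_size).float()
phi_pred = forward_model(torch.cat([phi, action_onehot], dim=-1))

forward_error = 0.5 * (phi_pred - phi_next.detach()).pow(2).sum(dim=-1)  # per-sample error
intr_reward = forward_error.detach()      # scaled by reward_scale in the wrapper above
forward_loss = forward_error.mean()       # weighted by forward_coeff during training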
Example #5
    def __init__(self,
                 model,
                 input_size,
                 action_size,
                 lr=1e-3,
                 lr_final=0,
                 decay_steps=6e5,
                 grad_clip=0.5,
                 entropy_coeff=0.01,
                 policy_clip=0.1,
                 extr_coeff=2.0,
                 intr_coeff=1.0,
                 build_optimiser=True,
                 optim=torch.optim.Adam,
                 optim_args={},
                 device='cuda',
                 **model_args):
        super(PPOIntrinsic, self).__init__()
        self.action_size = action_size
        self.input_size = input_size

        self.lr = lr
        self.lr_final = lr_final
        self.decay_steps = decay_steps
        self.grad_clip = grad_clip

        self.entropy_coeff = entropy_coeff
        self.policy_clip = policy_clip
        self.extr_coeff = extr_coeff
        self.intr_coeff = intr_coeff

        self.device = device

        self.model = model(input_size, **model_args).to(self.device)
        self.dense_size = dense_size = self.model.dense_size
        self.policy = torch.nn.Sequential(
            torch.nn.Linear(dense_size, action_size),
            torch.nn.Softmax(dim=-1)).to(self.device)  # Actor
        self.Ve = torch.nn.Linear(dense_size, 1).to(self.device)  # Critic (extrinsic value)
        self.Vi = torch.nn.Linear(dense_size, 1).to(self.device)  # Critic (intrinsic value), i.e. expected intrinsic return of a state

        if build_optimiser:
            self.optimiser = optim(self.parameters(), lr, **optim_args)
            self.scheduler = polynomial_sheduler(self.optimiser,
                                                 lr_final,
                                                 decay_steps,
                                                 power=1)
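PPOIntrinsic keeps two critics (Ve and Vi) so extrinsic and intrinsic returns get separate value estimates, and the policy is updated on a weighted sum of the two advantages. Below is a sketch of the clipped surrogate objective implied by policy_clip, extr_coeff and intr_coeff; the repo's exact loss is not shown, so treat it as illustrative.

import torch

policy_clip, extr_coeff, intr_coeff, entropy_coeff = 0.1, 2.0, 1.0, 0.01
batch, action_size = 8, 4

probs = torch.softmax(torch.randn(batch, action_size), dim=-1)   # current policy output
old_probs = probs.detach()                                       # stand-in for the behaviour policy
actions = torch.randint(0, action_size, (batch,))
adv_extr, adv_intr = torch.randn(batch), torch.randn(batch)

advantage = extr_coeff * adv_extr + intr_coeff * adv_intr
ratio = probs[torch.arange(batch), actions] / old_probs[torch.arange(batch), actions]
unclipped = ratio * advantage
clipped = torch.clamp(ratio, 1 - policy_clip, 1 + policy_clip) * advantage
policy_loss = -torch.min(unclipped, clipped).mean()

entropy = -(probs * torch.log(probs + 1e-8)).sum(dim=-1).mean()
loss = policy_loss - entropy_coeff * entropy   # value losses for Ve and Vi would be added here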
Example #6
    def __init__(self,
                 model,
                 input_size,
                 action_size,
                 cell_size,
                 entropy_coeff=0.01,
                 value_coeff=0.5,
                 lr=1e-3,
                 lr_final=1e-6,
                 decay_steps=6e5,
                 grad_clip=0.5,
                 build_optimiser=True,
                 optim=torch.optim.RMSprop,
                 optim_args={},
                 device='cuda',
                 **model_args):
        super(ActorCritic_LSTM, self).__init__()
        self.lr = lr
        self.lr_final = lr_final
        self.input_size = input_size
        self.entropy_coeff = entropy_coeff
        self.value_coeff = value_coeff
        self.decay_steps = decay_steps
        self.grad_clip = grad_clip
        self.cell_size = cell_size
        self.action_size = action_size
        self.device = device

        self.model = model(input_size, **model_args).to(self.device)
        self.dense_size = self.model.dense_size
        #self.lstm = MaskedRNN(MaskedLSTMCell(cell_size, self.dense_size), time_major=True)
        self.lstm = MaskedLSTMBlock(self.dense_size,
                                    cell_size,
                                    time_major=True).to(self.device)

        self.policy_distrib = torch.nn.Linear(cell_size,
                                              action_size,
                                              device=self.device)  # Actor
        self.V = torch.nn.Linear(cell_size, 1, device=self.device)  # Critic

        if build_optimiser:
            self.optimiser = optim(self.parameters(), lr, **optim_args)
            self.scheduler = polynomial_sheduler(self.optimiser,
                                                 lr_final,
                                                 decay_steps,
                                                 power=1)
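The policy_distrib and V heads above feed a standard actor-critic loss weighted by value_coeff and entropy_coeff. A minimal sketch of that objective follows; the LSTM rollout and masking handled by MaskedLSTMBlock are omitted, and the exact loss form is an assumption based on the coefficients.

import torch
import torch.nn.functional as F

value_coeff, entropy_coeff = 0.5, 0.01
batch, action_size = 8, 4

logits = torch.randn(batch, action_size, requires_grad=True)   # stand-in for policy_distrib output
values = torch.randn(batch, requires_grad=True)                # stand-in for V output
returns = torch.randn(batch)                                   # n-step returns from the rollout
actions = torch.randint(0, action_size, (batch,))

log_probs = F.log_softmax(logits, dim=-1)
probs = log_probs.exp()
advantage = (returns - values).detach()                        # no gradient through the baseline

policy_loss = -(log_probs[torch.arange(batch), actions] * advantage).mean()
value_loss = 0.5 * (returns - values).pow(2).mean()
entropy = -(probs * log_probs).sum(dim=-1).mean()

loss = policy_loss + value_coeff * value_loss - entropy_coeff * entropy
loss.backward()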
Example #7
    def __init__(self,
                 model,
                 input_shape,
                 action_size,
                 lr=1e-3,
                 lr_final=0,
                 decay_steps=6e5,
                 grad_clip=0.5,
                 entropy_coeff=0.01,
                 policy_clip=0.1,
                 adv_coeff=0.25,
                 build_optimiser=True,
                 optim=torch.optim.Adam,
                 optim_args={},
                 device='cuda',
                 **model_args):
        super(PolicyModel, self).__init__()
        self.lr = lr
        self.lr_final = lr_final
        self.action_size = action_size
        self.entropy_coeff = entropy_coeff
        self.decay_steps = decay_steps
        self.grad_clip = grad_clip
        self.policy_clip = policy_clip
        self.adv_coeff = adv_coeff
        self.device = device

        self.model = model(input_shape, **model_args).to(self.device)
        dense_size = self.model.dense_size
        self.policy = torch.nn.Sequential(
            torch.nn.Linear(dense_size, action_size),
            torch.nn.Softmax(dim=-1)).to(self.device)
        self.Adv = torch.nn.Linear(dense_size, 1).to(self.device)

        if build_optimiser:
            self.optimiser = optim(self.parameters(), lr, **optim_args)
            self.scheduler = polynomial_sheduler(self.optimiser,
                                                 lr_final,
                                                 decay_steps,
                                                 power=1)
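Every class here stores grad_clip next to its optimiser and scheduler. A typical update step would clip the global gradient norm before stepping both; the repo's actual update method is not shown, so the sketch below is only indicative.

import torch

model = torch.nn.Linear(8, 2)            # stand-in for the full policy network
optimiser = torch.optim.Adam(model.parameters(), lr=1e-3)
grad_clip = 0.5

loss = model(torch.randn(4, 8)).pow(2).mean()     # placeholder loss

optimiser.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)   # cap the global grad norm at grad_clip
optimiser.step()
# scheduler.step() would follow here to advance the polynomial lr decay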
Example #8
    def __init__(self,
                 policy_model,
                 target_model,
                 input_size,
                 action_size,
                 pixel_control=True,
                 intr_coeff=0.5,
                 extr_coeff=1.0,
                 entropy_coeff=0.001,
                 policy_clip=0.1,
                 lr=1e-4,
                 lr_final=1e-5,
                 decay_steps=6e5,
                 grad_clip=0.5,
                 RP=1,
                 VR=1,
                 PC=1,
                 policy_args={},
                 RND_args={},
                 optim=torch.optim.Adam,
                 optim_args={},
                 device='cuda'):
        super(RANDAL, self).__init__()
        self.lr = lr
        self.entropy_coeff = entropy_coeff
        self.intr_coeff = intr_coeff
        self.extr_coeff = extr_coeff
        self.pixel_control = pixel_control
        self.grad_clip = grad_clip
        self.action_size = action_size
        self.device = device
        self.RP = RP  # reward prediction
        self.VR = VR  # value replay
        self.PC = PC  # pixel control

        self.policy = PPOIntrinsic(policy_model,
                                   input_size,
                                   action_size,
                                   lr=lr,
                                   lr_final=lr_final,
                                   decay_steps=decay_steps,
                                   grad_clip=grad_clip,
                                   entropy_coeff=entropy_coeff,
                                   policy_clip=policy_clip,
                                   extr_coeff=extr_coeff,
                                   intr_coeff=intr_coeff,
                                   build_optimiser=False,
                                   **policy_args)

        # only use the last frame of the frame-stack for the convolutional networks
        target_size = (1, input_size[1], input_size[2]) if len(input_size) == 3 else input_size

        # randomly weighted and fixed neural network; acts as a random embedding for each state
        self.target_model = target_model(target_size,
                                         trainable=False,
                                         **RND_args).to(device)

        # learns to predict the target model's output
        # i.e. provides rewards based on how well a fixed random function can be predicted, thus behaving as a density estimate of explored areas
        self.predictor_model = target_model(target_size,
                                            trainable=True,
                                            **RND_args).to(device)

        if pixel_control:
            self.feat_map = torch.nn.Sequential(
                torch.nn.Linear(self.policy.dense_size, 32 * 8 * 8),
                torch.nn.ReLU()).to(device)
            self.deconv1 = torch.nn.Sequential(
                torch.nn.ConvTranspose2d(32, 32, kernel_size=[3, 3], stride=[1, 1]),
                torch.nn.ReLU()).to(device)
            self.deconv_advantage = torch.nn.ConvTranspose2d(
                32, action_size, kernel_size=[3, 3], stride=[2, 2]).to(device)
            self.deconv_value = torch.nn.ConvTranspose2d(
                32, 1, kernel_size=[3, 3], stride=[2, 2]).to(device)

        # reward prediction head (classifies the upcoming reward as negative, zero or positive)
        self.r1 = torch.nn.Sequential(
            torch.nn.Linear(self.policy.dense_size, 128),
            torch.nn.ReLU()).to(device)
        self.r2 = torch.nn.Linear(128, 3).to(device)

        self.optimiser = optim(self.parameters(), lr, **optim_args)
        self.scheduler = polynomial_sheduler(self.optimiser,
                                             lr_final,
                                             decay_steps,
                                             power=1)
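The r1/r2 head at the end is the UNREAL-style reward-prediction auxiliary task: a 3-way classification of an upcoming reward's sign from replayed features. A sketch of how such a head is typically trained follows; the class labelling and loss are assumptions, only the layers appear in the snippet.

import torch
import torch.nn.functional as F

dense_size, batch = 512, 8                     # hypothetical sizes

r1 = torch.nn.Sequential(torch.nn.Linear(dense_size, 128), torch.nn.ReLU())
r2 = torch.nn.Linear(128, 3)

features = torch.randn(batch, dense_size)      # features of replayed states
rewards = torch.randn(batch)                   # rewards observed after those states

# assumed labelling: class 0 = zero reward, class 1 = positive, class 2 = negative
labels = torch.zeros(batch, dtype=torch.long)
labels[rewards > 0] = 1
labels[rewards < 0] = 2

logits = r2(r1(features))
reward_prediction_loss = F.cross_entropy(logits, labels)   # weighted by self.RP in the total loss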