Example #1
    def __init__(self, config, action_mask):
        super(CL_DPG, self).__init__(config)

        # Set Hyper-parameters

        self.initial_phase = not config.true_embeddings and not config.load_embed and not config.restore  # Initial training phase required if learning embeddings
        self.batch_norm = False

        # Function to get state features and action representation
        self.state_features = Basis.get_Basis(config=config)
        self.action_rep = CL_ActionRepresentation.VAE_Action_representation(
            action_dim=self.action_dim,
            state_dim=self.state_features.feature_dim,
            config=config)
        # Create instances for Actor and Q_fn
        self.actor = Actor(action_dim=self.action_rep.reduced_action_dim,
                           state_dim=self.state_features.feature_dim,
                           config=config)
        self.Q = Q_fn(action_dim=self.action_rep.reduced_action_dim,
                      state_dim=self.state_features.feature_dim,
                      config=config)

        # Create target networks
        # Deepcopy not working.
        self.target_state_features = Basis.get_Basis(config=config)
        self.target_actor = Actor(
            action_dim=self.action_rep.reduced_action_dim,
            state_dim=self.state_features.feature_dim,
            config=config)
        self.target_Q = Q_fn(action_dim=self.action_rep.reduced_action_dim,
                             state_dim=self.state_features.feature_dim,
                             config=config)
        # self.target_action_rep = ActionRepresentation.Action_representation_deep(action_dim=self.action_dim, config=config)
        # Copy the initialized values to target
        self.target_state_features.load_state_dict(
            self.state_features.state_dict())
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_Q.load_state_dict(self.Q.state_dict())
        # self.target_action_rep.load_state_dict(self.action_rep.state_dict())

        self.memory = MemoryBuffer(
            max_len=self.config.buffer_size,
            state_dim=self.state_dim,
            action_dim=1,
            atype=long,
            config=config,
            dist_dim=self.action_rep.reduced_action_dim)  # off-policy
        self.noise = OrnsteinUhlenbeckActionNoise(
            self.config.reduced_action_dim)

        self.modules = [('actor', self.actor), ('Q', self.Q),
                        ('state_features', self.state_features),
                        ('action_rep', self.action_rep),
                        ('target_actor', self.target_actor),
                        ('target_state_features', self.target_state_features),
                        ('target_Q', self.target_Q)]  #,
        # ('target_action_rep', self.target_action_rep)]

        self.init()
        self.update_mask(action_mask=action_mask)
    def __init__(self, config, action_mask):
        super(CL_ActorCritic, self).__init__(config)

        # Initial training phase required if learning embeddings from scratch
        self.initial_phase = not config.true_embeddings and not config.load_embed

        # Function to get state features and action representation
        self.state_features = Basis.get_Basis(config=config)
        self.action_rep = CL_ActionRepresentation.VAE_Action_representation(state_dim=self.state_features.feature_dim,
                                                                     action_dim=self.action_dim, config=config)

        # Create instances for Actor and Critic
        self.critic = Critic.Critic_with_traces(state_dim=self.state_features.feature_dim, config=config)
        self.actor = Policy.embed_Gaussian(action_dim=self.action_rep.reduced_action_dim,
                                           state_dim=self.state_features.feature_dim, config=config)

        # Initialize storage containers
        self.memory = MemoryBuffer(max_len=self.config.buffer_size, state_dim=self.state_dim,
                                   action_dim=1, atype=long, config=config,
                                   dist_dim=self.action_rep.reduced_action_dim)  # off-policy
        self.trajectory = Trajectory(max_len=self.config.batch_size, state_dim=self.state_dim,
                                     action_dim=1, atype=long, config=config,
                                     dist_dim=self.action_rep.reduced_action_dim)  # on-policy

        self.modules = [('actor', self.actor), ('critic', self.critic),
                        ('state_features', self.state_features), ('action_rep', self.action_rep)]

        self.init()
        self.update_mask(action_mask=action_mask)
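
Note: the MemoryBuffer and Trajectory containers used in these examples are not reproduced on this page, and the bare `long`, `float32` and `tensor` names are assumed to come from torch imports in the original modules (e.g. `from torch import tensor, float32, long`). Below is a minimal, hypothetical sketch of the replay-buffer interface these snippets rely on (the keyword constructor of Examples #1, #2, #5 and later; Examples #3 and #4 use an older `MemoryBuffer(size=...)` signature), inferred purely from the call sites, so the real implementation in the source repository may differ.

import torch

class MemoryBuffer:
    """Minimal replay-buffer sketch matching the call sites in these examples (hypothetical)."""

    def __init__(self, max_len, state_dim, action_dim, atype, config, dist_dim):
        self.max_len, self.config = max_len, config
        self.length, self.idx = 0, 0                      # read as `memory.length` in the examples
        self.s1 = torch.zeros(max_len, state_dim)
        self.a1 = torch.zeros(max_len, action_dim, dtype=atype)
        self.emb = torch.zeros(max_len, dist_dim)
        self.r1 = torch.zeros(max_len, 1)
        self.s2 = torch.zeros(max_len, state_dim)
        self.not_done = torch.zeros(max_len, 1)

    def add(self, s1, a1, a_emb1, r1, s2, not_absorbing):
        i = self.idx
        self.s1[i] = torch.as_tensor(s1)
        self.a1[i] = torch.as_tensor(a1)
        self.emb[i] = torch.as_tensor(a_emb1)
        self.r1[i] = float(r1)
        self.s2[i] = torch.as_tensor(s2)
        self.not_done[i] = float(not_absorbing)
        self.idx = (i + 1) % self.max_len
        self.length = min(self.length + 1, self.max_len)

    def _slice(self, ids):
        return (self.s1[ids], self.a1[ids], self.emb[ids],
                self.r1[ids], self.s2[ids], self.not_done[ids])

    def sample(self, batch_size):
        ids = torch.randint(0, self.length, (batch_size,))
        return self._slice(ids)

    def batch_sample(self, batch_size, randomize=True):
        # One pass over the stored transitions, yielded in mini-batches
        order = torch.randperm(self.length) if randomize else torch.arange(self.length)
        for start in range(0, self.length, batch_size):
            yield self._slice(order[start:start + batch_size])

    def reset(self):
        self.length, self.idx = 0, 0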
Example #3
    def __init__(self, config):
        super(embed_Reinforce, self).__init__(config)

        self.ep_rewards = []
        self.ep_states = []
        self.ep_actions = []
        self.ep_exec_action_embs = []
        self.ep_chosen_action_embs = []

        # Set Hyper-parameters
        self.memory = MemoryBuffer(size=config.buffer_size)
        self.counter = 0

        self.initial_phase = not config.true_embeddings  # Initial training phase required if learning embeddings

        # Function to get state features and action representation
        if config.fourier_order > 0:
            self.state_features = Basis.Fourier_Basis(config=config)
        else:
            self.state_features = Basis.NN_Basis(config=config)

        # Action representation and baseline critic
        self.action_rep = Action_representation(
            state_dim=self.state_features.feature_dim,
            action_dim=self.action_dim,
            config=config)
        self.baseline = Critic.Critic(
            state_dim=self.state_features.feature_dim, config=config)

        # Create instance for the Actor
        self.atype = config.dtype
        self.actor = Policy.embed_Gaussian(
            action_dim=self.action_rep.reduced_action_dim,
            state_dim=self.state_features.feature_dim,
            config=config)
        self.action_size = self.action_dim

        self.modules = [('actor', self.actor), ('baseline', self.baseline),
                        ('state_features', self.state_features),
                        ('action_rep', self.action_rep)]

        self.init()
Example #4
class embed_Reinforce(Agent):
    def __init__(self, config):
        super(embed_Reinforce, self).__init__(config)

        self.ep_rewards = []
        self.ep_states = []
        self.ep_actions = []
        self.ep_exec_action_embs = []
        self.ep_chosen_action_embs = []

        # Set Hyper-parameters
        self.memory = MemoryBuffer(size=config.buffer_size)
        self.counter = 0

        self.initial_phase = not config.true_embeddings  # Initial training phase required if learning embeddings

        # Function to get state features and action representation
        if config.fourier_order > 0:
            self.state_features = Basis.Fourier_Basis(config=config)
        else:
            self.state_features = Basis.NN_Basis(config=config)

        # Action representation and baseline critic
        self.action_rep = Action_representation(
            state_dim=self.state_features.feature_dim,
            action_dim=self.action_dim,
            config=config)
        self.baseline = Critic.Critic(
            state_dim=self.state_features.feature_dim, config=config)

        # Create instance for the Actor
        self.atype = config.dtype
        self.actor = Policy.embed_Gaussian(
            action_dim=self.action_rep.reduced_action_dim,
            state_dim=self.state_features.feature_dim,
            config=config)
        self.action_size = self.action_dim

        self.modules = [('actor', self.actor), ('baseline', self.baseline),
                        ('state_features', self.state_features),
                        ('action_rep', self.action_rep)]

        self.init()

    def get_action(self, state, explore=0.2):
        explore = 0  #Don't do eps-greedy with policy gradients.
        if self.initial_phase or np.random.rand() < explore:
            # take random actions (uniformly in actual action space) to observe the interactions initially
            action = np.random.randint(self.action_dim)
            exec_action_emb = self.action_rep.get_embedding(action).cpu().view(
                -1).data.numpy()
            chosen_action_emb = exec_action_emb
        else:
            state = np.float32(state)
            if len(state.shape) == 1:
                state = np.expand_dims(state, 0)

            state = self.state_features.forward(state)
            chosen_action_emb = self.actor.get_action_wo_dist(state, explore=0)
            action = self.action_rep.get_best_match(chosen_action_emb)

            exec_action_emb = self.action_rep.get_embedding(action).cpu().view(
                -1).data.numpy()
            chosen_action_emb = chosen_action_emb.cpu().view(-1).data.numpy()

        return action, (exec_action_emb, chosen_action_emb)

    def update(self, s1, a1, a_emb1, r1, s2, done):
        if not self.initial_phase:
            # Store the episode history
            self.ep_rewards.append(r1)
            self.ep_states.append(s1)
            self.ep_actions.append(int(a1))
            self.ep_exec_action_embs.append(a_emb1[0])
            self.ep_chosen_action_embs.append(a_emb1[1])
            if done:
                # Compute gamma return and do on-policy update
                g_rewards, R = [], 0
                for r in self.ep_rewards[::-1]:
                    R = r + self.config.gamma * R
                    g_rewards.insert(0, R)
                self.optimize(np.float32(self.ep_states),
                              np.float32(self.ep_actions),
                              np.float32(self.ep_exec_action_embs),
                              np.float32(self.ep_chosen_action_embs),
                              np.float32(g_rewards))

                # Reset the episode history
                self.ep_rewards = []
                self.ep_states = []
                self.ep_actions = []
                self.ep_exec_action_embs = []
                self.ep_chosen_action_embs = []

        else:
            self.memory.add(s1,
                            a1,
                            a_emb1[0],
                            r1,
                            s2,
                            int(done != 1),
                            randomize=True)  # a_emb1 gets ignored subsequently
            if self.memory.length >= self.config.buffer_size:
                # action embeddings can be learnt offline
                self.initial_phase_training(
                    max_epochs=self.config.initial_phase_epochs)

    def optimize(self, s1, a1, exec_a1_emb, chosen_a1_emb, r1):
        r1 = Variable(torch.from_numpy(r1).type(self.config.dtype),
                      requires_grad=False).view(-1, 1)
        exec_a1_emb = Variable(torch.from_numpy(exec_a1_emb).type(
            self.config.dtype),
                               requires_grad=False)
        chosen_a1_emb = Variable(torch.from_numpy(chosen_a1_emb).type(
            self.config.dtype),
                                 requires_grad=False)

        a1_emb = exec_a1_emb if self.config.emb_flag == 'exec' else chosen_a1_emb

        s1 = self.state_features.forward(s1)

        # ---------------------- optimize critic ----------------------
        val_pred = self.baseline.forward(s1)
        # loss_baseline = F.smooth_l1_loss(val_pred, r1)
        loss_baseline = F.mse_loss(val_pred, r1)

        # ---------------------- optimize actor ----------------------
        td_error = (r1 - val_pred).detach()

        if self.config.TIS:
            _, dist = self.actor.get_action(s1)
            exec_prob = self.actor.get_prob_from_dist(
                dist, exec_a1_emb, scalar=self.config.TIS_scalar)
            chosen_prob = self.actor.get_prob_from_dist(
                dist, chosen_a1_emb, scalar=self.config.TIS_scalar)
            TIS_ratio = (exec_prob /
                         chosen_prob).detach()  #TODO: clip this ratio?
            loss_actor = -1.0 * torch.mean(
                TIS_ratio * td_error *
                self.actor.get_log_prob_dist(dist, exec_a1_emb))
        else:
            loss_actor = -1.0 * torch.mean(
                td_error * self.actor.get_log_prob(s1, a1_emb))

        # loss_actor = -1.0 * torch.sum(td_error * self.actor.get_log_prob(s1, a1_emb))
        # loss_actor = -1.0 * torch.mean(torch.mean(r1 * self.actor.get_log_prob(s1, a1_emb), -1)) # without baseline
        loss = loss_baseline + loss_actor
        # print(val_pred, a1_emb)

        # ------------ optimize the embeddings always ----------------
        if not self.config.true_embeddings:
            a1 = Variable(torch.from_numpy(a1).type(self.config.dtype_long),
                          requires_grad=False)
            action_pred = self.action_rep.forward(s1[:-1], s1[1:])
            loss_act_rep = F.cross_entropy(action_pred, a1[:-1])
            loss += loss_act_rep * self.config.emb_lambda

        self.step(loss, clip_norm=10)

    def initial_phase_training(self, max_epochs=-1):
        # change optimizer to Adam for supervised learning
        self.action_rep.optim = torch.optim.Adam(self.action_rep.parameters(),
                                                 lr=1e-3)
        self.state_features.optim = torch.optim.Adam(
            self.state_features.parameters(), lr=1e-3)
        initial_losses = []

        print("Inital training phase started...")
        #TODO: Split into train and validation to avoid overfitting
        for counter in range(max_epochs):
            losses = []
            for s1, a1, _, _, s2, _ in self.memory.get_batch(
                    size=self.config.sup_batch_size, randomize=True):
                a1 = Variable(torch.from_numpy(a1).type(
                    self.config.dtype_long),
                              requires_grad=False)

                self.clear_gradients()  # clear all the gradients from last run

                s1 = self.state_features.forward(s1)
                s2 = self.state_features.forward(s2)

                # ------------ optimize the embeddings ----------------
                action_pred = self.action_rep.forward(s1, s2)
                loss_act_rep = F.cross_entropy(action_pred, a1)

                loss_act_rep.backward()
                self.action_rep.optim.step()
                self.state_features.optim.step()

                losses.append(loss_act_rep.cpu().view(-1).data.numpy()[0])

            # print(np.mean(loss))
            initial_losses.append(np.mean(losses))
            if counter % 1 == 0:
                print("Epoch {} loss:: {}".format(
                    counter, np.mean(initial_losses[-10:])))
                #self.save()

            # Terminate initial phase once action representations have converged.
            if len(initial_losses) >= 20 and np.mean(
                    initial_losses[-10:]) >= np.mean(initial_losses[-20:]):
                print("Converged...")
                break

        # Reset the optim to whatever is there in config
        self.action_rep.optim = self.config.optim(self.action_rep.parameters(),
                                                  lr=self.config.embed_lr)
        self.state_features.optim = self.config.optim(
            self.state_features.parameters(), lr=self.config.state_lr)

        print('... Initial training phase terminated!')
        self.initial_phase = False
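
The reversed loop in update() above accumulates discounted returns, G_t = r_t + gamma * G_{t+1}. A quick stand-alone sanity check of that accumulation (not part of the original code), with gamma = 0.9 and episode rewards [1, 0, 2]:

gamma = 0.9
ep_rewards = [1.0, 0.0, 2.0]

g_rewards, R = [], 0
for r in ep_rewards[::-1]:      # walk backwards from the final reward
    R = r + gamma * R           # G_t = r_t + gamma * G_{t+1}
    g_rewards.insert(0, R)      # prepend so the returns stay in time order

print(g_rewards)                # approximately [2.62, 1.8, 2.0]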
Example #5
class CL_DPG(Agent):
    # @profile
    def __init__(self, config, action_mask):
        super(CL_DPG, self).__init__(config)

        # Set Hyper-parameters

        self.initial_phase = not config.true_embeddings and not config.load_embed and not config.restore  # Initial training phase required if learning embeddings
        self.batch_norm = False

        # Function to get state features and action representation
        self.state_features = Basis.get_Basis(config=config)
        self.action_rep = CL_ActionRepresentation.VAE_Action_representation(
            action_dim=self.action_dim,
            state_dim=self.state_features.feature_dim,
            config=config)
        # Create instances for Actor and Q_fn
        self.actor = Actor(action_dim=self.action_rep.reduced_action_dim,
                           state_dim=self.state_features.feature_dim,
                           config=config)
        self.Q = Q_fn(action_dim=self.action_rep.reduced_action_dim,
                      state_dim=self.state_features.feature_dim,
                      config=config)

        # Create target networks
        # Deepcopy not working.
        self.target_state_features = Basis.get_Basis(config=config)
        self.target_actor = Actor(
            action_dim=self.action_rep.reduced_action_dim,
            state_dim=self.state_features.feature_dim,
            config=config)
        self.target_Q = Q_fn(action_dim=self.action_rep.reduced_action_dim,
                             state_dim=self.state_features.feature_dim,
                             config=config)
        # self.target_action_rep = ActionRepresentation.Action_representation_deep(action_dim=self.action_dim, config=config)
        # Copy the initialized values to target
        self.target_state_features.load_state_dict(
            self.state_features.state_dict())
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_Q.load_state_dict(self.Q.state_dict())
        # self.target_action_rep.load_state_dict(self.action_rep.state_dict())

        self.memory = MemoryBuffer(
            max_len=self.config.buffer_size,
            state_dim=self.state_dim,
            action_dim=1,
            atype=long,
            config=config,
            dist_dim=self.action_rep.reduced_action_dim)  # off-policy
        self.noise = OrnsteinUhlenbeckActionNoise(
            self.config.reduced_action_dim)

        self.modules = [('actor', self.actor), ('Q', self.Q),
                        ('state_features', self.state_features),
                        ('action_rep', self.action_rep),
                        ('target_actor', self.target_actor),
                        ('target_state_features', self.target_state_features),
                        ('target_Q', self.target_Q)]  #,
        # ('target_action_rep', self.target_action_rep)]

        self.init()
        self.update_mask(action_mask=action_mask)

    def update_mask(self, action_mask):
        self.action_mask = action_mask
        self.curr_action_set = np.where(self.action_mask)[0]
        self.action_rep.update_mask(self.action_mask)

    # Overrides the reset function in parent class
    def reset(self, action_mask, change_flag):
        for _, module in self.modules:
            module.reset()

        if change_flag:
            if self.config.re_init == 'full':
                # Do a complete re-initialization after the MDP has changed
                self.__init__(self.config, action_mask)

            self.update_mask(action_mask)
            self.initial_phase = True
            self.memory.reset()

    def get_action(self, state, explore=0):
        if self.batch_norm:
            self.actor.eval()  # Set the actor to Evaluation mode. Required for Batchnorm

        if self.initial_phase:
            # take random actions (uniformly in actual action space) to observe the interactions initially
            action = np.random.choice(self.curr_action_set)
            action_emb = self.action_rep.get_embedding(action).cpu().view(
                -1).data.numpy()

        else:
            state = tensor(state,
                           dtype=float32,
                           requires_grad=False,
                           device=self.config.device).view(1, -1)
            state = self.state_features.forward(state)
            action_emb = self.actor.get_action(state)

            noise = self.noise.sample() * explore  #* 0.1
            action_emb += Variable(torch.from_numpy(noise).type(float32),
                                   requires_grad=False)

            action = self.action_rep.get_best_match(action_emb)
            action_emb = action_emb.cpu().view(-1).data.numpy()

        self.track_entropy_cont(action_emb)
        return action, action_emb

    def update(self, s1, a1, a_emb1, r1, s2, done):
        self.memory.add(s1, a1, a_emb1, r1, s2, int(done != 1))
        if self.initial_phase and self.memory.length >= self.config.buffer_size:
            self.initial_phase_training(
                max_epochs=self.config.initial_phase_epochs)
        elif not self.initial_phase and self.memory.length > self.config.sup_batch_size:
            self.optimize()

    def optimize(self):
        if self.batch_norm:
            self.actor.train()  # Set the actor to training mode. Required for Batchnorm

        s1, a1, a1_emb, r1, s2, not_absorbing = self.memory.sample(
            self.config.sup_batch_size)

        # ---------------------- optimize critic ----------------------
        # Use target actor exploitation policy here for loss evaluation
        s2_t = self.target_state_features.forward(s2).detach()
        a2_emb = self.target_actor.get_action(
            s2_t).detach()  # Detach targets from grad computation.
        next_val = self.target_Q.forward(
            s2_t, a2_emb).detach()  # Compute Q'( s2, pi'(s2))
        val_exp = r1 + self.config.gamma * next_val * not_absorbing  # y_exp = r + gamma * Q'( s2, pi'(s2))

        s1_ = self.state_features.forward(s1)
        val_pred = self.Q.forward(s1_, a1_emb)  # y_pred = Q( s1, a1)
        loss_Q = F.mse_loss(val_pred, val_exp)
        # loss_Q = F.smooth_l1_loss(val_pred, val_exp)                    # compute critic loss

        self.clear_gradients()
        loss_Q.backward()
        self.Q.optim.step()
        self.state_features.optim.step()

        # ---------------------- optimize actor ----------------------
        s1_ = self.state_features.forward(s1)
        s2_ = self.state_features.forward(s2)
        pred_a1_emb = self.actor.get_action(s1_)
        loss_actor = -1.0 * torch.mean(self.Q.forward(s1_, pred_a1_emb))
        loss_rep = self.action_rep.unsupervised_loss(
            s1_, a1.view(-1), s2_) * self.config.emb_lambda

        loss = loss_actor + loss_rep
        self.clear_gradients()
        loss.backward()
        self.actor.optim.step()
        self.action_rep.optim.step()
        self.state_features.optim.step()

        # ------------ update target actor and critic -----------------
        soft_update(self.target_actor, self.actor, self.config.tau)
        soft_update(self.target_Q, self.Q, self.config.tau)
        soft_update(self.target_state_features, self.state_features,
                    self.config.tau)

    def self_supervised_update(self, s1, a1, s2, reg=1):
        s1 = self.state_features(s1)
        s2 = self.state_features(s2)

        loss = self.action_rep.unsupervised_loss(s1, a1.view(-1), s2) * reg

        self.clear_gradients()
        loss.backward()
        self.action_rep.optim.step()
        self.state_features.optim.step()

        return loss.item()

    def clear_gradients(self):
        for module in [
                self.action_rep, self.actor, self.Q, self.state_features
        ]:
            module.optim.zero_grad()

    def initial_phase_training(self, max_epochs=-1):
        if self.batch_norm:
            self.actor.train()  # Set the actor to training mode. Required for Batchnorm

        # change optimizer to Adam for unsupervised learning
        self.action_rep.optim = torch.optim.Adam(self.action_rep.parameters(),
                                                 lr=1e-2)
        self.state_features.optim = torch.optim.Adam(
            self.state_features.parameters(), lr=1e-2)
        initial_losses = []

        print("Inital training phase started...")
        for counter in range(max_epochs):
            losses = []
            for s1, a1, _, _, s2, _ in self.memory.batch_sample(
                    batch_size=self.config.sup_batch_size, randomize=True):
                loss = self.self_supervised_update(s1, a1, s2)
                losses.append(loss)

            initial_losses.append(np.mean(losses))
            if counter % 1 == 0:
                print("Epoch {} loss:: {}".format(
                    counter, np.mean(initial_losses[-10:])))
                if self.config.only_phase_one:
                    self.save()
                    print("Saved..")

            # Terminate initial phase once action representations have converged.
            if len(initial_losses) >= 20 and np.mean(
                    initial_losses[-10:]) + 1e-5 >= np.mean(
                        initial_losses[-20:]):
                print("Converged...")
                break

        # Reset the optim to whatever is there in config
        self.action_rep.optim = self.config.optim(self.action_rep.parameters(),
                                                  lr=self.config.embed_lr)
        self.state_features.optim = self.config.optim(
            self.state_features.parameters(), lr=self.config.state_lr)

        print('... Initial training phase terminated!')
        self.initial_phase = False
        self.save()

        if self.config.only_phase_one:
            exit()

        hard_update(self.target_state_features, self.state_features)
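
The target networks in this example are synchronized with soft_update(target, source, tau) and hard_update(target, source), which are not shown on this page. A minimal sketch assuming the conventional DDPG-style definitions (Polyak averaging for the soft update, a full parameter copy for the hard one):

import torch

def soft_update(target, source, tau):
    # target <- (1 - tau) * target + tau * source, parameter by parameter
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.copy_((1.0 - tau) * t_param.data + tau * s_param.data)

def hard_update(target, source):
    # Copy the source parameters into the target network verbatim
    target.load_state_dict(source.state_dict())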
class CL_ActorCritic(Agent):
    def __init__(self, config, action_mask):
        super(CL_ActorCritic, self).__init__(config)

        # Initial training phase required if learning embeddings from scratch
        self.initial_phase = not config.true_embeddings and not config.load_embed

        # Function to get state features and action representation
        self.state_features = Basis.get_Basis(config=config)
        self.action_rep = CL_ActionRepresentation.VAE_Action_representation(state_dim=self.state_features.feature_dim,
                                                                     action_dim=self.action_dim, config=config)

        # Create instances for Actor and Critic
        self.critic = Critic.Critic_with_traces(state_dim=self.state_features.feature_dim, config=config)
        self.actor = Policy.embed_Gaussian(action_dim=self.action_rep.reduced_action_dim,
                                           state_dim=self.state_features.feature_dim, config=config)

        # Initialize storage containers
        self.memory = MemoryBuffer(max_len=self.config.buffer_size, state_dim=self.state_dim,
                                   action_dim=1, atype=long, config=config,
                                   dist_dim=self.action_rep.reduced_action_dim)  # off-policy
        self.trajectory = Trajectory(max_len=self.config.batch_size, state_dim=self.state_dim,
                                     action_dim=1, atype=long, config=config,
                                     dist_dim=self.action_rep.reduced_action_dim)  # on-policy

        self.modules = [('actor', self.actor), ('critic', self.critic),
                        ('state_features', self.state_features), ('action_rep', self.action_rep)]

        self.init()
        self.update_mask(action_mask=action_mask)

    def update_mask(self, action_mask):
        self.action_mask = action_mask
        self.curr_action_set = np.where(self.action_mask)[0]
        self.action_rep.update_mask(self.action_mask)

    # Overrides the reset function in parent class
    def reset(self, action_mask, change_flag):
        for _, module in self.modules:
            module.reset()

        if change_flag:
            if self.config.re_init == 'full':
                # Do a complete re-initialization after the MDP has changed
                self.__init__(self.config, action_mask)
            if self.config.re_init == 'policy':
                # Re-init only the policy (state features and value functions can carry over from the previous time)
                self.action_rep = CL_ActionRepresentation.Action_representation(
                    state_dim=self.state_features.feature_dim,
                    action_dim=self.action_dim, config=self.config)
                self.actor = Policy.embed_Gaussian(action_dim=self.action_rep.reduced_action_dim,
                                                   state_dim=self.state_features.feature_dim, config=self.config)

            self.update_mask(action_mask)
            self.initial_phase = True
            self.memory.reset()

    def get_action(self, state, explore=0):
        explore = 0  # Don't do eps-greedy with policy gradients
        if self.initial_phase or np.random.rand() < explore:
            # take random actions (uniformly in actual action space) to observe the interactions initially
            action = np.random.choice(self.curr_action_set)
            chosen_action_emb = self.action_rep.get_embedding(action).cpu().view(-1).data.numpy()

        else:
            state = tensor(state, dtype=float32, requires_grad=False, device=self.config.device).view(1, -1)
            state = self.state_features.forward(state)
            chosen_action_emb, _ = self.actor.get_action(state, explore=0)
            action = self.action_rep.get_best_match(chosen_action_emb)

            chosen_action_emb = chosen_action_emb.cpu().view(-1).data.numpy()

        return action, chosen_action_emb

    def update(self, s1, a1, a_emb1, r1, s2, done, debug=False):
        if not self.initial_phase:
            # On-policy episode history; don't use the value predicted from the absorbing/goal state
            # self.optimize(s1, a1, a_emb1, r1, s2, int(done != 1))
            self.trajectory.add(s1, a1, a_emb1, r1, s2, int(done != 1))
            if self.trajectory.size >= self.config.batch_size or done:
                self.optimize(debug)
                self.trajectory.reset()
        else:
            # action embeddings can be learnt offline
            self.memory.add(s1, a1, a_emb1, r1, s2, int(done != 1))
            if self.memory.length >= self.config.buffer_size:
                self.initial_phase_training(max_epochs=self.config.initial_phase_epochs)

    def optimize(self, debug=False):
        s1, a1, chosen_a1_emb, r1, s2, not_absorbing = self.trajectory.get_all()

        s1 = self.state_features.forward(s1)
        s2 = self.state_features.forward(s2)

        # ---------------------- optimize critic ----------------------
        next_val = self.critic.forward(s2).detach()    # Detach targets from grad computation.
        val_exp  = r1 + self.config.gamma * next_val * not_absorbing
        val_pred = self.critic.forward(s1)
        loss_critic = F.mse_loss(val_pred, val_exp)

        # loss_critic = F.smooth_l1_loss(val_pred, val_exp)

        # ---------------------- optimize actor ----------------------
        td_error = (val_exp - val_pred).detach()
        logp, dist = self.actor.get_log_prob(s1, chosen_a1_emb)
        loss_actor = -1.0 * torch.mean(td_error * logp)
        # loss_actor += self.config.entropy_lambda * self.actor.get_entropy_from_dist(dist)

        # Take one policy gradient step
        loss = loss_critic + loss_actor

        # if not self.config.true_embeddings and self.config.emb_lambda > 0:
        #    loss += self.action_rep.unsupervised_loss(s1, a1.view(-1), s2) * self.config.emb_lambda

        self.step(loss, clip_norm=1)


    def self_supervised_update(self, s1, a1, s2, reg=1):
        self.clear_gradients()  # clear all the gradients from last run

        # If doing online updates, sharing the state features might be problematic!
        s1 = self.state_features.forward(s1)
        s2 = self.state_features.forward(s2)

        # ------------ optimize the embeddings ----------------
        loss_act_rep = self.action_rep.unsupervised_loss(s1, a1.view(-1), s2, normalized=True) * reg
        loss_act_rep.backward()

        # Directly call the optimizer's step fn to bypass lambda traces (if any)
        self.action_rep.optim.step()
        self.state_features.optim.step()

        return loss_act_rep.item()

    def initial_phase_training(self, max_epochs=-1):
        # change optimizer to Adam for unsupervised learning
        self.action_rep.optim = torch.optim.Adam(self.action_rep.parameters(), lr=1e-2)
        self.state_features.optim = torch.optim.Adam(self.state_features.parameters(), lr=1e-2)
        initial_losses = []

        print("Inital training phase started...")
        for counter in range(max_epochs):
            losses = []
            for s1, a1, _, _, s2, _ in self.memory.batch_sample(batch_size=self.config.sup_batch_size, randomize=True):
                loss = self.self_supervised_update(s1, a1, s2)
                losses.append(loss)

            initial_losses.append(np.mean(losses))
            if counter % 1 == 0:
                print("Epoch {} loss:: {}".format(counter, np.mean(initial_losses[-10:])))
                if self.config.only_phase_one:
                    self.save()
                    print("Saved..")

            # Terminate initial phase once action representations have converged.
            if len(initial_losses) >= 20 and np.mean(initial_losses[-10:]) + 1e-5 >= np.mean(initial_losses[-20:]):
                print("Converged...")
                break

        # Reset the optim to whatever is there in config
        self.action_rep.optim = self.config.optim(self.action_rep.parameters(), lr=self.config.embed_lr)
        self.state_features.optim = self.config.optim(self.state_features.parameters(), lr=self.config.state_lr)

        print('... Initial training phase terminated!')
        self.initial_phase = False
        self.save()

        if self.config.only_phase_one:
            exit()
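
The on-policy Trajectory container used by CL_ActorCritic (and by embed_ActorCritic below) is also not reproduced here. A minimal, hypothetical sketch of its interface, inferred from the calls to add(), size, get_all() and reset() in these examples:

import torch

class Trajectory:
    """Fixed-capacity on-policy store; sketch of the interface used above (hypothetical)."""

    def __init__(self, max_len, state_dim, action_dim, atype, config, dist_dim):
        self.max_len, self.config = max_len, config
        self.size = 0                                   # read as `trajectory.size` in update()
        self.s1 = torch.zeros(max_len, state_dim)
        self.a1 = torch.zeros(max_len, action_dim, dtype=atype)
        self.emb = torch.zeros(max_len, dist_dim)
        self.r1 = torch.zeros(max_len, 1)
        self.s2 = torch.zeros(max_len, state_dim)
        self.not_done = torch.zeros(max_len, 1)

    def add(self, s1, a1, a_emb1, r1, s2, not_absorbing):
        i = self.size
        self.s1[i] = torch.as_tensor(s1)
        self.a1[i] = torch.as_tensor(a1)
        self.emb[i] = torch.as_tensor(a_emb1)
        self.r1[i] = float(r1)
        self.s2[i] = torch.as_tensor(s2)
        self.not_done[i] = float(not_absorbing)
        self.size += 1

    def get_all(self):
        # Everything collected since the last reset, in insertion order
        i = self.size
        return (self.s1[:i], self.a1[:i], self.emb[:i],
                self.r1[:i], self.s2[:i], self.not_done[:i])

    def reset(self):
        self.size = 0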
class embed_ActorCritic(Agent):
    def __init__(self, config):
        super(embed_ActorCritic, self).__init__(config)

        # Initial training phase required if learning embeddings from scratch
        self.initial_phase = not config.true_embeddings and not config.load_embed

        # Function to get state features and action representation
        self.state_features = Basis.get_Basis(config=config)
        self.action_rep = ActionRepresentation.Action_representation(
            state_dim=self.state_features.feature_dim,
            action_dim=self.action_dim,
            config=config)

        # Create instances for Actor and Critic
        self.critic = Critic.Critic_with_traces(
            state_dim=self.state_features.feature_dim, config=config)
        self.actor = Policy.embed_Gaussian(
            action_dim=self.action_rep.reduced_action_dim,
            state_dim=self.state_features.feature_dim,
            config=config)

        # Initialize storage containers
        self.memory = MemoryBuffer(
            max_len=self.config.buffer_size,
            state_dim=self.state_dim,
            action_dim=1,
            atype=long,
            config=config,
            dist_dim=self.action_rep.reduced_action_dim)  # off-policy
        self.trajectory = Trajectory(
            max_len=self.config.batch_size,
            state_dim=self.state_dim,
            action_dim=1,
            atype=long,
            config=config,
            dist_dim=self.action_rep.reduced_action_dim)  # on-policy

        self.modules = [('actor', self.actor), ('critic', self.critic),
                        ('state_features', self.state_features),
                        ('action_rep', self.action_rep)]
        self.init()

        # If needed later:
        # If the embeddings are going to be trained on the fly, but are restored from ckpt
        # Then load the associated state feature basis and ss'-> e params as well.
        # if self.config.emb_lambda and self.config.load_embed:
        #     self.state_features.load(self.config.paths['ckpt'] + name + '.pt')
        #     self.action_representation.load(ss'->e features)

    def get_action(self, state, explore=0):
        explore = 0  # Don't do eps-greedy with policy gradients
        if self.initial_phase or np.random.rand() < explore:
            # take random actions (uniformly in actual action space) to observe the interactions initially
            action = np.random.randint(self.action_dim)
            chosen_action_emb = self.action_rep.get_embedding(
                action).cpu().view(-1).data.numpy()

        else:
            state = tensor(state,
                           dtype=float32,
                           requires_grad=False,
                           device=self.config.device)
            state = self.state_features.forward(state.view(1, -1))
            chosen_action_emb, _ = self.actor.get_action(state, explore=0)
            action = self.action_rep.get_best_match(chosen_action_emb)

            chosen_action_emb = chosen_action_emb.cpu().view(-1).data.numpy()

        return action, chosen_action_emb

    def update(self, s1, a1, a_emb1, r1, s2, done):
        if not self.initial_phase:

            # Off-policy episodes, If doing simultaneous online embedding optimization
            # if not self.config.true_embeddings and self.config.emb_lambda > 0:
            #     self.memory.add(s1, a1, a_emb1, r1, s2, int(done != 1))

            # On-policy episode history; don't use the value predicted from the absorbing/goal state
            # self.optimize(s1, a1, a_emb1, r1, s2, int(done != 1))
            self.trajectory.add(s1, a1, a_emb1, r1, s2, int(done != 1))
            if self.trajectory.size >= self.config.batch_size or done:
                self.optimize()
                self.trajectory.reset()
        else:
            # action embeddings can be learnt offline
            self.memory.add(s1, a1, a_emb1, r1, s2, int(done != 1))
            if self.memory.length >= self.config.buffer_size:
                self.initial_phase_training(
                    max_epochs=self.config.initial_phase_epochs)

    def optimize(self):
        s1, a1, chosen_a1_emb, r1, s2, not_absorbing = self.trajectory.get_all()

        s1 = self.state_features.forward(s1)
        s2 = self.state_features.forward(s2)

        # ---------------------- optimize critic ----------------------
        next_val = self.critic.forward(
            s2).detach()  # Detach targets from grad computation.
        val_exp = r1 + self.config.gamma * next_val * not_absorbing
        val_pred = self.critic.forward(s1)
        loss_critic = F.mse_loss(val_pred, val_exp)

        # loss_critic = F.smooth_l1_loss(val_pred, val_exp)
        # print(next_val.shape, val_pred.shape, val_exp.shape, r1.shape, not_absorbing.shape, exec_a1_emb.shape, s1.shape) #check correctness
        # print("------------------",next_val, val_pred, val_exp, r1, not_absorbing, a1_emb, s1, s2) #check correctness

        # ---------------------- optimize actor ----------------------
        td_error = (val_exp - val_pred).detach()
        logp, dist = self.actor.get_log_prob(s1, chosen_a1_emb)
        loss_actor = -1.0 * torch.mean(td_error * logp)
        # loss_actor += self.config.entropy_lambda * self.actor.get_entropy_from_dist(dist)

        # Take one policy gradient step
        loss = loss_critic + loss_actor
        self.step(loss, clip_norm=1)

        # Take one unsupervised step
        # if not self.config.true_embeddings and self.config.emb_lambda > 0:# and self.memory.size >self.config.sup_batch_size:
        #     s1, a1, _, _, s2, _ = self.memory.sample(batch_size=self.config.sup_batch_size)
        #     self.self_supervised_update(s1, a1, s2, reg=self.config.emb_lambda)

    def self_supervised_update(self, s1, a1, s2, reg=1):
        self.clear_gradients()  # clear all the gradients from last run

        # If doing online updates, sharing the state features might be problematic!
        s1 = self.state_features.forward(s1)
        s2 = self.state_features.forward(s2)

        # ------------ optimize the embeddings ----------------
        loss_act_rep = self.action_rep.unsupervised_loss(
            s1, a1.view(-1), s2, normalized=True) * reg
        loss_act_rep.backward()

        # Directly call the optimizer's step fn to bypass lambda traces (if any)
        self.action_rep.optim.step()
        self.state_features.optim.step()

        return loss_act_rep.item()

    def initial_phase_training(self, max_epochs=-1):
        # change optimizer to Adam for unsupervised learning
        self.action_rep.optim = torch.optim.Adam(self.action_rep.parameters(),
                                                 lr=1e-3)
        self.state_features.optim = torch.optim.Adam(
            self.state_features.parameters(), lr=1e-3)
        initial_losses = []

        print("Inital training phase started...")
        for counter in range(max_epochs):
            losses = []
            for s1, a1, _, _, s2, _ in self.memory.batch_sample(
                    batch_size=self.config.sup_batch_size, randomize=True):
                loss = self.self_supervised_update(s1, a1, s2)
                losses.append(loss)

            initial_losses.append(np.mean(losses))
            if counter % 1 == 0:
                print("Epoch {} loss:: {}".format(
                    counter, np.mean(initial_losses[-10:])))
                if self.config.only_phase_one:
                    self.save()
                    print("Saved..")

            # Terminate initial phase once action representations have converged.
            if len(initial_losses) >= 20 and np.mean(
                    initial_losses[-10:]) + 1e-5 >= np.mean(
                        initial_losses[-20:]):
                print("Converged...")
                break

        # Reset the optim to whatever is there in config
        self.action_rep.optim = self.config.optim(self.action_rep.parameters(),
                                                  lr=self.config.embed_lr)
        self.state_features.optim = self.config.optim(
            self.state_features.parameters(), lr=self.config.state_lr)

        print('... Initial training phase terminated!')
        self.initial_phase = False
        self.save()

        if self.config.only_phase_one:
            exit()

        # if not updating on the fly, then delete the memory buffer:
        del self.memory
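
Several of these agents end optimize() with self.step(loss, clip_norm=...), a helper inherited from the Agent base class that is not shown on this page. A plausible minimal sketch, assuming it zeroes the optimizer of every registered module, backpropagates the combined loss, clips gradients and then steps each optimizer (the real base class may do more, e.g. learning-rate decay or gradient tracing):

import torch

def step(self, loss, clip_norm=1):
    # self.modules is the list of (name, module) pairs each agent registers in __init__
    for _, module in self.modules:
        module.optim.zero_grad()
    loss.backward()
    for _, module in self.modules:
        torch.nn.utils.clip_grad_norm_(module.parameters(), clip_norm)
        module.optim.step()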
Example #8
class embed_DPG(Agent):
    def __init__(self, config):
        super(embed_DPG, self).__init__(config)

        # Set Hyper-parameters

        self.initial_phase = False  # not config.true_embeddings and not config.load_embed  # Initial training phase required if learning embeddings
        self.batch_norm = False
        self.ctr = 0

        # Function to get state features and action representation
        self.action_rep = ActionRepresentation.Action_representation_deep(action_dim=self.action_dim, config=config)
        # Create instances for Actor and Q_fn
        self.actor = Actor(action_dim=self.action_rep.reduced_action_dim, config=config)
        self.Q = Q_fn(action_dim=self.action_rep.reduced_action_dim, config=config)

        # Create target networks
        # Deepcopy not working.
        self.target_actor = Actor(action_dim=self.action_rep.reduced_action_dim, config=config)
        self.target_Q = Q_fn(action_dim=self.action_rep.reduced_action_dim, config=config)
        # self.target_action_rep = ActionRepresentation.Action_representation_deep(action_dim=self.action_dim, config=config)
        # Copy the initialized values to target
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_Q.load_state_dict(self.Q.state_dict())
        # self.target_action_rep.load_state_dict(self.action_rep.state_dict())



        self.memory = MemoryBuffer(max_len=self.config.buffer_size, state_dim=self.state_dim,
                                     action_dim=1, atype=long, config=config,
                                     dist_dim=self.action_rep.reduced_action_dim)  # off-policy
        self.noise = OrnsteinUhlenbeckActionNoise(self.config.reduced_action_dim)


        self.modules = [('actor', self.actor), ('Q', self.Q), ('action_rep', self.action_rep),
                        ('target_actor', self.target_actor), ('target_Q', self.target_Q)]#,
                        # ('target_action_rep', self.target_action_rep)]

        self.init()

    def get_action(self, state, explore=0):
        if self.batch_norm: self.actor.eval()  # Set the actor to Evaluation mode. Required for Batchnorm

        if self.initial_phase:
            # take random actions (uniformly in actual action space) to observe the interactions initially
            action = np.random.randint(self.action_dim)
            action_emb = self.action_rep.get_embedding(action).cpu().view(-1).data.numpy()

        else:
            state = tensor(state, dtype=float32, requires_grad=False, device=self.config.device).view(1, -1)
            action_emb = self.actor.get_action(state)

            noise = self.noise.sample() #* 0.1
            action_emb += Variable(torch.from_numpy(noise).type(float32), requires_grad=False)

            action = self.action_rep.get_best_match(action_emb)
            action_emb = action_emb.cpu().view(-1).data.numpy()

        self.track_entropy_cont(action_emb)
        return action, action_emb

    def update(self, s1, a1, a_emb1, r1, s2, done):
        self.memory.add(s1, a1, a_emb1, r1, s2, int(done != 1))
        if self.initial_phase and self.memory.length >= self.config.buffer_size:
            self.initial_phase_training(max_epochs=self.config.initial_phase_epochs)
        elif not self.initial_phase and self.memory.length > self.config.sup_batch_size:
            self.optimize()

    def optimize(self):
        if self.batch_norm: self.actor.train()  # Set the actor to training mode. Required for Batchnorm

        s1, a1, a1_emb, r1, s2, not_absorbing = self.memory.sample(self.config.sup_batch_size)

        # print(s1.shape, a1.shape, a1_emb.shape, r1.shape, s2.shape, not_absorbing.shape)
        # ---------------------- optimize critic ----------------------
        # Use target actor exploitation policy here for loss evaluation
        a2_emb = self.target_actor.get_action(s2).detach()                      # Detach targets from grad computation.
        next_val = self.target_Q.forward(s2, a2_emb).detach()                   # Compute Q'( s2, pi'(s2))
        val_exp  = r1 + self.config.gamma * next_val * not_absorbing           # y_exp = r + gamma * Q'( s2, pi'(s2))

        val_pred = self.Q.forward(s1, a1_emb)                   # y_pred = Q( s1, a1)
        # loss_Q = F.smooth_l1_loss(val_pred, val_exp)                    # compute critic loss
        loss_Q = F.mse_loss(val_pred, val_exp)
        self.Q.update(loss_Q)

        # ---------------------- optimize actor ----------------------
        pred_a1_emb = self.actor.get_action(s1)
        loss_actor = -1.0 * torch.mean(self.Q.forward(s1, pred_a1_emb))
        self.actor.update(loss_actor)

        # ------------ update target actor and critic -----------------
        soft_update(self.target_actor, self.actor, self.config.tau)
        soft_update(self.target_Q, self.Q, self.config.tau)

        if not self.config.true_embeddings and self.config.emb_lambda > 0:
            self.ctr += 1
            if self.ctr > 100:
                self.self_supervised_training()
                self.ctr = 0


    def self_supervised_training(self, eps=1e-3):
        prv_loss = 1e5
        while True:
            s1, a1, _, _, s2, _ = self.memory.sample(batch_size=self.config.sup_batch_size)
            loss = self.action_rep.unsupervised_loss(s1, a1.view(-1), s2)
            self.action_rep.update(loss)
            # soft_update(self.target_action_rep, self.action_rep, self.config.tau)

            # quick check for convergence, break
            loss = loss.item()
            if prv_loss - loss < eps:
                break

            prv_loss = loss


    def initial_phase_training(self, max_epochs=-1):
        if self.batch_norm: self.actor.train()  # Set the actor to training mode. Required for Batchnorm

        # change optimizer to Adam for unsupervised learning
        self.action_rep.optim = torch.optim.Adam(self.action_rep.parameters(), lr=1e-3)
        initial_losses = []

        print("Inital training phase started...")
        for counter in range(max_epochs):
            losses = []
            for s1, a1, _, _, s2, _ in self.memory.batch_sample(batch_size=self.config.sup_batch_size,
                                                                randomize=True):
                loss_act_rep = self.action_rep.unsupervised_loss(s1, a1, s2)
                self.action_rep.update(loss_act_rep)
                losses.append(loss_act_rep.item())

            initial_losses.append(np.mean(losses))
            if counter % 1 == 0:
                print("Epoch {} loss:: {}".format(counter, np.mean(initial_losses[-10:])))
                if self.config.only_phase_one:
                    self.save()
                    print("Saved..")

            # Terminate initial phase once action representations have converged.
            if len(initial_losses) >= 20 and np.mean(initial_losses[-10:]) + 1e-5 >= np.mean(initial_losses[-20:]):
                print("Converged...")
                break

        # Reset the optim to whatever is there in config
        self.action_rep.optim = self.config.optim(self.action_rep.parameters(), lr=self.config.embed_lr)

        print('... Initial training phase terminated!')
        self.initial_phase = False
        self.save()

        if self.config.only_phase_one:
            exit()

        # hard_update(self.target_action_rep, self.action_rep)  # target_action_rep is commented out in __init__ above; calling this would raise AttributeError
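
Both DPG variants (Examples #5 and #8) draw exploration noise from OrnsteinUhlenbeckActionNoise(dim), whose implementation is not part of this page. A minimal sketch of the standard Ornstein-Uhlenbeck process used in DDPG-style agents; the mu, theta and sigma defaults are assumptions:

import numpy as np

class OrnsteinUhlenbeckActionNoise:
    """Temporally correlated exploration noise; sample() returns a NumPy vector of length action_dim."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.X = np.ones(action_dim) * mu

    def reset(self):
        self.X = np.ones(self.action_dim) * self.mu

    def sample(self):
        dx = self.theta * (self.mu - self.X) + self.sigma * np.random.randn(self.action_dim)
        self.X = self.X + dx
        return self.X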