class DoubleQ(Agent):
    def __init__(self, params, name, task, load_path=None):
        super(DoubleQ, self).__init__(params, name, task)
        self.dual = self.vPars['dual']
        if self.trainMode:
            if self.dual:
                self.tarNet = DualNetwork(self.vPars, self.vTrain)
                self.valueNet = DualNetwork(self.vPars, self.vTrain)
            else:
                self.tarNet = Network(self.vPars, self.vTrain)
                self.valueNet = Network(self.vPars, self.vTrain)
                for target_param, param in zip(self.tarNet.parameters(),
                                               self.valueNet.parameters()):
                    target_param.data.copy_(param.data)
        else:
            self.valueNet = Network(self.vPars, self.vTrain)
            self.valueNet.load_state_dict(torch.load(load_path))
        self.out_n = self.vPars['neurons'][-1]
        self.replaceCounter = 0
        self.valueLoss = []
        self.avgLoss = 0
        self.expSize = self.vTrain['buffer']
        self.exp = Memory(size=self.expSize)
        self.beta = self.vPars['beta']

        self.priority = self.vTrain['priority']
        self.priorities = []
        self.alpha = .7

        self.double = self.vTrain['double']
        self.update_target_network = self.vTrain['update_target_network_every']
        self.noise = self.vTrain.get('noise', 0)

        task.initAgent(self)

        if not load_path:
            while not self.stop:
                pass  # spin until the task signals that training has finished
            task.postTraining()

    def saveModel(self):
        torch.save(
            self.valueNet.state_dict(),
            '/home/jimmy/Documents/Research/AN_Bridging/results/hierarchical_q_policy2.txt'
        )

    def store(self, s, a, r, sprime, aprime, done):
        self.exp.push(s, a, r, 1 - done, aprime, sprime)  # store the mask (1 - done) rather than done
        # New transitions enter with priority 1; once the list is full, drop the
        # oldest entry so the priorities stay aligned with the replay memory.
        if len(self.priorities) < self.expSize:
            self.priorities.append(1)
        else:
            self.priorities = self.priorities[1:]
            self.priorities.append(1)
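
    # Worked example (illustrative numbers, not from the original source) of the
    # proportional prioritization used in train() when self.priority is set:
    # with stored priorities [1.0, 2.0, 4.0] and alpha = 0.7,
    # p**alpha = [1.00, 1.62, 2.64], so the sampling probabilities are roughly
    # [0.19, 0.31, 0.50]. The importance-sampling weight for index 2 is then
    # (N * P(2)) ** (-beta) = (3 * 0.50) ** (-beta), about 0.82 for beta = 0.5.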

    def get_q(self, s):
        # When evaluating an ensemble of loaded networks, query one chosen at random.
        if isinstance(self.valueNet, list):
            model_index = np.random.randint(len(self.valueNet))
            net = self.valueNet[model_index]
        else:
            net = self.valueNet
        q = net(torch.FloatTensor(s))
        return q.detach()

    def get_action(self, s, testing_time=False, probabilistic=False):
        i = np.random.random()
        if i < self.explore and self.trainMode and not testing_time:
            index = np.random.randint(self.out_n)
        else:
            q = self.get_q(s)
            if probabilistic:
                q = q.numpy()
                q = q - np.max(q)
                probs = np.exp(q * self.beta)
                probs = probs / np.sum(probs)
                index = np.random.choice(q.size, p=probs.ravel())
                # print('probability chosen ', probs.ravel()[index])
            else:
                index = np.argmax(q.numpy())
        self.explore = max(.2, self.explore * .9997)
        return index
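
    # Worked example (made-up numbers) of the Boltzmann selection above: with
    # q = [1.0, 2.0, 0.5] and beta = 2, subtracting the max gives [-1.0, 0.0, -1.5],
    # exp(beta * q) = [0.135, 1.000, 0.050], and normalizing yields probabilities of
    # roughly [0.11, 0.84, 0.04], so the highest-valued action is chosen most often
    # but not always.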

    def get_q_and_q_tar(self, states, actions, nextStates, rewards, masks):
        # Q-values of the actions actually taken
        qValues = self.valueNet(torch.FloatTensor(states).squeeze(1))
        q = torch.gather(qValues, 1, torch.LongTensor(actions).unsqueeze(1))
        # target-network Q-values for the next states
        qnext = self.tarNet(torch.FloatTensor(nextStates))
        qnext = qnext.squeeze(1).detach()

        if self.double:
            # Double DQN: the online network selects the next action,
            # the target network evaluates it
            qNextDouble = self.valueNet(torch.FloatTensor(nextStates))
            qNextDouble = qNextDouble.squeeze(1).detach()
            qnext = torch.gather(qnext, 1, qNextDouble.argmax(1, keepdim=True))
            qtar = torch.FloatTensor(rewards) + self.discount * torch.Tensor(
                masks).unsqueeze(1) * qnext
        else:
            # vanilla DQN target: bootstrap from the target network's own maximum
            qtar = torch.FloatTensor(rewards) + self.discount * torch.Tensor(
                masks).unsqueeze(1) * qnext.max(1)[0].unsqueeze(1)
        return q, qtar

    def train(self, override=False):
        if len(self.exp) >= 500 or override:
            if self.priority:
                # Prioritized replay: sample one transition at a time with probability
                # proportional to priority**alpha, and weight each loss term by its
                # importance-sampling weight (N * P(i))**(-beta), normalized by the max.
                weights = []
                errors = []
                assert len(self.priorities) == len(self.exp)
                for i in range(self.batch_size):
                    probs = np.array(
                        [math.pow(p, self.alpha) for p in self.priorities])
                    probs = probs / np.sum(probs)
                    choice = np.random.choice(len(self.priorities),
                                              p=probs,
                                              size=1)
                    idx = int(choice[0])
                    weights.append(
                        math.pow(len(self.priorities) * probs[idx],
                                 -self.beta))
                    states, actions, rewards, masks, _, nextStates, _, _, _ = self.exp.get_transitions(
                        choice)
                    q, qtar = self.get_q_and_q_tar(states, actions, nextStates,
                                                   rewards, masks)
                    td = qtar - q
                    # refresh this transition's priority with its absolute TD error
                    self.priorities[idx] = abs(td).item()
                    errors.append(self.valueNet.get_loss(q, qtar))
                max_weight = max(weights)
                weights = [w / max_weight for w in weights]
                val_loss = sum(w * e for w, e in zip(weights, errors))

            else:
                states, actions, rewards, masks, _, nextStates, _, _, _ = self.exp.sample(
                    batch=self.batch_size)

                if self.replaceCounter % self.update_target_network == 0:
                    # hard update: copy the online network into the target network
                    self.tarNet.load_state_dict(self.valueNet.state_dict())
                    self.replaceCounter = 0

                if self.noise:
                    states = np.array(states)
                    states = states + np.random.normal(0, self.noise,
                                                       states.shape)

                q, qtar = self.get_q_and_q_tar(states, actions, nextStates,
                                               rewards, masks)
                val_loss = self.valueNet.get_loss(q, qtar)

            self.valueNet.optimizer.zero_grad()
            val_loss.backward()
            self.valueNet.optimizer.step()

            self.replaceCounter += 1
            self.totalSteps += 1
            return val_loss
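
# Hedged sketch (illustration only, not part of the class above): the Double DQN
# target computed in get_q_and_q_tar when self.double is True. The online network
# picks the greedy next action and the target network evaluates it. All tensors
# and the discount below are made-up example data.
import torch

batch, n_actions, discount = 4, 3, 0.99
rewards = torch.rand(batch, 1)
masks = torch.ones(batch, 1)                    # 1 - done
q_online_next = torch.rand(batch, n_actions)    # stand-in for valueNet(nextStates)
q_target_next = torch.rand(batch, n_actions)    # stand-in for tarNet(nextStates)

best_actions = q_online_next.argmax(dim=1, keepdim=True)   # selection by the online net
q_next = q_target_next.gather(1, best_actions)             # evaluation by the target net
q_tar = rewards + discount * masks * q_next                # shape (batch, 1)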
Example #2
class DoubleQ(Agent):
    def __init__(self, params, name, task, load_path=None):
        super(DoubleQ, self).__init__(params, name, task)
        self.dual = self.vPars['dual']
        if self.trainMode:
            if self.dual:
                self.tarNet = DualNetwork(self.vPars, self.vTrain)
                self.valueNet = DualNetwork(self.vPars, self.vTrain)
            else:
                self.tarNet = Network(self.vPars, self.vTrain)
                self.valueNet = Network(self.vPars, self.vTrain)
                for target_param, param in zip(self.tarNet.parameters(),
                                               self.valueNet.parameters()):
                    target_param.data.copy_(param.data)
        else:
            self.valueNet = Network(self.vPars, self.vTrain)
            # Evaluation without an explicit load_path: load an ensemble of saved
            # value networks and pick one at random per action (see get_action).
            paths = [
                '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/push_in_hole.txt',
                '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/push_in_hole2.txt'
            ]
            if not load_path:
                self.valueNet = []
                for path in paths:
                    self.valueNet.append(Network(self.vPars, self.vTrain))
                    self.valueNet[-1].load_state_dict(torch.load(path))
            else:
                self.valueNet.load_state_dict(torch.load(load_path))
        self.out_n = self.vPars['neurons'][-1]
        self.replaceCounter = 0
        self.valueLoss = []
        self.avgLoss = 0
        self.expSize = self.vTrain['buffer']
        self.exp = Memory(size=self.expSize)
        self.double = self.vTrain['double']

        task.initAgent(self)

        if not load_path:
            while not self.stop:
                pass  # spin until the task signals that training has finished
            task.postTraining()

    def saveModel(self):
        torch.save(
            self.valueNet.state_dict(),
            '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/box_push_hierarchical_q_policy.txt'
        )

    def store(self, s, a, r, sprime, aprime, done):
        self.exp.push(s, a, r, 1 - done, aprime, sprime)

    def get_action(self, s):
        i = np.random.random()
        if i < self.explore and self.trainMode:
            index = np.random.randint(self.out_n)
        else:
            if isinstance(self.valueNet, list):
                model_index = np.random.randint(len(self.valueNet))
                net = self.valueNet[model_index]
            else:
                net = self.valueNet
            q = net(torch.FloatTensor(s))
            #print(q)
            q = q.detach()
            index = np.argmax(q.numpy())
        self.explore = max(.1, self.explore * .9997)
        return index

    def train(self):
        if len(self.exp) >= 500:
            states, actions, rewards, masks, _, nextStates, _, _, _ = self.exp.sample(
                batch=self.batch_size)

            if self.replaceCounter % 500 == 0:  # sync interval: 200 for the box-push task, 500 for the slope task
                # hard update: copy the online network into the target network
                self.tarNet.load_state_dict(self.valueNet.state_dict())
                self.replaceCounter = 0

            # Q-values of the actions actually taken
            qValues = self.valueNet(torch.FloatTensor(states).squeeze(1))
            q = torch.gather(qValues, 1,
                             torch.LongTensor(actions).unsqueeze(1))
            # target-network Q-values for the next states
            qnext = self.tarNet(torch.FloatTensor(nextStates))
            qnext = qnext.squeeze(1).detach()

            if self.double:
                # Double DQN: the online network selects the next action,
                # the target network evaluates it
                qNextDouble = self.valueNet(torch.FloatTensor(nextStates))
                qNextDouble = qNextDouble.squeeze(1).detach()
                qnext = torch.gather(qnext, 1,
                                     qNextDouble.argmax(1, keepdim=True))
                qtar = torch.FloatTensor(rewards) + self.discount * torch.Tensor(
                    masks).unsqueeze(1) * qnext
            else:
                # vanilla DQN target: bootstrap from the target network's own maximum
                qtar = torch.FloatTensor(rewards) + self.discount * torch.Tensor(
                    masks).unsqueeze(1) * qnext.max(1)[0].unsqueeze(1)

            val_loss = self.valueNet.get_loss(q, qtar)
            self.valueNet.optimizer.zero_grad()
            val_loss.backward()
            self.valueNet.optimizer.step()

            self.replaceCounter += 1
            self.totalSteps += 1
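
# Hedged side calculation (not from the original source): with the schedule used in
# get_action above, explore = max(0.1, explore * 0.9997), an initial epsilon of 1.0
# reaches the 0.1 floor after roughly ln(0.1) / ln(0.9997) steps.
import math

steps_to_floor = math.log(0.1) / math.log(0.9997)
print(round(steps_to_floor))   # ~7674 action selections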
Example #3
class A2C(Agent):
    def __init__(self, params, name, task):
        super(A2C, self).__init__(params, name, task)
        self.aPars = params['actPars']
        self.aTrain = params['actTrain']
        if self.trainMode:
            self.value = Network(self.vPars, self.vTrain)
            self.policyNet = A2CNetwork(self.aPars, self.aTrain)
        else:
            # self.aPars / self.aTrain are needed here too, so they are set above the branch
            self.policyNet = Network(self.aPars, self.aTrain)
            self.policyNet.load_state_dict(
                torch.load(
                    "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/PolicyNet.txt"
                ))

        self.exp = Replay(self.batch_size)
        self.replaceCounter = 0
        self.valueLoss = []
        self.actorLoss = []
        self.avgLoss = 0
        self.avgActLoss = 0

        task.initAgent(self)

        while not self.stop:
            pass  # spin until the task signals that training has finished
        task.postTraining()

    def saveModel(self):
        #print("Network saved")
        pass

    def get_action(self, s):
        output = self.policyNet(torch.FloatTensor(s))
        # the policy head outputs the mean and log-std halves of a diagonal Gaussian
        action_mean = output[:, :int(self.out_n / 2)]
        action_logstd = output[:, int(self.out_n / 2):]
        action_std = torch.exp(action_logstd)
        action = (torch.normal(action_mean,
                               action_std).detach().numpy()).ravel()
        return action

    def train(self):
        if self.dataSize == self.batch_size:
            self.totalSteps += 1
            s, a, r, n_s, n_a, mask = self.exp.get_data()
            # done flag > .5 means the episode terminated, so zero out the mask
            mask = torch.FloatTensor(np.where(mask > .5, 0, 1))

            # Critic update: one-step TD target, with bootstrapping masked out
            # on terminal transitions
            vTar = torch.FloatTensor(r) + self.discount * mask * self.value(
                torch.FloatTensor(n_s)).detach()
            v = self.value(torch.FloatTensor(s))
            loss = self.value.loss_fnc(v, vTar)
            self.value.optimizer.zero_grad()
            loss.backward()
            self.value.optimizer.step()
            self.avgLoss += loss.item()

            # Policy update: score-function gradient weighted by the TD advantage
            advantage = (vTar - v).detach()
            out = self.policyNet(torch.FloatTensor(s))
            mean = out[:, :int(self.out_n)]
            log_std = out[:, int(self.out_n):]
            log_prob = Normal(mean, torch.exp(log_std)).log_prob(
                torch.FloatTensor(a))
            entropy = -torch.sum(torch.exp(log_prob) * log_prob)
            policy_loss = -torch.sum(log_prob * advantage) + .01 * entropy
            self.policyNet.optimizer.zero_grad()
            policy_loss.backward()
            self.policyNet.optimizer.step()
            self.avgActLoss += policy_loss.item()

            #iteration updates
            self.trainIt += 1
            self.dataSize = 0
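
# Hedged sketch (illustration only): the diagonal-Gaussian policy-gradient loss used
# in A2C.train() above, with made-up stand-in tensors. Note that the class above adds
# the entropy term with a plus sign; the usual convention subtracts it so that
# minimizing the loss encourages exploration, which is what this sketch does.
import torch
from torch.distributions import Normal

batch, act_dim = 8, 2
out = torch.randn(batch, 2 * act_dim, requires_grad=True)  # stand-in for policyNet(s)
actions = torch.randn(batch, act_dim)
advantage = torch.randn(batch, 1)

mean, log_std = out[:, :act_dim], out[:, act_dim:]
dist = Normal(mean, torch.exp(log_std))
log_prob = dist.log_prob(actions)                          # per-dimension log-likelihoods
entropy = -torch.sum(torch.exp(log_prob) * log_prob)
policy_loss = -torch.sum(log_prob * advantage) - 0.01 * entropy
policy_loss.backward()                                     # gradients flow into the policy output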
Example #4
class Twin_DDPG(Agent):
    def __init__(self, params, name, task):
        super(Twin_DDPG, self).__init__(params, name, task)
        self.aPars = params['actPars']
        self.aTrain = params['actTrain']

        if self.trainMode:
            self.values = [
                Network(self.vPars, self.vTrain),
                Network(self.vPars, self.vTrain)
            ]
            self.policyNet = TD3Network(self.aPars, self.aTrain)
            self.tarPolicy = TD3Network(self.aPars, self.aTrain)

            if self.load:
                self.load_nets()

            self.tarPolicy.load_state_dict(self.policyNet.state_dict())
            self.tar = [
                Network(self.vPars, self.vTrain),
                Network(self.vPars, self.vTrain)
            ]
            for i in range(len(self.values)):
                self.tar[i].load_state_dict(self.values[i].state_dict())
        else:
            self.policyNet = Network(self.aPars, self.aTrain)
            self.policyNet.load_state_dict(
                torch.load(
                    "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TD3_goal_policy2.txt"
                ))

        self.base = self.vTrain['baseExplore']
        self.step = self.vTrain['decay']
        self.expSize = self.vTrain['buffer']
        self.exp = Replay(self.expSize)
        self.a = self.vTrain['a']
        self.tau = self.vPars['tau']
        self.smooth = self.vTrain['smooth']
        self.clip = self.vTrain['clip']
        self.delay = self.vTrain['policy_delay']
        self.mean_range = self.aPars['mean_range']
        self.noise = OUNoise(self.out_n,
                             mu=0,
                             theta=.15,
                             max_sigma=self.explore,
                             min_sigma=self.base,
                             decay=self.step)
        self.valueLoss = []
        self.actorLoss = []
        self.avgLoss = 0
        self.avgActLoss = 0

        task.initAgent(self)

        while not self.stop:
            pass  # spin until the task signals that training has finished
        task.postTraining()

    def load_nets(self):
        path = "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TD3_goal3_"
        self.policyNet.load_state_dict(torch.load(path + "policy.txt"))
        self.values[0].load_state_dict(torch.load(path + "Qvalue1.txt"))
        self.values[1].load_state_dict(torch.load(path + "Qvalue2.txt"))

    def saveModel(self):
        path = "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TD3_goal3_"
        torch.save(self.policyNet.state_dict(), path + "policy.txt")
        torch.save(self.values[0].state_dict(), path + "Qvalue1.txt")
        torch.save(self.values[1].state_dict(), path + "Qvalue2.txt")
        print("Network saved")

    def get_action(self, s):
        output = self.policyNet(torch.FloatTensor(s))
        i = np.random.random()
        if i < self.explore[0]:
            # exploration noise (TODO in the original: replace with the OU noise above)
            noise = torch.from_numpy(np.random.normal(0, self.explore[1], 2))
            output = output + noise
        output = output.float()
        return output[0]

    def train(self):
        if self.dataSize > 500 and self.trainMode:
            #iteration updates
            self.trainIt += 1
            self.totalSteps += 1

            # Unpack the buffer and sample a random minibatch
            s, a, r, n_s, n_a, done = self.exp.get_data()

            c = np.random.choice(min(self.dataSize, self.expSize),
                                 self.batch_size)

            s = torch.FloatTensor(s[c])
            a = torch.FloatTensor(a[c])
            r = torch.FloatTensor(r[c])
            n_s = torch.FloatTensor(n_s[c])
            done = torch.FloatTensor(done[c])
            n_a = self.tarPolicy(n_s).detach()

            # target policy smoothing: perturb the target action with clipped noise
            noise = torch.FloatTensor(
                np.random.normal(0, self.smooth, n_a.shape))
            n_a = n_a + torch.clamp(noise, -self.clip, self.clip)
            n_sa = torch.cat((n_s, n_a), dim=1)
            # clipped double-Q target: minimum over the two target critics
            qtar = r + self.discount * (1 - done) * torch.min(
                self.tar[0](n_sa).detach(), self.tar[1](n_sa).detach())

            #Value update
            sa = torch.cat((s, a), dim=1)
            for qnet in self.values:
                q = qnet(sa)
                loss = qnet.loss_fnc(q, qtar)
                qnet.optimizer.zero_grad()
                loss.backward()
                qnet.optimizer.step()
                qnet.scheduler.step()
                self.avgLoss += loss / len(self.values)

            #policy update
            if self.trainIt % self.delay == 0:
                act = self.policyNet(s)
                s_a = torch.cat((s, act), 1)
                q = self.values[0](s_a)
                policy_loss = -q.mean()

                self.policyNet.optimizer.zero_grad()
                policy_loss.backward()
                self.policyNet.optimizer.step()
                self.policyNet.scheduler.step()
                self.avgActLoss += policy_loss

                for target_param, param in zip(self.tarPolicy.parameters(),
                                               self.policyNet.parameters()):
                    target_param.data.copy_(self.tau * param.data +
                                            (1.0 - self.tau) *
                                            target_param.data)

                for i in range(len(self.values)):
                    for target_param, param in zip(
                            self.tar[i].parameters(),
                            self.values[i].parameters()):
                        target_param.data.copy_(self.tau * param.data +
                                                (1.0 - self.tau) *
                                                target_param.data)
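
# Hedged sketch (illustration only): the TD3-style pieces used in Twin_DDPG.train()
# above: clipped Gaussian smoothing noise on the target action, the minimum over the
# two target critics, and a Polyak (tau) soft update. Networks and data are stand-ins.
import torch
import torch.nn as nn

state_dim, act_dim, batch = 6, 2, 8
discount, tau, smooth, clip = 0.99, 0.005, 0.2, 0.5

critic_1 = nn.Linear(state_dim + act_dim, 1)      # stand-ins for the twin target critics
critic_2 = nn.Linear(state_dim + act_dim, 1)
target_policy = nn.Linear(state_dim, act_dim)     # stand-in for tarPolicy

n_s = torch.randn(batch, state_dim)
r = torch.randn(batch, 1)
done = torch.zeros(batch, 1)

# target policy smoothing: add clipped noise to the target action
n_a = target_policy(n_s).detach()
noise = torch.clamp(torch.randn_like(n_a) * smooth, -clip, clip)
n_a = n_a + noise

n_sa = torch.cat((n_s, n_a), dim=1)
q_tar = r + discount * (1 - done) * torch.min(critic_1(n_sa), critic_2(n_sa)).detach()

# Polyak averaging of an online parameter set into its target copy
online, target = nn.Linear(4, 4), nn.Linear(4, 4)
for t_p, p in zip(target.parameters(), online.parameters()):
    t_p.data.copy_(tau * p.data + (1.0 - tau) * t_p.data)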