Example #1
    def __init__(self, params, name, task, load_path=None):
        super(DoubleQ, self).__init__(params, name, task)
        self.dual = self.vPars['dual']
        if self.trainMode:
            if self.dual:
                self.tarNet = DualNetwork(self.vPars, self.vTrain)
                self.valueNet = DualNetwork(self.vPars, self.vTrain)
            else:
                self.tarNet = Network(self.vPars, self.vTrain)
                self.valueNet = Network(self.vPars, self.vTrain)
                for target_param, param in zip(self.tarNet.parameters(), self.valueNet.parameters()):
                    target_param.data.copy_(param.data)
        else:
            self.valueNet = Network(self.vPars, self.vTrain)
            paths = ['/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/model_training_data/hierarchical_q_policy2.txt']
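            # With no explicit load_path, each checkpoint in `paths` is loaded into
            # its own value network, forming a small ensemble; otherwise the single
            # checkpoint at load_path is restored into self.valueNet.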
            if not load_path:
                self.valueNet = []
                for path in paths:
                    self.valueNet.append(Network(self.vPars, self.vTrain))
                    self.valueNet[-1].load_state_dict(torch.load(path))
            else:
                self.valueNet.load_state_dict(torch.load(load_path))
        self.out_n = self.vPars['neurons'][-1]
        self.replaceCounter = 0
        self.valueLoss = []
        self.avgLoss = 0
        self.expSize = self.vTrain['buffer']
        self.exp = Memory(size = self.expSize)

        self.priority = self.vTrain['priority']
        self.priorities = []
        self.beta = .5
        self.alpha = .7

        self.double = self.vTrain['double']
        if 'noise' in self.vTrain:
            self.noise = self.vTrain['noise']
        else:
            self.noise = 0
        

        task.initAgent(self)
    
        if not load_path:
            # Spin until self.stop is set elsewhere (e.g., by a done callback), then run post-training.
            while not self.stop:
                x = 1 + 1
            task.postTraining()
    def __init__(self, params, name, task, load_path=None):
        super(DoubleQ, self).__init__(params, name, task)
        self.dual = self.vPars['dual']
        if self.trainMode:
            if self.dual:
                self.tarNet = DualNetwork(self.vPars, self.vTrain)
                self.valueNet = DualNetwork(self.vPars, self.vTrain)
            else:
                self.tarNet = Network(self.vPars, self.vTrain)
                self.valueNet = Network(self.vPars, self.vTrain)
                for target_param, param in zip(self.tarNet.parameters(),
                                               self.valueNet.parameters()):
                    target_param.data.copy_(param.data)
        else:
            self.valueNet = Network(self.vPars, self.vTrain)
            self.valueNet.load_state_dict(torch.load(load_path))
        self.out_n = self.vPars['neurons'][-1]
        self.replaceCounter = 0
        self.valueLoss = []
        self.avgLoss = 0
        self.expSize = self.vTrain['buffer']
        self.exp = Memory(size=self.expSize)
        self.beta = self.vPars['beta']

        self.priority = self.vTrain['priority']
        self.priorities = []
        self.alpha = .7

        self.double = self.vTrain['double']
        self.update_target_network = self.vTrain['update_target_network_every']
        if 'noise' in self.vTrain:
            self.noise = self.vTrain['noise']
        else:
            self.noise = 0

        task.initAgent(self)

        if not load_path:
            while (not self.stop):
                x = 1 + 1
            task.postTraining()
    def __init__(self, params, name, task):
        self.name = name
        self.task = task
        self.vTrain = params['valTrain']
        self.vPars = params['valPars']
        self.aTrain = params['actTrain']
        self.aPars = params['actPars']
        self.agents = params['agents']

        self.pubs = {}
        for key in self.agents.keys():
            bot = self.agents[key]
            self.pubs[key] = rospy.Publisher(bot['pub'], Vector3, queue_size=1)
        rospy.Subscriber("/finished", Int8, self.receiveDone, queue_size=1)

        self.valueLoss = []
        self.actorLoss = []

        self.h_state_n = self.aPars['h_state_n']
        self.x_state_n = self.aPars['x_state_n']
        self.u_n = self.aPars['u_n']
        self.clip_grad_norm = self.aTrain['clip']
        self.homogenous = self.aPars['share_params']

        self.critic = Network(self.vPars, self.vTrain).to(device)
        self.target = Network(self.vPars, self.vTrain).to(device)
        if self.homogenous:
            self.actor = SoftNetwork(self.aPars, self.aTrain).to(device)
        else:
            self.actor = [
                SoftNetwork(self.aPars, self.aTrain)
                for i in range(len(self.agents))
            ]

        for target_param, param in zip(self.target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.clip_grad_norm = self.aTrain['clip']
        self.trainMode = self.vPars['trainMode']
        self.batch_size = self.vTrain['batch_size']
        self.discount = self.vTrain['gamma']
        self.range = self.aPars['mean_range']
        self.td_lambda = .8
        self.tau = .005
        self.lower_bound = self.aTrain['clamp'][2]
        self.stop = False
        self.trained = False

        self.exp = Memory()

        self.totalSteps = 0

        self.reset()

        task.initAgent(self)

        while (not self.stop):
            x = 1 + 1

        task.postTraining()
class CounterContinuous(object):
    def __init__(self, params, name, task):
        self.name = name
        self.task = task
        self.vTrain = params['valTrain']
        self.vPars = params['valPars']
        self.aTrain = params['actTrain']
        self.aPars = params['actPars']
        self.agents = params['agents']

        self.pubs = {}
        for key in self.agents.keys():
            bot = self.agents[key]
            self.pubs[key] = rospy.Publisher(bot['pub'], Vector3, queue_size=1)
        rospy.Subscriber("/finished", Int8, self.receiveDone, queue_size=1)

        self.valueLoss = []
        self.actorLoss = []

        self.h_state_n = self.aPars['h_state_n']
        self.x_state_n = self.aPars['x_state_n']
        self.u_n = self.aPars['u_n']
        self.clip_grad_norm = self.aTrain['clip']
        self.homogenous = self.aPars['share_params']

        self.critic = Network(self.vPars, self.vTrain).to(device)
        self.target = Network(self.vPars, self.vTrain).to(device)
        if self.homogenous:
            self.actor = SoftNetwork(self.aPars, self.aTrain).to(device)
        else:
            self.actor = [
                SoftNetwork(self.aPars, self.aTrain)
                for i in range(len(self.agents))
            ]

        for target_param, param in zip(self.target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.clip_grad_norm = self.aTrain['clip']
        self.trainMode = self.vPars['trainMode']
        self.batch_size = self.vTrain['batch_size']
        self.discount = self.vTrain['gamma']
        self.range = self.aPars['mean_range']
        self.td_lambda = .8
        self.tau = .005
        self.lower_bound = self.aTrain['clamp'][2]
        self.stop = False
        self.trained = False

        self.exp = Memory()

        self.totalSteps = 0

        self.reset()

        task.initAgent(self)

        while (not self.stop):
            x = 1 + 1

        task.postTraining()

    def receiveDone(self, message):
        if message.data == 1:  #all iterations are done. Check manager.py
            self.stop = True
        if message.data == 2:  #timed out. Check manager.py
            self.task.restartProtocol(restart=1)

    def get_action(self, s_true, s_split):
        if self.homogenous:
            a1, log_prob1, z, mu1, log_std1 = self.actor(
                torch.FloatTensor(s_split[0]))
            a2, log_prob2, z, mu2, log_std2 = self.actor(
                torch.FloatTensor(s_split[1]))
        else:  # TODO: Fix this below:
            a1, h_new1, log_prob1, mu1, std1 = self.actor[0](torch.FloatTensor(
                s_split[0]), self.h[0])
            a2, h_new2, log_prob2, mu2, std2 = self.actor[1](torch.FloatTensor(
                s_split[1]), self.h[1])
        # Note: the detached NumPy copies below are currently unused; the raw action tensors are returned.
        action = [a1.detach().numpy().ravel(), a2.detach().numpy().ravel()]
        return [a1, a2]

    def choose(self, policies):
        m = Categorical(policies)
        action = m.sample()
        action = action.data.cpu().numpy()
        return np.asscalar(action)

    def saveModel(self):
        pass

    def store(self, s, a, r, sprime, aprime, done, local, next_local):
        self.exp.push(s, a, r, 1 - done, aprime, sprime, local, next_local)

    def reset(self):
        curr = self.actor.clamp[0]
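        # After an episode in which training occurred, move the actor's lower clamp
        # bound 5% of the way toward self.lower_bound (never past it).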
        if self.trained:
            new = max(self.lower_bound, .05 * self.lower_bound + .95 * curr)
            self.actor.clamp = (new, self.actor.clamp[1], self.lower_bound)
        self.trained = False
        return

    def get_grad_norm(self, model):
        total_norm = 0
        for p in model.parameters():
            if p.grad is None:
                continue
            param_norm = p.grad.data.norm(2)
            total_norm += param_norm.item()**2
        grad_norm = total_norm**(1. / 2)
        return grad_norm

    def get_lambda_targets(self, rewards, mask, gamma, target_qs):
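        # Backward recursion for TD(lambda) targets:
        #   ret[t] = lambda * gamma * ret[t+1] + mask[t] * (r[t] + (1 - lambda) * gamma * Q_tar[t+1])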
        target_qs = target_qs.squeeze()
        ret = target_qs.new_zeros(*target_qs.shape)
        ret[-1] = target_qs[-1] * mask[-1]

        for t in range(ret.size()[0] - 2, -1, -1):
            ret[t] = self.td_lambda * gamma * ret[t + 1] + \
                mask[t] * (rewards[t] + (1 - self.td_lambda) * gamma * target_qs[t + 1])
        return ret.unsqueeze(1)

    def zipStack(self, data):
        data = zip(*data)
        data = [torch.stack(d).squeeze().to(device) for d in data]
        return data

    def monte_carlo(self, mean, std, n=500):
        # returns tensors representing n sampled from mean and std
        normal = Normal(mean, std)
        return normal.sample((n, ))

    def train(self, episode_done=False):
        if len(self.exp) >= 500:

            transition = self.exp.sample(self.batch_size)
            states = torch.squeeze(torch.Tensor(transition.state)).to(device)
            actions = self.zipStack(transition.action)
            rewards = torch.Tensor(transition.reward).to(device)
            states_next = torch.squeeze(torch.Tensor(
                transition.next_state)).to(device)
            masks = torch.Tensor(transition.mask).to(device)
            local = self.zipStack(transition.local)
            next_local = self.zipStack(transition.next_local)

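            # Critic update: join the next global state with both agents' next actions
            # (sampled from the current policy), form a one-step TD target with the
            # target network, and regress the centralized critic on the stored
            # state-action pair.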
            actions_next = []
            for s in next_local:
                a, log_prob, _, _, _ = self.actor(s)
                actions_next.append(a.detach())
            inp = torch.cat((states_next, actions_next[0], actions_next[1]),
                            dim=1)
            q_tar = rewards.unsqueeze(
                1) + self.discount * masks.unsqueeze(1) * self.target(inp)
            inp = torch.cat((states, actions[0].detach(), actions[1].detach()),
                            dim=1)
            q = self.critic(inp)
            loss = self.critic.get_loss(q, q_tar.detach())
            self.critic.optimizer.zero_grad()
            loss.backward()
            self.critic.optimizer.step()
            self.valueLoss.append(loss)

            actor_loss = 0
            actions = []
            means = []
            log_stds = []
            log_probs = []
            for s in local:
                a, log_prob, z, mu, log_std = self.actor(s)
                actions.append(a)
                means.append(mu)
                log_stds.append(log_std)
                log_probs.append(log_prob)

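            # COMA-style actor update: for each agent, the counterfactual baseline is
            # the critic averaged over Monte Carlo samples of that agent's own action
            # (the other agent's action held fixed), and the resulting advantage
            # weights the log-probability of the taken action.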
            # train first agent
            inp = torch.cat((states, actions[0], actions[1].detach()), dim=1)
            q_out = self.critic(inp)
            samples = self.monte_carlo(means[0], log_stds[0].exp())
            samples = self.range * torch.tanh(samples)
            repeat_s = states.unsqueeze(0)
            repeat_s = repeat_s.expand(samples.size()[0],
                                       repeat_s.size()[1],
                                       repeat_s.size()[2])
            repeat_a = actions[1].unsqueeze(0)
            repeat_a = repeat_a.expand(samples.size()[0],
                                       repeat_a.size()[1],
                                       repeat_a.size()[2])
            inp = torch.cat((repeat_s, samples, repeat_a), dim=2)
            baseline = self.critic(inp).mean(0)
            coma = (q_out - baseline).detach()
            actor_loss -= (log_probs[0].view(coma.size()) * (coma)).mean()

            # train second agent
            inp = torch.cat((states, actions[0].detach(), actions[1]), dim=1)
            q_out = self.critic(inp)
            samples = self.monte_carlo(means[1], log_stds[1].exp())
            samples = self.range * torch.tanh(samples)
            repeat_a = actions[0].unsqueeze(0)
            repeat_a = repeat_a.expand(samples.size()[0],
                                       repeat_a.size()[1],
                                       repeat_a.size()[2])
            inp = torch.cat((repeat_s, repeat_a, samples), dim=2)
            baseline = self.critic(inp).mean(0)
            coma = (q_out - baseline).detach()
            actor_loss -= (log_probs[1].view(coma.size()) * (coma)).mean()

            if self.homogenous:
                self.actor.optimizer.zero_grad()
                actor_loss.backward()
                self.actor.optimizer.step()
            else:
                for actor in self.actor:
                    actor.optimizer.zero_grad()
                actor_loss.backward()
                for actor in self.actor:
                    torch.nn.utils.clip_grad_norm_(actor.parameters(),
                                                   self.clip_grad_norm)
                    actor.optimizer.step()
            self.totalSteps += 1
            self.trained = True

            #UPDATE TARGET NETWORK:
            if self.totalSteps % 50 == 0:
                for target_param, param in zip(self.target.parameters(),
                                               self.critic.parameters()):
                    target_param.data.copy_(param.data)
            return
class DoubleQ(Agent):
    def __init__(self, params, name, task, load_path=None):
        super(DoubleQ, self).__init__(params, name, task)
        self.dual = self.vPars['dual']
        if self.trainMode:
            if self.dual:
                self.tarNet = DualNetwork(self.vPars, self.vTrain)
                self.valueNet = DualNetwork(self.vPars, self.vTrain)
            else:
                self.tarNet = Network(self.vPars, self.vTrain)
                self.valueNet = Network(self.vPars, self.vTrain)
                for target_param, param in zip(self.tarNet.parameters(),
                                               self.valueNet.parameters()):
                    target_param.data.copy_(param.data)
        else:
            self.valueNet = Network(self.vPars, self.vTrain)
            self.valueNet.load_state_dict(torch.load(load_path))
        self.out_n = self.vPars['neurons'][-1]
        self.replaceCounter = 0
        self.valueLoss = []
        self.avgLoss = 0
        self.expSize = self.vTrain['buffer']
        self.exp = Memory(size=self.expSize)
        self.beta = self.vPars['beta']

        self.priority = self.vTrain['priority']
        self.priorities = []
        self.alpha = .7

        self.double = self.vTrain['double']
        self.update_target_network = self.vTrain['update_target_network_every']
        if 'noise' in self.vTrain:
            self.noise = self.vTrain['noise']
        else:
            self.noise = 0

        task.initAgent(self)

        if not load_path:
            while (not self.stop):
                x = 1 + 1
            task.postTraining()

    def saveModel(self):
        torch.save(
            self.valueNet.state_dict(),
            '/home/jimmy/Documents/Research/AN_Bridging/results/hierarchical_q_policy2.txt'
        )
        pass

    def store(self, s, a, r, sprime, aprime, done):
        self.exp.push(s, a, r, 1 - done, aprime, sprime)
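        # Keep the priority list in lockstep with the replay buffer: new transitions
        # start at priority 1, and the oldest entry is dropped once the list is full.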
        if len(self.priorities) < self.expSize:
            self.priorities.append(1)
        else:
            self.priorities = self.priorities[1:]
            self.priorities.append(1)

    def get_q(self, s):
        if type(self.valueNet) == list:
            model_index = np.random.randint(len(self.valueNet))
            net = self.valueNet[model_index]
        else:
            net = self.valueNet
        q = net(torch.FloatTensor(s))
        q = q.detach()
        return q

    def get_action(self, s, testing_time=False, probabilistic=False):
        i = np.random.random()
        if i < self.explore and self.trainMode and not testing_time:
            index = np.random.randint(self.out_n)
        else:
            q = self.get_q(s)
            if probabilistic:
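                # Boltzmann (softmax) exploration: shift Q-values for numerical
                # stability, turn them into a distribution with inverse temperature
                # beta, and sample an action index from it.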
                q = q.numpy()
                q = q - np.max(q)
                probs = np.exp(q * self.beta)
                probs = probs / np.sum(probs)
                index = np.random.choice(q.size, p=probs.ravel())
                # print('probability chosen ', probs.ravel()[index])
            else:
                index = np.argmax(q.numpy())
        self.explore = max(.2, self.explore * .9997)
        return index

    def get_q_and_q_tar(self, states, actions, nextStates, rewards, masks):
        qValues = self.valueNet(
            torch.FloatTensor(states).squeeze(1))  #pass in. Processing implied
        q = torch.gather(
            qValues, 1,
            torch.LongTensor(actions).unsqueeze(1))  #get q values of actions
        qnext = self.tarNet(torch.FloatTensor(nextStates))
        qnext = qnext.squeeze(1).detach()  #pass in

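        # Double DQN: select the next action with the online network's argmax but
        # evaluate it with the target network to reduce overestimation; otherwise use
        # the plain max over the target network's Q-values.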
        if self.double:
            qNextDouble = self.valueNet(torch.FloatTensor(nextStates))
            qNextDouble = qNextDouble.squeeze(1).detach()  #pass in
            qnext = torch.gather(
                qnext, 1, torch.LongTensor(qNextDouble.argmax(1).unsqueeze(1)))
            qtar = torch.FloatTensor(rewards).squeeze(
                1) + self.discount * torch.Tensor(masks).unsqueeze(1) * qnext
        else:
            qtar = torch.FloatTensor(rewards) + self.discount * torch.Tensor(
                masks).unsqueeze(1) * qnext.max(1)[0].view(
                    self.batch_size, 1)  #calculate target
        return q, qtar

    def train(self, override=False):
        if len(self.exp) >= 500 or override:
            if self.priority:
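                # Prioritized replay: sample each transition with probability
                # proportional to priority^alpha, weight its loss by
                # (N * priority)^(-beta) normalized by the largest weight, and refresh
                # its priority with the new |TD error|.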
                loss = 0
                weights = []
                errors = []
                assert len(self.priorities) == len(self.exp)
                for i in range(self.batch_size):
                    probs = np.array(
                        [math.pow(p, self.alpha) for p in self.priorities])
                    probs = probs / np.sum(probs)
                    choice = np.random.choice(len(self.priorities),
                                              p=probs,
                                              size=1)
                    weights.append(
                        math.pow(
                            len(self.priorities) *
                            self.priorities[int(np.asscalar(choice))],
                            -self.beta))
                    states, actions, rewards, masks, _, nextStates, _, _, _ = self.exp.get_transitions(
                        choice)
                    q, qtar = self.get_q_and_q_tar(states, actions, nextStates,
                                                   rewards, masks)
                    td = qtar - q
                    self.priorities[int(np.asscalar(choice))] = abs(td[:, 0])
                    errors.append(self.valueNet.get_loss(q, qtar))
                max_weight = max(weights)
                weights = [w / max_weight for w in weights]
                val_loss = sum([w * e for w, e in zip(weights, errors)])

            else:
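                # Uniform replay: sample a minibatch, sync the target network every
                # update_target_network steps, optionally perturb states with Gaussian
                # noise, and compute the standard TD loss.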
                states, actions, rewards, masks, _, nextStates, _, _, _ = self.exp.sample(
                    batch=self.batch_size)

                if self.replaceCounter % self.update_target_network == 0:
                    self.tarNet.load_state_dict(self.valueNet.state_dict())
                    self.replaceCounter = 0

                if self.noise:
                    states = np.array(states)
                    states = states + np.random.normal(0, self.noise,
                                                       states.shape)

                q, qtar = self.get_q_and_q_tar(states, actions, nextStates,
                                               rewards, masks)
                val_loss = self.valueNet.get_loss(q, qtar)

            self.valueNet.optimizer.zero_grad()
            val_loss.backward()
            self.valueNet.optimizer.step()

            self.replaceCounter += 1
            self.totalSteps += 1
            return val_loss
Example #6
    def __init__(self, params, name, task):
        self.name = name
        self.task = task
        self.vTrain = params['valTrain']
        self.vPars = params['valPars']
        self.aTrain = params['actTrain']
        self.aPars = params['actPars']
        self.agents = params['agents']

        self.pubs = OrderedDict()
        for key in self.agents.keys():
            bot = self.agents[key]
            self.pubs[key] = rospy.Publisher(bot['pub'], Vector3, queue_size=1)
        rospy.Subscriber("/finished", Int8, self.receiveDone, queue_size=1)

        self.valueLoss = []
        self.actorLoss = []

        self.h_state_n = self.aPars['h_state_n']
        self.x_state_n = self.aPars['x_state_n']
        self.u_n = self.aPars['u_n']
        self.clip_grad_norm = self.aTrain['clip']
        self.homogenous = self.aPars['share_params']

        self.critic = Network(self.vPars, self.vTrain).to(device)
        self.target = Network(self.vPars, self.vTrain).to(device)
        if self.homogenous:
            self.actor = CounterActor(self.aPars, self.aTrain).to(device)
        else:
            self.actor = [
                CounterActor(self.aPars, self.aTrain)
                for i in range(len(self.agents))
            ]

        for target_param, param in zip(self.target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.clip_grad_norm = self.aTrain['clip']
        self.trainMode = self.vPars['trainMode']
        self.batch_size = self.vTrain['batch']
        self.discount = self.vTrain['gamma']
        self.temp_second = None
        self.temp_first = None
        self.td_lambda = 0  # TEST: this is because we are doing ER off-policy
        self.tau = .01
        self.stop = False
        self.trained = False

        self.exp = Memory()

        self.totalSteps = 0

        self.reset()

        task.initAgent(self)

        while (not self.stop):
            x = 1 + 1

        task.postTraining()
Example #7
class Counter(object):
    def __init__(self, params, name, task):
        self.name = name
        self.task = task
        self.vTrain = params['valTrain']
        self.vPars = params['valPars']
        self.aTrain = params['actTrain']
        self.aPars = params['actPars']
        self.agents = params['agents']

        self.pubs = OrderedDict()
        for key in self.agents.keys():
            bot = self.agents[key]
            self.pubs[key] = rospy.Publisher(bot['pub'], Vector3, queue_size=1)
        rospy.Subscriber("/finished", Int8, self.receiveDone, queue_size=1)

        self.valueLoss = []
        self.actorLoss = []

        self.h_state_n = self.aPars['h_state_n']
        self.x_state_n = self.aPars['x_state_n']
        self.u_n = self.aPars['u_n']
        self.clip_grad_norm = self.aTrain['clip']
        self.homogenous = self.aPars['share_params']

        self.critic = Network(self.vPars, self.vTrain).to(device)
        self.target = Network(self.vPars, self.vTrain).to(device)
        if self.homogenous:
            self.actor = CounterActor(self.aPars, self.aTrain).to(device)
        else:
            self.actor = [
                CounterActor(self.aPars, self.aTrain)
                for i in range(len(self.agents))
            ]

        for target_param, param in zip(self.target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.clip_grad_norm = self.aTrain['clip']
        self.trainMode = self.vPars['trainMode']
        self.batch_size = self.vTrain['batch']
        self.discount = self.vTrain['gamma']
        self.temp_second = None
        self.temp_first = None
        self.td_lambda = 0  # TEST: this is because we are doing ER off-policy
        self.tau = .01
        self.stop = False
        self.trained = False

        self.exp = Memory()

        self.totalSteps = 0

        self.reset()

        task.initAgent(self)

        while (not self.stop):
            x = 1 + 1

        task.postTraining()

    def receiveDone(self, message):
        if message.data == 1:  #all iterations are done. Check manager.py
            self.stop = True
        if message.data == 2:  #timed out. Check manager.py
            self.task.restartProtocol(restart=1)

    def get_action(self, s_true, s_split):
        if self.homogenous:
            policy1 = self.actor(torch.FloatTensor(s_split[0]))
            a1 = np.asscalar(self.choose(policy1))
            policy2 = self.actor(torch.FloatTensor(s_split[1]))
            a2 = np.asscalar(self.choose(policy2))
        else:
            policy1 = self.actor[0](torch.FloatTensor(s_split[0]))
            a1 = self.choose(policy1)
            policy2 = self.actor[1](torch.FloatTensor(s_split[1]))
            a2 = self.choose(policy2)
        # THIS IS A TEST
        a1 = 0
        #print(policy1)
        #print(policy2)
        #print('')
        return [a1, a2]

    def choose(self, policies):
        m = Categorical(policies)
        action = m.sample()
        action = action.data.cpu().numpy()
        return action

    def saveModel(self):
        pass

    def store(self, s, a, r, sprime, aprime, done, local, next_local):
        self.exp.push(s, a, r, 1 - done, aprime, sprime, local, next_local,
                      None)

    def reset(self):
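        # End-of-episode housekeeping: run one training pass, decay the actor's
        # exploration epsilon if training occurred, and reset the recurrent hidden
        # state and previous actions for each agent.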
        self.train(True)
        if self.trained:
            self.actor.eps = max(.05, self.actor.eps - .003)
        self.trained = False
        self.temp_first, self.temp_second = (None, None)
        self.h = [
            torch.zeros((1, 1, self.h_state_n))
            for i in range(len(self.agents))
        ]
        self.prevAction = [-1, -1]
        return

    def zipStack(self, data):
        data = zip(*data)
        data = [torch.stack(d).squeeze().to(device) for d in data]
        return data

    def get_lambda_targets(self, rewards, mask, gamma, target_qs):
        target_qs = target_qs.squeeze()
        ret = target_qs.new_zeros(*target_qs.shape)
        ret[-1] = rewards[-1] + target_qs[-1] * mask[-1]

        for t in range(ret.size()[0] - 2, -1, -1):
            ret[t] = mask[t] * (self.td_lambda * gamma * ret[t + 1]) + (
                rewards[t] +
                (1 - self.td_lambda) * gamma * target_qs[t] * mask[t])
        return ret.unsqueeze(1)

    def train(self, episode_done=False):
        if len(self.exp) > self.batch_size:
            transition = self.exp.sample(self.batch_size)
            states = torch.squeeze(torch.Tensor(transition.state)).to(device)
            states_next = torch.squeeze(torch.Tensor(
                transition.next_state)).to(device)
            actions = torch.Tensor(transition.action).float().to(device)
            rewards = torch.Tensor(transition.reward).to(device)
            masks = torch.Tensor(transition.mask).to(device)
            local = self.zipStack(transition.local)
            next_local = self.zipStack(transition.next_local)

            actions_next = []
            for s in next_local:
                next_policy = self.actor(s)
                next_action = self.choose(next_policy)
                actions_next.append(torch.Tensor(next_action))
            '''# Critic Update
            ID = torch.Tensor(states.size()[0], 1).fill_(-1)
            inp = torch.cat((states_next, actions_next[1].unsqueeze(1), ID), dim = 1)
            q_tar = self.target(inp).detach().gather(1, actions_next[0].long().unsqueeze(1))
            q_tar = self.get_lambda_targets(rewards.squeeze(), masks.squeeze(), self.discount, q_tar)
            inp = torch.cat((states, actions[:, 1].unsqueeze(1), ID), dim = 1)
            q = self.critic(inp)
            q = q.gather(1, actions[:, 0].long().unsqueeze(1))
            loss = self.critic.get_loss(q, q_tar)
            self.critic.optimizer.zero_grad()
            loss.backward()
            self.critic.optimizer.step()'''

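            # Critic update for the second agent (the triple-quoted block above is the
            # first agent's update, left disabled): condition on the global state, the
            # other agent's next action, and an agent ID, gather the Q-value of the
            # taken action, and regress toward a lambda return from the target network.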
            ID = torch.Tensor(states.size()[0], 1).fill_(1)
            inp = torch.cat((states_next, actions_next[0].unsqueeze(1), ID),
                            dim=1)
            q_tar = self.target(inp).detach().gather(
                1, actions_next[1].long().unsqueeze(1))  # .max(1)?
            q_tar = self.get_lambda_targets(rewards.squeeze(), masks.squeeze(),
                                            self.discount, q_tar)
            inp = torch.cat((states, actions[:, 0].unsqueeze(1), ID), dim=1)
            q = self.critic(inp)
            q = q.gather(1, actions[:, 1].long().unsqueeze(1))
            loss = self.critic.get_loss(q, q_tar)
            self.critic.optimizer.zero_grad()
            loss.backward()
            self.critic.optimizer.step()

            actor_loss = 0
            # Actor Update. Consider doing new_actions
            policies = []
            new_actions = []
            for s in local:
                policy = self.actor(s)
                policies.append(policy)
                new_action = self.choose(policy)
                new_actions.append(torch.Tensor(new_action))
            '''ID = torch.Tensor(states.size()[0], 1).fill_(-1)
            inp = torch.cat((states, new_actions[1].unsqueeze(1), ID), dim = 1)
            q_out = self.critic(inp) #batch x num_actions
            policy = policies[0] #batch x num_actions
            mult = q_out * policy
            baseline = torch.sum(mult, 1).unsqueeze(1)
            q_taken = q_out.gather(1, new_actions[0].long().unsqueeze(1))
            coma = (q_taken - baseline).detach()
            probs_taken = policy.gather(1, new_actions[0].long().unsqueeze(1))
            loss = -(torch.log(probs_taken) * coma).mean()
            actor_loss += loss '''

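            # COMA actor update for the second agent (the first agent's update above is
            # disabled): the baseline is the policy-weighted sum of the critic's
            # per-action outputs, and the advantage of the sampled action scales its
            # log-probability.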
            ID = torch.Tensor(states.size()[0], 1).fill_(1)
            inp = torch.cat((states, new_actions[0].unsqueeze(1), ID), dim=1)
            q_out = self.critic(inp)  #batch x num_actions
            policy = policies[1]  #batch x num_actions
            mult = q_out * policy
            baseline = torch.sum(mult, 1).unsqueeze(1)
            q_taken = q_out.gather(1, new_actions[1].long().unsqueeze(1))
            coma = (q_taken - baseline).detach()
            probs_taken = policy.gather(1, new_actions[1].long().unsqueeze(1))
            loss = -(torch.log(probs_taken) * coma).mean()
            actor_loss += loss

            self.actorLoss.append(actor_loss)

            if self.homogenous:
                self.actor.optimizer.zero_grad()
                actor_loss.backward()
                nn.utils.clip_grad_norm_(self.actor.parameters(), 1)
                self.actor.optimizer.step()
            else:
                for actor in self.actor:
                    actor.optimizer.zero_grad()
                actor_loss.backward()
                for actor in self.actor:
                    actor.optimizer.step()

            self.totalSteps += 1
            # self.exp = Memory()
            self.trained = True

            #UPDATE TARGET NETWORK:
            if self.totalSteps % 1 == 0:  # THIS IS A TEST
                for target_param, param in zip(self.target.parameters(),
                                               self.critic.parameters()):
                    target_param.data.copy_((1 - self.tau) * target_param +
                                            self.tau * param.data)

            return
Example #8
class DoubleQ(Agent):
    def __init__(self, params, name, task, load_path=None):
        super(DoubleQ, self).__init__(params, name, task)
        self.dual = self.vPars['dual']
        if self.trainMode:
            if self.dual:
                self.tarNet = DualNetwork(self.vPars, self.vTrain)
                self.valueNet = DualNetwork(self.vPars, self.vTrain)
            else:
                self.tarNet = Network(self.vPars, self.vTrain)
                self.valueNet = Network(self.vPars, self.vTrain)
                for target_param, param in zip(self.tarNet.parameters(),
                                               self.valueNet.parameters()):
                    target_param.data.copy_(param.data)
        else:
            self.valueNet = Network(self.vPars, self.vTrain)
            paths = [
                '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/push_in_hole.txt',
                '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/push_in_hole2.txt'
            ]
            if not load_path:
                self.valueNet = []
                for path in paths:
                    self.valueNet.append(Network(self.vPars, self.vTrain))
                    self.valueNet[-1].load_state_dict(torch.load(path))
            else:
                self.valueNet.load_state_dict(torch.load(load_path))
        self.out_n = self.vPars['neurons'][-1]
        self.replaceCounter = 0
        self.valueLoss = []
        self.avgLoss = 0
        self.expSize = self.vTrain['buffer']
        self.exp = Memory(size=self.expSize)
        self.double = self.vTrain['double']

        task.initAgent(self)

        if not load_path:
            while (not self.stop):
                x = 1 + 1
            task.postTraining()

    def saveModel(self):
        torch.save(
            self.valueNet.state_dict(),
            '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/box_push_hierarchical_q_policy.txt'
        )
        pass

    def store(self, s, a, r, sprime, aprime, done):
        self.exp.push(s, a, r, 1 - done, aprime, sprime)

    def get_action(self, s):
        i = np.random.random()
        if i < self.explore and self.trainMode:
            index = np.random.randint(self.out_n)
        else:
            if type(self.valueNet) == list:
                model_index = np.random.randint(len(self.valueNet))
                net = self.valueNet[model_index]
            else:
                net = self.valueNet
            q = net(torch.FloatTensor(s))
            #print(q)
            q = q.detach()
            index = np.argmax(q.numpy())
        self.explore = max(.1, self.explore * .9997)
        return index

    def train(self):
        if len(self.exp) >= 500:
            states, actions, rewards, masks, _, nextStates, _, _, _ = self.exp.sample(
                batch=self.batch_size)

            if self.replaceCounter % 500 == 0:  # THIS IS SET TO 200 FOR BOX PUSH TASK...SLOPE IS 500
                self.tarNet.load_state_dict(self.valueNet.state_dict())
                self.replaceCounter = 0

            qValues = self.valueNet(torch.FloatTensor(states).squeeze(
                1))  #pass in. Processing implied
            q = torch.gather(qValues, 1,
                             torch.LongTensor(actions).unsqueeze(
                                 1))  #get q values of actions
            qnext = self.tarNet(torch.FloatTensor(nextStates))
            qnext = qnext.squeeze(1).detach()  #pass in

            if self.double:
                qNextDouble = self.valueNet(torch.FloatTensor(nextStates))
                qNextDouble = qNextDouble.squeeze(1).detach()  #pass in
                qnext = torch.gather(
                    qnext, 1,
                    torch.LongTensor(qNextDouble.argmax(1).unsqueeze(1)))
                qtar = torch.FloatTensor(rewards).squeeze(
                    1
                ) + self.discount * torch.Tensor(masks).unsqueeze(1) * qnext
            else:
                qtar = torch.FloatTensor(
                    rewards) + self.discount * torch.Tensor(
                        masks).unsqueeze(1) * qnext.max(1)[0].view(
                            self.batch_size, 1)  #calculate target

            val_loss = self.valueNet.get_loss(q, qtar)
            self.valueNet.optimizer.zero_grad()
            val_loss.backward()
            self.valueNet.optimizer.step()

            self.replaceCounter += 1
            self.totalSteps += 1