class HIRO(object):
    def __init__(self, params, name, task):
        self.name = name
        self.task = task
        self.vPars = params['valPars']
        self.vTrain = params['valTrain']
        self.mPars = params['mPars']
        self.mTrain = params['mTrain']
        self.wPars = params['actPars']
        self.wTrain = params['actTrain']
        self.w_vPars = params['w_vPars']
        self.w_vTrain = params['w_vTrain']
        self.agents = params['agents']

        self.pubs = {}
        for key in self.agents.keys():
            bot = self.agents[key]
            self.pubs[key] = rospy.Publisher(bot['pub'], Vector3, queue_size=1)
        rospy.Subscriber("/finished", Int8, self.receiveDone, queue_size=1)

        self.valueLoss = []

        self.manager = Network(self.mPars, self.mTrain)
        self.m_critic = Network(self.vPars, self.vTrain)
        self.m_critic_target = Network(self.vPars, self.vTrain)
        self.worker = Network(self.wPars, self.wTrain)
        self.w_critic = Network(self.w_vPars, self.w_vTrain)
        self.w_critic_target = Network(self.w_vPars, self.w_vTrain)

        self.m_discount = self.vTrain['m_gamma']
        self.w_discount = self.vTrain['w_gamma']
        self.lr = self.vTrain['lr']
        self.trainMode = self.vPars['trainMode']
        self.step = self.vTrain['step']
        self.stop = False
        self.c = self.mTrain['c']
        self.tau = .005

        self.noise = Noise(self.manager.neurons[-1], theta=.4, max_sigma=.2, min_sigma=0, decay=1)
        self.exp = Memory()
        self.temp = []
        self.totalSteps = 0
        self.soft = nn.Softmax(dim=1)

        self.reset()
        task.initAgent(self)

        while not self.stop:
            x = 1 + 1  # spin until the task signals completion via /finished

        task.postTraining()

    def receiveDone(self, message):
        if message.data == 1:  # all iterations are done; see manager.py
            self.stop = True
        if message.data == 2:  # timed out; see manager.py
            self.task.restartProtocol(restart=1)

    def get_action(self, s, s_w=None):
        s = torch.FloatTensor(s)
        if self.iteration % self.c == 0:
            self.goal = self.manager(s)
            noise = torch.FloatTensor(self.noise.get_noise())
            self.goal += noise
        else:
            # goal transition h(s, g, s') = s + g - s' keeps the absolute target fixed
            self.goal = self.prevState[:, :2] + self.goal - s[:, :2]
        self.temp_second = self.temp_first
        self.temp_first = self.goal
        self.prevState = s

        s = s[:, :6]
        inpt = torch.cat((s, self.goal), dim=1)
        policy = self.worker(inpt)
        policy = self.soft(policy)
        choice = np.asscalar(self.choose(policy))
        self.iteration += 1
        return choice  # single env

    def choose(self, policies):
        m = Categorical(policies)
        action = m.sample()
        action = action.data.cpu().numpy()
        return action

    def saveModel(self):
        pass

    def store(self, s, a, r, sprime, aprime, done):
        if self.temp_second is not None:
            self.temp.append(Transition(s, a, r, 1 - done, sprime, None,
                                        self.temp_second.detach().numpy(),
                                        self.goal.detach().numpy()))
        if self.iteration % self.c == 1 and self.iteration != 1:
            # we push at 1 because get_action already incremented the counter
            self.temp = Transition(*zip(*self.temp))
            self.exp.push(self.temp)  # store the c-step segment into the replay buffer
            self.temp = []

    def reset(self):
        self.iteration = 0
        self.temp_first, self.temp_second = (None, None)
        self.prevState = None
        self.temp = []
        return

    def generateSamples(self, goals, states, next_states):
        next_states = next_states[:, :2]
        states = states[:, :2]
        candidates = (next_states - states).unsqueeze(0)
        candidates = torch.cat((candidates, goals.unsqueeze(0)), dim=0)
        normal = Normal(next_states - states, torch.ones(next_states.size()) / 2)
        sampled = normal.sample((8,))
        candidates = torch.cat((candidates, sampled), dim=0)
        # return shape: (# candidates, batch_size, dimensions of goal)
        return candidates

    def getTransitions(self, initial_goals, states, next_states):
        # initial_goals shape: (# candidates, batch_size, dimensions of goal)
        # states shape: (batch_size, c, dimensions of state)
        states = states[:, :, :2]
        next_states = next_states[:, :, :2]
        goals = [initial_goals.unsqueeze(0)]
        for c in range(self.c - 1):
            prev = goals[-1].squeeze(0)
            # broadcast; this takes the shape of initial_goals
            curr = states[:, c, :] + prev - next_states[:, c, :]
            goals.append(curr.unsqueeze(0))
        goals = torch.cat(goals, dim=0)
        goals = goals.transpose(0, 1)
        goals = goals.transpose(1, 2)
        # return shape: (# candidates, batch_size, c, dimensions of goal)
        return goals

    def getProbabilities(self, transitions, states, actions):
        # transitions shape: (# candidates, batch_size, c, dimensions of goal)
        # states shape: (batch_size, c, dimensions of state)
        # actions shape: (batch_size, c)
        states = states[:, :, :6]
        states = states.unsqueeze(0)
        size = states.size()
        states = states.expand(transitions.size()[0], size[1], size[2], size[3])
        inpt = torch.cat((states, transitions), dim=3)
        soft = nn.Softmax(dim=3)
        actions = actions.expand(transitions.size()[0], actions.size()[0], actions.size()[1]).unsqueeze(3)
        probs = soft(self.worker(inpt)).gather(3, actions.long()).squeeze(3)
        probs = torch.prod(probs, dim=2)
        # return shape: (# candidates, batch_size) of probabilities
        return probs

    def train(self):
        if len(self.exp) > 300:
            groups = self.exp.sample(self.step)  # sample groupings of c-step segments

            m_states = torch.cat([torch.Tensor(g.state[0]) for g in groups], dim=0)
            m_next_states = torch.cat([torch.Tensor(g.next_state[-1]) for g in groups], dim=0)
            m_goals = torch.cat([torch.Tensor(g.goal[0]) for g in groups], dim=0)
            m_rewards = torch.Tensor([sum(g.reward) for g in groups]).squeeze(2)
            m_masks = torch.Tensor([g.mask[-1] for g in groups]).unsqueeze(1)

            w_states = torch.cat([torch.Tensor(g.state).unsqueeze(0) for g in groups], dim=0).squeeze()
            w_next_states = torch.cat([torch.Tensor(g.next_state).unsqueeze(0) for g in groups], dim=0).squeeze()
            w_actions = torch.cat([torch.Tensor(g.action).unsqueeze(0) for g in groups], dim=0)

            # Off-policy goal relabeling: keep the candidate goal that maximizes the
            # probability of the worker actions that were actually taken
            candidates = self.generateSamples(m_goals, m_states, m_next_states)
            cand_transitions = self.getTransitions(candidates, w_states, w_next_states)
            probs = self.getProbabilities(cand_transitions, w_states, w_actions)
            cand_indices = probs.argmax(dim=0).unsqueeze(0).unsqueeze(2)
            cand_indices = cand_indices.expand(cand_indices.size()[0], cand_indices.size()[1], candidates.size()[2])
            m_goals = candidates.gather(0, cand_indices).squeeze()  # size: (batch_size, dimension of goals)

            states = []
            actions = []
            next_states = []
            masks = []
            goals = []
            next_goals = []
            for g in groups:
                states.append(torch.Tensor(g.state).squeeze()[:, :6])
                actions.append(torch.Tensor(g.action).squeeze())
                next_states.append(torch.Tensor(g.next_state).squeeze()[:, :6])
                masks.append(torch.Tensor(g.mask).squeeze())
                goals.append(torch.Tensor(g.goal).squeeze())
                next_goals.append(torch.Tensor(g.next_goal).squeeze())
            states = torch.cat(states, dim=0)
            actions = torch.cat(actions, dim=0).unsqueeze(1)
            next_states = torch.cat(next_states, dim=0)
            masks = torch.cat(masks, dim=0).unsqueeze(1)
            goals = torch.cat(goals, dim=0)
            next_goals = torch.cat(next_goals, dim=0)
            # intrinsic worker reward: negative distance to the relabeled goal
            rewards = -torch.norm(states[:, :2] + goals - next_states[:, :2], dim=1).unsqueeze(1)

            # Manager critic
            q = self.m_critic(torch.cat((m_states, m_goals), dim=1))
            m_next_actions = self.manager(m_next_states)
            q_tar = m_rewards + self.m_discount * self.m_critic_target(torch.cat((m_next_states, m_next_actions), dim=1))
            loss = self.m_critic.get_loss(q, q_tar.detach())
            self.m_critic.optimizer.zero_grad()
            loss.backward()
            self.m_critic.optimizer.step()

            # Manager actor
            new_actions = self.manager(m_states)
            q = self.m_critic(torch.cat((m_states, new_actions), dim=1))
            loss = -q.mean()
            self.manager.optimizer.zero_grad()
            loss.backward()
            self.manager.optimizer.step()

            # Worker critic
            q = self.w_critic(torch.cat((states, goals), dim=1)).gather(1, actions.long())
            next_actions = self.worker(torch.cat((next_states, next_goals), dim=1))
            next_actions = self.choose(self.soft(next_actions))
            q_tar = rewards + self.w_discount * masks * self.w_critic_target(
                torch.cat((next_states, next_goals), dim=1)).gather(1, torch.Tensor(next_actions).long().unsqueeze(1))
            loss = self.w_critic.get_loss(q, q_tar.detach())
            self.w_critic.optimizer.zero_grad()
            loss.backward()
            self.w_critic.optimizer.step()

            # Worker actor
            new_actions = self.worker(torch.cat((states[:, :6], goals), dim=1))
            policy = self.soft(new_actions)
            new_actions = self.choose(policy)
            q = self.w_critic(torch.cat((states, goals), dim=1))
            q = q.gather(1, torch.Tensor(new_actions).long().unsqueeze(1))
            loss = -q.mean()
            self.worker.optimizer.zero_grad()
            loss.backward()
            self.worker.optimizer.step()

            # Polyak updates of both target critics
            for target_param, param in zip(self.m_critic_target.parameters(), self.m_critic.parameters()):
                target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
            for target_param, param in zip(self.w_critic_target.parameters(), self.w_critic.parameters()):
                target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)

            # Push relabeled replay entries back into the buffer
            for i, goal in enumerate(m_goals):
                curr_group = groups[i]
                curr_goal = goal.unsqueeze(0).detach().numpy()
                inserts = (curr_goal,)
                for j in range(self.c - 1):
                    curr_goal = curr_group.state[j][:, :2].reshape(1, -1) + curr_goal - curr_group.next_state[j][:, :2].reshape(1, -1)
                    inserts = inserts + (curr_goal,)
                curr_group = curr_group._replace(goal=inserts)  # _replace returns a new namedtuple
                self.exp.push(curr_group)

            self.totalSteps += 1
            return loss
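
# --- Standalone sketch (not part of the HIRO class above) ---
# A minimal, self-contained illustration of the two ideas the class relies on:
# (1) the goal transition h(s, g, s') = s + g - s', which keeps the absolute
#     target fixed between manager decisions, and (2) scoring candidate goals
#     by the log-probability the worker assigns to the actions actually taken.
# `toy_worker`, the dimensions, and the candidate count are assumptions here,
# not values taken from the class above.
import torch
import torch.nn as nn


def goal_transition(s, g, s_next):
    # h(s, g, s') = s + g - s'
    return s + g - s_next


torch.manual_seed(0)
batch, c, state_dim, goal_dim, n_actions = 4, 5, 2, 2, 3
toy_worker = nn.Linear(state_dim + goal_dim, n_actions)

states = torch.randn(batch, c, state_dim)
next_states = torch.randn(batch, c, state_dim)
actions = torch.randint(0, n_actions, (batch, c))
candidates = torch.randn(10, batch, goal_dim)  # 10 candidate goals per segment

# Roll each candidate goal forward through the c-step segment and accumulate
# the log-probability of the observed worker actions under that goal.
log_probs = torch.zeros(10, batch)
for k in range(10):
    g = candidates[k]
    for t in range(c):
        logits = toy_worker(torch.cat((states[:, t], g), dim=1))
        logp = torch.log_softmax(logits, dim=1)
        log_probs[k] += logp.gather(1, actions[:, t:t + 1]).squeeze(1)
        g = goal_transition(states[:, t], g, next_states[:, t])

best = log_probs.argmax(dim=0)  # index of the relabeled goal for each segment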
class DoubleQ(Agent):
    def __init__(self, params, name, task, load_path=None):
        super(DoubleQ, self).__init__(params, name, task)
        self.dual = self.vPars['dual']
        if self.trainMode:
            if self.dual:
                self.tarNet = DualNetwork(self.vPars, self.vTrain)
                self.valueNet = DualNetwork(self.vPars, self.vTrain)
            else:
                self.tarNet = Network(self.vPars, self.vTrain)
                self.valueNet = Network(self.vPars, self.vTrain)
            for target_param, param in zip(self.tarNet.parameters(), self.valueNet.parameters()):
                target_param.data.copy_(param.data)
        else:
            self.valueNet = Network(self.vPars, self.vTrain)
            self.valueNet.load_state_dict(torch.load(load_path))

        self.out_n = self.vPars['neurons'][-1]
        self.replaceCounter = 0
        self.valueLoss = []
        self.avgLoss = 0
        self.expSize = self.vTrain['buffer']
        self.exp = Memory(size=self.expSize)

        self.beta = self.vPars['beta']
        self.priority = self.vTrain['priority']
        self.priorities = []
        self.alpha = .7
        self.double = self.vTrain['double']
        self.update_target_network = self.vTrain['update_target_network_every']
        self.noise = self.vTrain['noise'] if 'noise' in self.vTrain else 0

        task.initAgent(self)
        if not load_path:
            while not self.stop:
                x = 1 + 1
            task.postTraining()

    def saveModel(self):
        torch.save(self.valueNet.state_dict(),
                   '/home/jimmy/Documents/Research/AN_Bridging/results/hierarchical_q_policy2.txt')

    def store(self, s, a, r, sprime, aprime, done):
        self.exp.push(s, a, r, 1 - done, aprime, sprime)
        if len(self.priorities) < self.expSize:
            self.priorities.append(1)
        else:
            self.priorities = self.priorities[1:]
            self.priorities.append(1)

    def get_q(self, s):
        if type(self.valueNet) == list:
            model_index = np.random.randint(len(self.valueNet))
            net = self.valueNet[model_index]
        else:
            net = self.valueNet
        q = net(torch.FloatTensor(s))
        q = q.detach()
        return q

    def get_action(self, s, testing_time=False, probabilistic=False):
        i = np.random.random()
        if i < self.explore and self.trainMode and not testing_time:
            index = np.random.randint(self.out_n)
        else:
            q = self.get_q(s)
            if probabilistic:
                # Boltzmann (softmax) exploration over the Q-values
                q = q.numpy()
                q = q - np.max(q)
                probs = np.exp(q * self.beta)
                probs = probs / np.sum(probs)
                index = np.random.choice(q.size, p=probs.ravel())
            else:
                index = np.argmax(q.numpy())
        self.explore = max(.2, self.explore * .9997)
        return index

    def get_q_and_q_tar(self, states, actions, nextStates, rewards, masks):
        qValues = self.valueNet(torch.FloatTensor(states).squeeze(1))
        q = torch.gather(qValues, 1, torch.LongTensor(actions).unsqueeze(1))  # Q-values of the taken actions
        qnext = self.tarNet(torch.FloatTensor(nextStates))
        qnext = qnext.squeeze(1).detach()
        if self.double:
            # Double DQN: select the next action with the online net, evaluate it with the target net
            qNextDouble = self.valueNet(torch.FloatTensor(nextStates))
            qNextDouble = qNextDouble.squeeze(1).detach()
            qnext = torch.gather(qnext, 1, torch.LongTensor(qNextDouble.argmax(1).unsqueeze(1)))
            qtar = torch.FloatTensor(rewards).squeeze(1) + self.discount * torch.Tensor(masks).unsqueeze(1) * qnext
        else:
            qtar = torch.FloatTensor(rewards) + self.discount * torch.Tensor(masks).unsqueeze(1) * \
                   qnext.max(1)[0].view(self.batch_size, 1)
        return q, qtar

    def train(self, override=False):
        if len(self.exp) >= 500 or override:
            if self.priority:
                # Prioritized replay with importance-sampling weights
                weights = []
                errors = []
                assert len(self.priorities) == len(self.exp)
                for i in range(self.batch_size):
                    probs = np.array([math.pow(p, self.alpha) for p in self.priorities])
                    probs = probs / np.sum(probs)
                    choice = np.random.choice(len(self.priorities), p=probs, size=1)
                    weights.append(math.pow(len(self.priorities) * self.priorities[int(np.asscalar(choice))], -self.beta))
                    states, actions, rewards, masks, _, nextStates, _, _, _ = self.exp.get_transitions(choice)
                    q, qtar = self.get_q_and_q_tar(states, actions, nextStates, rewards, masks)
                    td = qtar - q
                    self.priorities[int(np.asscalar(choice))] = abs(td[:, 0])
                    errors.append(self.valueNet.get_loss(q, qtar))
                max_weight = max(weights)
                weights = [w / max_weight for w in weights]
                val_loss = sum([w * e for w, e in zip(weights, errors)])
            else:
                states, actions, rewards, masks, _, nextStates, _, _, _ = self.exp.sample(batch=self.batch_size)
                if self.replaceCounter % self.update_target_network == 0:
                    self.tarNet.load_state_dict(self.valueNet.state_dict())
                    self.replaceCounter = 0
                if self.noise:
                    states = np.array(states)
                    states = states + np.random.normal(0, self.noise, states.shape)
                q, qtar = self.get_q_and_q_tar(states, actions, nextStates, rewards, masks)
                val_loss = self.valueNet.get_loss(q, qtar)

            self.valueNet.optimizer.zero_grad()
            val_loss.backward()
            self.valueNet.optimizer.step()
            self.replaceCounter += 1
            self.totalSteps += 1
            return val_loss
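
# --- Standalone sketch (not part of the DoubleQ class above) ---
# A minimal illustration of the double-Q target built in get_q_and_q_tar:
# the online network selects argmax a', the target network evaluates it.
# `online` and `target` are toy stand-ins for self.valueNet / self.tarNet,
# with assumed dimensions.
import torch
import torch.nn as nn

torch.manual_seed(0)
batch, state_dim, n_actions, gamma = 8, 4, 3, 0.99
online = nn.Linear(state_dim, n_actions)
target = nn.Linear(state_dim, n_actions)

next_states = torch.randn(batch, state_dim)
rewards = torch.randn(batch, 1)
masks = torch.ones(batch, 1)  # 0 where the episode ended

with torch.no_grad():
    a_star = online(next_states).argmax(dim=1, keepdim=True)  # action selection
    q_next = target(next_states).gather(1, a_star)            # action evaluation
    q_tar = rewards + gamma * masks * q_next                  # bootstrap target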
class CounterContinuous(object):
    def __init__(self, params, name, task):
        self.name = name
        self.task = task
        self.vTrain = params['valTrain']
        self.vPars = params['valPars']
        self.aTrain = params['actTrain']
        self.aPars = params['actPars']
        self.agents = params['agents']

        self.pubs = {}
        for key in self.agents.keys():
            bot = self.agents[key]
            self.pubs[key] = rospy.Publisher(bot['pub'], Vector3, queue_size=1)
        rospy.Subscriber("/finished", Int8, self.receiveDone, queue_size=1)

        self.valueLoss = []
        self.actorLoss = []

        self.h_state_n = self.aPars['h_state_n']
        self.x_state_n = self.aPars['x_state_n']
        self.u_n = self.aPars['u_n']
        self.clip_grad_norm = self.aTrain['clip']
        self.homogenous = self.aPars['share_params']

        self.critic = Network(self.vPars, self.vTrain).to(device)
        self.target = Network(self.vPars, self.vTrain).to(device)
        if self.homogenous:
            self.actor = SoftNetwork(self.aPars, self.aTrain).to(device)
        else:
            self.actor = [SoftNetwork(self.aPars, self.aTrain) for i in range(len(self.agents))]
        for target_param, param in zip(self.target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.trainMode = self.vPars['trainMode']
        self.batch_size = self.vTrain['batch_size']
        self.discount = self.vTrain['gamma']
        self.range = self.aPars['mean_range']
        self.td_lambda = .8
        self.tau = .005
        self.lower_bound = self.aTrain['clamp'][2]
        self.stop = False
        self.trained = False

        self.exp = Memory()
        self.totalSteps = 0

        self.reset()
        task.initAgent(self)

        while not self.stop:
            x = 1 + 1

        task.postTraining()

    def receiveDone(self, message):
        if message.data == 1:  # all iterations are done; see manager.py
            self.stop = True
        if message.data == 2:  # timed out; see manager.py
            self.task.restartProtocol(restart=1)

    def get_action(self, s_true, s_split):
        if self.homogenous:
            a1, log_prob1, z, mu1, log_std1 = self.actor(torch.FloatTensor(s_split[0]))
            a2, log_prob2, z, mu2, log_std2 = self.actor(torch.FloatTensor(s_split[1]))
        else:
            # TODO: fix the recurrent per-agent branch below
            a1, h_new1, log_prob1, mu1, std1 = self.actor[0](torch.FloatTensor(s_split[0]), self.h[0])
            a2, h_new2, log_prob2, mu2, std2 = self.actor[1](torch.FloatTensor(s_split[1]), self.h[1])
        action = [a1.detach().numpy().ravel(), a2.detach().numpy().ravel()]
        return [a1, a2]

    def choose(self, policies):
        m = Categorical(policies)
        action = m.sample()
        action = action.data.cpu().numpy()
        return np.asscalar(action)

    def saveModel(self):
        pass

    def store(self, s, a, r, sprime, aprime, done, local, next_local):
        self.exp.push(s, a, r, 1 - done, aprime, sprime, local, next_local)

    def reset(self):
        curr = self.actor.clamp[0]
        if self.trained:
            new = max(self.lower_bound, .05 * self.lower_bound + .95 * curr)
            self.actor.clamp = (new, self.actor.clamp[1], self.lower_bound)
        self.trained = False
        return

    def get_grad_norm(self, model):
        total_norm = 0
        for p in model.parameters():
            if p.grad is None:
                continue
            param_norm = p.grad.data.norm(2)
            total_norm += param_norm.item() ** 2
        grad_norm = total_norm ** (1. / 2)
        return grad_norm

    def get_lambda_targets(self, rewards, mask, gamma, target_qs):
        # TD(lambda) targets computed backwards over the batch
        target_qs = target_qs.squeeze()
        ret = target_qs.new_zeros(*target_qs.shape)
        ret[-1] = target_qs[-1] * mask[-1]
        for t in range(ret.size()[0] - 2, -1, -1):
            ret[t] = self.td_lambda * gamma * ret[t + 1] + \
                mask[t] * (rewards[t] + (1 - self.td_lambda) * gamma * target_qs[t + 1])
        return ret.unsqueeze(1)

    def zipStack(self, data):
        data = zip(*data)
        data = [torch.stack(d).squeeze().to(device) for d in data]
        return data

    def monte_carlo(self, mean, std, n=500):
        # returns a tensor of n samples drawn from N(mean, std)
        normal = Normal(mean, std)
        return normal.sample((n,))

    def train(self, episode_done=False):
        if len(self.exp) >= 500:
            transition = self.exp.sample(self.batch_size)
            states = torch.squeeze(torch.Tensor(transition.state)).to(device)
            actions = self.zipStack(transition.action)
            rewards = torch.Tensor(transition.reward).to(device)
            states_next = torch.squeeze(torch.Tensor(transition.next_state)).to(device)
            masks = torch.Tensor(transition.mask).to(device)
            local = self.zipStack(transition.local)
            next_local = self.zipStack(transition.next_local)

            # Centralized critic update
            actions_next = []
            for s in next_local:
                a, log_prob, _, _, _ = self.actor(s)
                actions_next.append(a.detach())
            inp = torch.cat((states_next, actions_next[0], actions_next[1]), dim=1)
            q_tar = rewards.unsqueeze(1) + self.discount * masks.unsqueeze(1) * self.target(inp)
            inp = torch.cat((states, actions[0].detach(), actions[1].detach()), dim=1)
            q = self.critic(inp)
            loss = self.critic.get_loss(q, q_tar.detach())
            self.critic.optimizer.zero_grad()
            loss.backward()
            self.critic.optimizer.step()
            self.valueLoss.append(loss)

            actor_loss = 0
            actions = []
            means = []
            log_stds = []
            log_probs = []
            for s in local:
                a, log_prob, z, mu, log_std = self.actor(s)
                actions.append(a)
                means.append(mu)
                log_stds.append(log_std)
                log_probs.append(log_prob)

            # Train first agent: counterfactual baseline via Monte Carlo over its own action
            inp = torch.cat((states, actions[0], actions[1].detach()), dim=1)
            q_out = self.critic(inp)
            samples = self.monte_carlo(means[0], log_stds[0].exp())
            samples = self.range * torch.tanh(samples)
            repeat_s = states.unsqueeze(0)
            repeat_s = repeat_s.expand(samples.size()[0], repeat_s.size()[1], repeat_s.size()[2])
            repeat_a = actions[1].unsqueeze(0)
            repeat_a = repeat_a.expand(samples.size()[0], repeat_a.size()[1], repeat_a.size()[2])
            inp = torch.cat((repeat_s, samples, repeat_a), dim=2)
            baseline = self.critic(inp).mean(0)
            coma = (q_out - baseline).detach()
            actor_loss -= (log_probs[0].view(coma.size()) * coma).mean()

            # Train second agent
            inp = torch.cat((states, actions[0].detach(), actions[1]), dim=1)
            q_out = self.critic(inp)
            samples = self.monte_carlo(means[1], log_stds[1].exp())
            samples = self.range * torch.tanh(samples)
            repeat_a = actions[0].unsqueeze(0)
            repeat_a = repeat_a.expand(samples.size()[0], repeat_a.size()[1], repeat_a.size()[2])
            inp = torch.cat((repeat_s, repeat_a, samples), dim=2)
            baseline = self.critic(inp).mean(0)
            coma = (q_out - baseline).detach()
            actor_loss -= (log_probs[1].view(coma.size()) * coma).mean()

            if self.homogenous:
                self.actor.optimizer.zero_grad()
                actor_loss.backward()
                self.actor.optimizer.step()
            else:
                for actor in self.actor:
                    actor.optimizer.zero_grad()
                actor_loss.backward()
                for actor in self.actor:
                    torch.nn.utils.clip_grad_norm_(actor.parameters(), self.clip_grad_norm)
                    actor.optimizer.step()

            self.totalSteps += 1
            self.trained = True

            # Update target network
            if self.totalSteps % 50 == 0:
                for target_param, param in zip(self.target.parameters(), self.critic.parameters()):
                    target_param.data.copy_(param.data)
        return
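
# --- Standalone sketch (not part of the CounterContinuous class above) ---
# The counterfactual baseline used in train(): hold the other agent's action
# fixed and average the centralized critic over samples of the trained agent's
# own policy, then subtract that from Q(s, a_i, a_-i). `toy_critic` and all
# dimensions here are assumptions, not values from the class above.
import torch
import torch.nn as nn
from torch.distributions import Normal

torch.manual_seed(0)
batch, state_dim, act_dim, n_samples = 6, 4, 2, 500
toy_critic = nn.Linear(state_dim + 2 * act_dim, 1)

states = torch.randn(batch, state_dim)
a_self = torch.randn(batch, act_dim)   # sampled action of the agent being trained
a_other = torch.randn(batch, act_dim)  # other agent's action, treated as fixed
mean, std = torch.zeros(batch, act_dim), torch.ones(batch, act_dim)

q_taken = toy_critic(torch.cat((states, a_self, a_other), dim=1))

samples = Normal(mean, std).sample((n_samples,))                  # (n, batch, act_dim)
rep_s = states.unsqueeze(0).expand(n_samples, batch, state_dim)
rep_o = a_other.unsqueeze(0).expand(n_samples, batch, act_dim)
baseline = toy_critic(torch.cat((rep_s, samples, rep_o), dim=2)).mean(0)

advantage = (q_taken - baseline).detach()  # multiplied by log pi(a_self | s) in the actor loss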
class SAC(Agent):
    def __init__(self, params, name, task):
        super(SAC, self).__init__(params, name, task)
        self.aPars = params['actPars']
        self.aTrain = params['actTrain']
        self.qPars = params['qPars']
        self.qTrain = params['qTrain']

        if self.trainMode:
            self.QNet = Network(self.qPars, self.qTrain).to(device)
            self.VNet = Network(self.vPars, self.vTrain).to(device)
            self.VTar = Network(self.vPars, self.vTrain).to(device)
            self.policyNet = SoftNetwork(self.aPars, self.aTrain).to(device)
        else:
            print('Not implemented')

        for target_param, param in zip(self.VTar.parameters(), self.VNet.parameters()):
            target_param.data.copy_(param)

        self.expSize = self.vTrain['buffer']
        self.actions = self.aPars['neurons'][-1]
        self.state = self.aPars['neurons'][0]
        self.exp = ReplayBuffer(self.expSize, self.actions, np.float32, self.state, np.float32)

        task.initAgent(self)
        while not self.stop:
            x = 1 + 1
        task.postTraining()

    def load_nets(self):
        pass

    def saveModel(self):
        pass

    def get_action(self, s):
        action, _, _, _, _ = self.policyNet(torch.FloatTensor(s))
        action = np.ravel(action.detach().numpy())
        return action

    def send_to_device(self, s, a, r, next_s, d):
        s = torch.FloatTensor(s).to(device)
        a = torch.FloatTensor(a).to(device)
        r = torch.FloatTensor(r).unsqueeze(1).to(device)
        next_s = torch.FloatTensor(next_s).to(device)
        d = torch.FloatTensor(np.float32(d)).unsqueeze(1).to(device)
        return s, a, r, next_s, d

    def train(self):
        if len(self.exp) > 750:
            s, a, r, next_s, d = self.exp.sample_batch(self.batch_size)
            s, a, r, next_s, d = self.send_to_device(s, a, r, next_s, d)

            q = self.QNet(torch.cat([s, a], dim=1))
            v = self.VNet(s)
            new_a, log_prob, z, mean, log_std = self.policyNet(s)

            # Soft Q target: r + gamma * (1 - d) * V_target(s')
            target_v = self.VTar(next_s)
            next_q = r + (1 - d) * self.discount * target_v
            q_loss = self.QNet.get_loss(q, next_q.detach())

            # Value target: Q(s, a_new) - alpha * log pi(a_new | s)
            new_q = self.QNet(torch.cat([s, new_a], dim=1))
            next_v = new_q - log_prob * self.alpha
            v_loss = self.VNet.get_loss(v, next_v.detach())

            # Policy loss plus small regularizers on the mean and log-std
            target = new_q - v
            actor_loss = (log_prob * (log_prob * self.alpha - target).detach()).mean()
            mean_loss = 1e-3 * mean.pow(2).mean()
            std_loss = 1e-3 * log_std.pow(2).mean()
            actor_loss += mean_loss + std_loss

            self.VNet.optimizer.zero_grad()
            v_loss.backward()
            self.VNet.optimizer.step()

            self.QNet.optimizer.zero_grad()
            q_loss.backward()
            self.QNet.optimizer.step()

            self.policyNet.optimizer.zero_grad()
            actor_loss.backward()
            self.policyNet.optimizer.step()

            # Polyak averaging of the target value network
            for target_param, param in zip(self.VTar.parameters(), self.VNet.parameters()):
                target_param.data.copy_(target_param.data * (1.0 - 5e-3) + param.data * 5e-3)

            self.totalSteps += 1
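
# --- Standalone sketch (not part of the SAC class above) ---
# The three targets computed in train(), written out with toy tensors: the soft
# Q target, the value target, and the policy objective. `alpha` is the entropy
# temperature; the networks, shapes, and constants here are assumptions.
import torch
import torch.nn as nn

torch.manual_seed(0)
batch, state_dim, act_dim, gamma, alpha = 8, 4, 2, 0.99, 0.2
q_net = nn.Linear(state_dim + act_dim, 1)
v_net = nn.Linear(state_dim, 1)
v_tar = nn.Linear(state_dim, 1)

s = torch.randn(batch, state_dim)
a = torch.randn(batch, act_dim)
r = torch.randn(batch, 1)
s_next = torch.randn(batch, state_dim)
done = torch.zeros(batch, 1)
new_a = torch.randn(batch, act_dim)  # action re-sampled from the policy
log_prob = torch.randn(batch, 1)     # its log-probability

q_target = r + (1 - done) * gamma * v_tar(s_next).detach()  # regression target for Q
new_q = q_net(torch.cat([s, new_a], dim=1))
v_target = (new_q - alpha * log_prob).detach()              # regression target for V
policy_loss = (log_prob * (alpha * log_prob - (new_q - v_net(s))).detach()).mean()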
class Counter(object):
    def __init__(self, params, name, task):
        self.name = name
        self.task = task
        self.vTrain = params['valTrain']
        self.vPars = params['valPars']
        self.aTrain = params['actTrain']
        self.aPars = params['actPars']
        self.agents = params['agents']

        self.pubs = OrderedDict()
        for key in self.agents.keys():
            bot = self.agents[key]
            self.pubs[key] = rospy.Publisher(bot['pub'], Vector3, queue_size=1)
        rospy.Subscriber("/finished", Int8, self.receiveDone, queue_size=1)

        self.valueLoss = []
        self.actorLoss = []

        self.h_state_n = self.aPars['h_state_n']
        self.x_state_n = self.aPars['x_state_n']
        self.u_n = self.aPars['u_n']
        self.clip_grad_norm = self.aTrain['clip']
        self.homogenous = self.aPars['share_params']

        self.critic = Network(self.vPars, self.vTrain).to(device)
        self.target = Network(self.vPars, self.vTrain).to(device)
        if self.homogenous:
            self.actor = CounterActor(self.aPars, self.aTrain).to(device)
        else:
            self.actor = [CounterActor(self.aPars, self.aTrain) for i in range(len(self.agents))]
        for target_param, param in zip(self.target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.trainMode = self.vPars['trainMode']
        self.batch_size = self.vTrain['batch']
        self.discount = self.vTrain['gamma']
        self.temp_second = None
        self.temp_first = None
        self.td_lambda = 0  # TEST: set to 0 because we are replaying experience off-policy
        self.tau = .01
        self.stop = False
        self.trained = False

        self.exp = Memory()
        self.totalSteps = 0

        self.reset()
        task.initAgent(self)

        while not self.stop:
            x = 1 + 1

        task.postTraining()

    def receiveDone(self, message):
        if message.data == 1:  # all iterations are done; see manager.py
            self.stop = True
        if message.data == 2:  # timed out; see manager.py
            self.task.restartProtocol(restart=1)

    def get_action(self, s_true, s_split):
        if self.homogenous:
            policy1 = self.actor(torch.FloatTensor(s_split[0]))
            a1 = np.asscalar(self.choose(policy1))
            policy2 = self.actor(torch.FloatTensor(s_split[1]))
            a2 = np.asscalar(self.choose(policy2))
        else:
            policy1 = self.actor[0](torch.FloatTensor(s_split[0]))
            a1 = self.choose(policy1)
            policy2 = self.actor[1](torch.FloatTensor(s_split[1]))
            a2 = self.choose(policy2)
        a1 = 0  # THIS IS A TEST: pin the first agent's action
        return [a1, a2]

    def choose(self, policies):
        m = Categorical(policies)
        action = m.sample()
        action = action.data.cpu().numpy()
        return action

    def saveModel(self):
        pass

    def store(self, s, a, r, sprime, aprime, done, local, next_local):
        self.exp.push(s, a, r, 1 - done, aprime, sprime, local, next_local, None)

    def reset(self):
        self.train(True)
        if self.trained:
            self.actor.eps = max(.05, self.actor.eps - .003)
            self.trained = False
        self.temp_first, self.temp_second = (None, None)
        self.h = [torch.zeros((1, 1, self.h_state_n)) for i in range(len(self.agents))]
        self.prevAction = [-1, -1]
        return

    def zipStack(self, data):
        data = zip(*data)
        data = [torch.stack(d).squeeze().to(device) for d in data]
        return data

    def get_lambda_targets(self, rewards, mask, gamma, target_qs):
        # TD(lambda) targets computed backwards over the batch
        target_qs = target_qs.squeeze()
        ret = target_qs.new_zeros(*target_qs.shape)
        ret[-1] = rewards[-1] + target_qs[-1] * mask[-1]
        for t in range(ret.size()[0] - 2, -1, -1):
            ret[t] = mask[t] * (self.td_lambda * gamma * ret[t + 1]) + \
                (rewards[t] + (1 - self.td_lambda) * gamma * target_qs[t] * mask[t])
        return ret.unsqueeze(1)

    def train(self, episode_done=False):
        if len(self.exp) > self.batch_size:
            transition = self.exp.sample(self.batch_size)
            states = torch.squeeze(torch.Tensor(transition.state)).to(device)
            states_next = torch.squeeze(torch.Tensor(transition.next_state)).to(device)
            actions = torch.Tensor(transition.action).float().to(device)
            rewards = torch.Tensor(transition.reward).to(device)
            masks = torch.Tensor(transition.mask).to(device)
            local = self.zipStack(transition.local)
            next_local = self.zipStack(transition.next_local)

            actions_next = []
            for s in next_local:
                next_policy = self.actor(s)
                next_action = self.choose(next_policy)
                actions_next.append(torch.Tensor(next_action))

            '''# Critic update for the first agent (currently disabled)
            ID = torch.Tensor(states.size()[0], 1).fill_(-1)
            inp = torch.cat((states_next, actions_next[1].unsqueeze(1), ID), dim=1)
            q_tar = self.target(inp).detach().gather(1, actions_next[0].long().unsqueeze(1))
            q_tar = self.get_lambda_targets(rewards.squeeze(), masks.squeeze(), self.discount, q_tar)
            inp = torch.cat((states, actions[:, 1].unsqueeze(1), ID), dim=1)
            q = self.critic(inp)
            q = q.gather(1, actions[:, 0].long().unsqueeze(1))
            loss = self.critic.get_loss(q, q_tar)
            self.critic.optimizer.zero_grad()
            loss.backward()
            self.critic.optimizer.step()'''

            # Critic update for the second agent
            ID = torch.Tensor(states.size()[0], 1).fill_(1)
            inp = torch.cat((states_next, actions_next[0].unsqueeze(1), ID), dim=1)
            q_tar = self.target(inp).detach().gather(1, actions_next[1].long().unsqueeze(1))  # .max(1)?
            q_tar = self.get_lambda_targets(rewards.squeeze(), masks.squeeze(), self.discount, q_tar)
            inp = torch.cat((states, actions[:, 0].unsqueeze(1), ID), dim=1)
            q = self.critic(inp)
            q = q.gather(1, actions[:, 1].long().unsqueeze(1))
            loss = self.critic.get_loss(q, q_tar)
            self.critic.optimizer.zero_grad()
            loss.backward()
            self.critic.optimizer.step()

            actor_loss = 0

            # Actor update; consider using new_actions
            policies = []
            new_actions = []
            for s in local:
                policy = self.actor(s)
                policies.append(policy)
                new_action = self.choose(policy)
                new_actions.append(torch.Tensor(new_action))

            '''# COMA update for the first agent (currently disabled)
            ID = torch.Tensor(states.size()[0], 1).fill_(-1)
            inp = torch.cat((states, new_actions[1].unsqueeze(1), ID), dim=1)
            q_out = self.critic(inp)  # batch x num_actions
            policy = policies[0]      # batch x num_actions
            mult = q_out * policy
            baseline = torch.sum(mult, 1).unsqueeze(1)
            q_taken = q_out.gather(1, new_actions[0].long().unsqueeze(1))
            coma = (q_taken - baseline).detach()
            probs_taken = policy.gather(1, new_actions[0].long().unsqueeze(1))
            loss = -(torch.log(probs_taken) * coma).mean()
            actor_loss += loss'''

            # COMA update for the second agent
            ID = torch.Tensor(states.size()[0], 1).fill_(1)
            inp = torch.cat((states, new_actions[0].unsqueeze(1), ID), dim=1)
            q_out = self.critic(inp)  # batch x num_actions
            policy = policies[1]      # batch x num_actions
            mult = q_out * policy
            baseline = torch.sum(mult, 1).unsqueeze(1)
            q_taken = q_out.gather(1, new_actions[1].long().unsqueeze(1))
            coma = (q_taken - baseline).detach()
            probs_taken = policy.gather(1, new_actions[1].long().unsqueeze(1))
            loss = -(torch.log(probs_taken) * coma).mean()
            actor_loss += loss
            self.actorLoss.append(actor_loss)

            if self.homogenous:
                self.actor.optimizer.zero_grad()
                actor_loss.backward()
                nn.utils.clip_grad_norm_(self.actor.parameters(), 1)
                self.actor.optimizer.step()
            else:
                for actor in self.actor:
                    actor.optimizer.zero_grad()
                actor_loss.backward()
                for actor in self.actor:
                    actor.optimizer.step()

            self.totalSteps += 1
            self.trained = True

            # Update target network
            if self.totalSteps % 1 == 0:  # THIS IS A TEST
                for target_param, param in zip(self.target.parameters(), self.critic.parameters()):
                    target_param.data.copy_((1 - self.tau) * target_param + self.tau * param.data)
        return
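
# --- Standalone sketch (not part of the Counter class above) ---
# The backward TD(lambda) recursion mirrored from get_lambda_targets, written
# for a single sequence of rewards, masks, and bootstrapped target Q-values.
# With td_lambda = 0 (as set in the class above) it collapses to a one-step
# target. The example values are arbitrary.
import torch


def lambda_targets(rewards, masks, gamma, target_qs, td_lambda):
    ret = torch.zeros_like(target_qs)
    ret[-1] = rewards[-1] + target_qs[-1] * masks[-1]
    for t in range(ret.size(0) - 2, -1, -1):
        ret[t] = masks[t] * td_lambda * gamma * ret[t + 1] + \
            (rewards[t] + (1 - td_lambda) * gamma * target_qs[t] * masks[t])
    return ret


rewards = torch.tensor([1.0, 0.0, 0.5, 1.0])
masks = torch.tensor([1.0, 1.0, 1.0, 0.0])   # last step terminates the episode
target_qs = torch.tensor([0.8, 0.6, 0.9, 0.3])
print(lambda_targets(rewards, masks, 0.99, target_qs, td_lambda=0.0))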
class DoubleQ(Agent):
    def __init__(self, params, name, task, load_path=None):
        super(DoubleQ, self).__init__(params, name, task)
        self.dual = self.vPars['dual']
        if self.trainMode:
            if self.dual:
                self.tarNet = DualNetwork(self.vPars, self.vTrain)
                self.valueNet = DualNetwork(self.vPars, self.vTrain)
            else:
                self.tarNet = Network(self.vPars, self.vTrain)
                self.valueNet = Network(self.vPars, self.vTrain)
            for target_param, param in zip(self.tarNet.parameters(), self.valueNet.parameters()):
                target_param.data.copy_(param.data)
        else:
            self.valueNet = Network(self.vPars, self.vTrain)
            paths = [
                '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/push_in_hole.txt',
                '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/push_in_hole2.txt'
            ]
            if not load_path:
                # load an ensemble of saved policies and sample among them at action time
                self.valueNet = []
                for path in paths:
                    self.valueNet.append(Network(self.vPars, self.vTrain))
                    self.valueNet[-1].load_state_dict(torch.load(path))
            else:
                self.valueNet.load_state_dict(torch.load(load_path))

        self.out_n = self.vPars['neurons'][-1]
        self.replaceCounter = 0
        self.valueLoss = []
        self.avgLoss = 0
        self.expSize = self.vTrain['buffer']
        self.exp = Memory(size=self.expSize)
        self.double = self.vTrain['double']

        task.initAgent(self)
        if not load_path:
            while not self.stop:
                x = 1 + 1
            task.postTraining()

    def saveModel(self):
        torch.save(self.valueNet.state_dict(),
                   '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/box_push_hierarchical_q_policy.txt')

    def store(self, s, a, r, sprime, aprime, done):
        self.exp.push(s, a, r, 1 - done, aprime, sprime)

    def get_action(self, s):
        i = np.random.random()
        if i < self.explore and self.trainMode:
            index = np.random.randint(self.out_n)
        else:
            if type(self.valueNet) == list:
                model_index = np.random.randint(len(self.valueNet))
                net = self.valueNet[model_index]
            else:
                net = self.valueNet
            q = net(torch.FloatTensor(s))
            q = q.detach()
            index = np.argmax(q.numpy())
        self.explore = max(.1, self.explore * .9997)
        return index

    def train(self):
        if len(self.exp) >= 500:
            states, actions, rewards, masks, _, nextStates, _, _, _ = self.exp.sample(batch=self.batch_size)
            if self.replaceCounter % 500 == 0:  # set to 200 for the box-push task; 500 for the slope task
                self.tarNet.load_state_dict(self.valueNet.state_dict())
                self.replaceCounter = 0

            qValues = self.valueNet(torch.FloatTensor(states).squeeze(1))
            q = torch.gather(qValues, 1, torch.LongTensor(actions).unsqueeze(1))  # Q-values of the taken actions
            qnext = self.tarNet(torch.FloatTensor(nextStates))
            qnext = qnext.squeeze(1).detach()
            if self.double:
                # Double DQN: select with the online net, evaluate with the target net
                qNextDouble = self.valueNet(torch.FloatTensor(nextStates))
                qNextDouble = qNextDouble.squeeze(1).detach()
                qnext = torch.gather(qnext, 1, torch.LongTensor(qNextDouble.argmax(1).unsqueeze(1)))
                qtar = torch.FloatTensor(rewards).squeeze(1) + self.discount * torch.Tensor(masks).unsqueeze(1) * qnext
            else:
                qtar = torch.FloatTensor(rewards) + self.discount * torch.Tensor(masks).unsqueeze(1) * \
                       qnext.max(1)[0].view(self.batch_size, 1)

            val_loss = self.valueNet.get_loss(q, qtar)
            self.valueNet.optimizer.zero_grad()
            val_loss.backward()
            self.valueNet.optimizer.step()
            self.replaceCounter += 1
            self.totalSteps += 1
class Twin_DDPG(Agent):
    def __init__(self, params, name, task):
        super(Twin_DDPG, self).__init__(params, name, task)
        self.aPars = params['actPars']
        self.aTrain = params['actTrain']

        if self.trainMode:
            self.values = [Network(self.vPars, self.vTrain), Network(self.vPars, self.vTrain)]
            self.policyNet = TD3Network(self.aPars, self.aTrain)
            self.tarPolicy = TD3Network(self.aPars, self.aTrain)
            if self.load:
                self.load_nets()
            self.tarPolicy.load_state_dict(self.policyNet.state_dict())
            self.tar = [Network(self.vPars, self.vTrain), Network(self.vPars, self.vTrain)]
            for i in range(len(self.values)):
                self.tar[i].load_state_dict(self.values[i].state_dict())
        else:
            self.policyNet = Network(self.aPars, self.aTrain)
            self.policyNet.load_state_dict(torch.load(
                "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TD3_goal_policy2.txt"))

        self.base = self.vTrain['baseExplore']
        self.step = self.vTrain['decay']
        self.expSize = self.vTrain['buffer']
        self.exp = Replay(self.expSize)
        self.a = self.vTrain['a']
        self.tau = self.vPars['tau']
        self.smooth = self.vTrain['smooth']
        self.clip = self.vTrain['clip']
        self.delay = self.vTrain['policy_delay']
        self.mean_range = self.aPars['mean_range']
        self.noise = OUNoise(self.out_n, mu=0, theta=.15, max_sigma=self.explore,
                             min_sigma=self.base, decay=self.step)

        self.valueLoss = []
        self.actorLoss = []
        self.avgLoss = 0
        self.avgActLoss = 0

        task.initAgent(self)
        while not self.stop:
            x = 1 + 1
        task.postTraining()

    def load_nets(self):
        path = "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TD3_goal3_"
        self.policyNet.load_state_dict(torch.load(path + "policy.txt"))
        self.values[0].load_state_dict(torch.load(path + "Qvalue1.txt"))
        self.values[1].load_state_dict(torch.load(path + "Qvalue2.txt"))

    def saveModel(self):
        path = "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TD3_goal3_"
        torch.save(self.policyNet.state_dict(), path + "policy.txt")
        torch.save(self.values[0].state_dict(), path + "Qvalue1.txt")
        torch.save(self.values[1].state_dict(), path + "Qvalue2.txt")
        print("Network saved")

    def get_action(self, s):
        output = self.policyNet(torch.FloatTensor(s))
        i = np.random.random()
        if i < self.explore[0]:  # add exploration noise; TODO: switch to OU noise
            noise = torch.from_numpy(np.random.normal(0, self.explore[1], 2))
            output = output + noise
            output = output.float()
        return output[0]

    def train(self):
        if self.dataSize > 500 and self.trainMode:
            # iteration updates
            self.trainIt += 1
            self.totalSteps += 1

            # Unpack the buffer and sample a minibatch
            s, a, r, n_s, n_a, done = self.exp.get_data()
            c = np.random.choice(min(self.dataSize, self.expSize), self.batch_size)
            s = torch.FloatTensor(s[c])
            a = torch.FloatTensor(a[c])
            r = torch.FloatTensor(r[c])
            n_s = torch.FloatTensor(n_s[c])
            done = torch.FloatTensor(done[c])

            # Target policy smoothing: perturb the target action with clipped noise
            n_a = self.tarPolicy(n_s).detach()
            noise = torch.FloatTensor(np.random.normal(0, self.smooth, n_a.shape))
            n_a = n_a + torch.clamp(noise, -self.clip, self.clip)
            n_sa = torch.cat((n_s, n_a), dim=1)
            # Clipped double-Q target: bootstrap from the minimum of the two target critics
            qtar = r + self.discount * (1 - done) * torch.min(self.tar[0](n_sa).detach(),
                                                              self.tar[1](n_sa).detach())

            # Value update
            sa = torch.cat((s, a), dim=1)
            for qnet in self.values:
                q = qnet(sa)
                loss = qnet.loss_fnc(q, qtar)
                qnet.optimizer.zero_grad()
                loss.backward()
                qnet.optimizer.step()
                qnet.scheduler.step()
                self.avgLoss += loss / len(self.values)

            # Delayed policy update
            if self.trainIt % self.delay == 0:
                act = self.policyNet(s)
                s_a = torch.cat((s, act), 1)
                q = self.values[0](s_a)
                policy_loss = -q.mean()

                self.policyNet.optimizer.zero_grad()
                policy_loss.backward()
                self.policyNet.optimizer.step()
                self.policyNet.scheduler.step()
                self.avgActLoss += policy_loss

                # Polyak updates of the target actor and critics
                for target_param, param in zip(self.tarPolicy.parameters(), self.policyNet.parameters()):
                    target_param.data.copy_(self.tau * param.data + (1.0 - self.tau) * target_param.data)
                for i in range(len(self.values)):
                    for target_param, param in zip(self.tar[i].parameters(), self.values[i].parameters()):
                        target_param.data.copy_(self.tau * param.data + (1.0 - self.tau) * target_param.data)
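
# --- Standalone sketch (not part of the Twin_DDPG class above) ---
# The TD3 target used in train(): smooth the target action with clipped noise,
# then bootstrap from the minimum of the two target critics. The networks and
# constants here are toy stand-ins with assumed sizes.
import torch
import torch.nn as nn

torch.manual_seed(0)
batch, state_dim, act_dim = 8, 6, 2
gamma, smooth_sigma, noise_clip = 0.99, 0.2, 0.5
tar_policy = nn.Linear(state_dim, act_dim)
tar_q1 = nn.Linear(state_dim + act_dim, 1)
tar_q2 = nn.Linear(state_dim + act_dim, 1)

n_s = torch.randn(batch, state_dim)
r = torch.randn(batch, 1)
done = torch.zeros(batch, 1)

with torch.no_grad():
    n_a = tar_policy(n_s)
    noise = torch.clamp(smooth_sigma * torch.randn_like(n_a), -noise_clip, noise_clip)
    n_sa = torch.cat((n_s, n_a + noise), dim=1)
    q_tar = r + gamma * (1 - done) * torch.min(tar_q1(n_sa), tar_q2(n_sa))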
class TRPOAgent(Agent):
    def __init__(self, params, name, task):
        super(TRPOAgent, self).__init__(params, name, task)
        self.valueNet = Network(self.vPars, self.vTrain)
        self.policyNet = Network(params['actPars'], params['actTrain'])

        self.running_state = ZFilter((self.vPars['in_n'],), clip=5)
        self.running_reward = ZFilter((1,), demean=False, clip=10)

        self.experience = Memory()
        self.valueLoss = []
        self.actorLoss = []
        self.avgLoss = 0

        task.initAgent(self)
        while not self.stop:
            x = 1 + 1
        task.postTraining()

    def saveModel(self):
        torch.save(self.valueNet.state_dict(),
                   "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TRPOCritic.txt")
        torch.save(self.policyNet.state_dict(),
                   "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TRPOPolicy.txt")
        print("Network saved")

    def train(self):
        batch = self.experience.sample()
        self.update_params(batch)

    def store(self, prevS, prevA, r, s, a, failure):
        mask = 0 if failure == 1 else 1
        self.experience.push(prevS, prevA, mask, s, r)

    def update_params(self, batch):
        rewards = torch.Tensor(batch.reward)
        masks = torch.Tensor(batch.mask)
        actions = torch.Tensor(np.concatenate(batch.action, 0))
        states = torch.Tensor(batch.state)
        values = self.valueNet(Variable(states))

        # Generalized advantage estimation, computed backwards over the batch
        returns = torch.Tensor(actions.size(0), 1)
        deltas = torch.Tensor(actions.size(0), 1)
        advantages = torch.Tensor(actions.size(0), 1)

        prev_return = 0
        prev_value = 0
        prev_advantage = 0
        for i in reversed(range(rewards.size(0))):
            returns[i] = rewards[i] + self.discount * prev_return * masks[i]
            deltas[i] = rewards[i] + self.discount * prev_value * masks[i] - values.data[i]
            advantages[i] = deltas[i] + self.discount * tau * prev_advantage * masks[i]
            prev_return = returns[i, 0]
            prev_value = values.data[i, 0]
            prev_advantage = advantages[i, 0]

        targets = Variable(returns)

        # The original TRPO code uses the same L-BFGS routine to optimize the value loss
        def get_value_loss(flat_params):
            set_flat_params_to(self.valueNet, torch.Tensor(flat_params))
            for param in self.valueNet.parameters():
                if param.grad is not None:
                    param.grad.data.fill_(0)
            values_ = self.valueNet(Variable(states))
            value_loss = (values_ - targets).pow(2).mean()
            # weight decay
            for param in self.valueNet.parameters():
                value_loss += param.pow(2).sum() * l2Reg
            value_loss.backward()
            return (value_loss.data.double().numpy(),
                    get_flat_grad_from(self.valueNet).data.double().numpy())

        flat_params, _, opt_info = scipy.optimize.fmin_l_bfgs_b(
            get_value_loss, get_flat_params_from(self.valueNet).double().numpy(), maxiter=25)
        set_flat_params_to(self.valueNet, torch.Tensor(flat_params))

        advantages = (advantages - advantages.mean()) / advantages.std()

        output = self.policyNet(Variable(states)).view(-1, self.u_n * 2)
        action_means = output.narrow(1, 0, self.u_n)
        action_log_stds = output.narrow(1, self.u_n, self.u_n)
        action_stds = torch.exp(action_log_stds)
        fixed_log_prob = normal_log_density(Variable(actions), action_means,
                                            action_log_stds, action_stds).data.clone()

        def get_loss(volatile=False):
            if volatile:
                with torch.no_grad():
                    output = self.policyNet(Variable(states))
            else:
                output = self.policyNet(Variable(states))
            output = output.view(-1, self.u_n * 2)
            action_means = output.narrow(1, 0, self.u_n)
            action_log_stds = output.narrow(1, self.u_n, self.u_n)
            action_stds = torch.exp(action_log_stds)
            log_prob = normal_log_density(Variable(actions), action_means,
                                          action_log_stds, action_stds)
            # surrogate loss: importance-weighted advantage
            action_loss = -Variable(advantages) * torch.exp(log_prob - Variable(fixed_log_prob))
            return action_loss.mean()

        def get_kl():
            output = self.policyNet(Variable(states))
            output = output.view(-1, self.u_n * 2)
            mean1 = output.narrow(1, 0, self.u_n)
            log_std1 = output.narrow(1, self.u_n, self.u_n)
            std1 = torch.exp(log_std1)

            mean0 = Variable(mean1.data)
            log_std0 = Variable(log_std1.data)
            std0 = Variable(std1.data)
            # KL divergence between the old (detached) and new Gaussian policies
            kl = log_std1 - log_std0 + (std0.pow(2) + (mean0 - mean1).pow(2)) / (2.0 * std1.pow(2)) - 0.5
            return kl.sum(1, keepdim=True)

        loss = trpo_step(self.policyNet, get_loss, get_kl, maxKL, damping)
        self.avgLoss += loss
        self.trainIt += 1
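
# --- Standalone sketch (not part of the TRPOAgent class above) ---
# The backward return / advantage recursion from update_params, isolated as a
# function. `gamma` is the discount and `tau` the GAE coefficient (in the class
# above, `tau`, like `maxKL`, `damping`, and `l2Reg`, appears to come from
# module-level constants). The example values are arbitrary.
import torch


def gae(rewards, masks, values, gamma, tau):
    returns = torch.zeros_like(rewards)
    advantages = torch.zeros_like(rewards)
    prev_return = prev_value = prev_advantage = 0.0
    for i in reversed(range(rewards.size(0))):
        returns[i] = rewards[i] + gamma * prev_return * masks[i]
        delta = rewards[i] + gamma * prev_value * masks[i] - values[i]
        advantages[i] = delta + gamma * tau * prev_advantage * masks[i]
        prev_return, prev_value, prev_advantage = returns[i], values[i], advantages[i]
    return returns, advantages


rewards = torch.tensor([1.0, 0.0, 0.5])
masks = torch.tensor([1.0, 1.0, 0.0])
values = torch.tensor([0.7, 0.4, 0.2])
returns, advantages = gae(rewards, masks, values, gamma=0.99, tau=0.95)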