def build(self):
    self.policy_net1 = DQN2D(84, 84, self.pars).to(self.device)
    self.target_net1 = DQN2D(84, 84, self.pars).to(self.device)
    self.target_net1.load_state_dict(self.policy_net1.state_dict())
    self.target_net1.eval()

    self.policy_net2 = DQN2D(84, 84, self.pars).to(self.device)
    self.target_net2 = DQN2D(84, 84, self.pars).to(self.device)
    self.target_net2.load_state_dict(self.policy_net2.state_dict())
    self.target_net2.eval()

    self.optimizer1 = optim.SGD(self.policy_net1.parameters(),
                                lr=self.pars['lr'],
                                momentum=self.pars['momentum'])
    self.optimizer2 = optim.SGD(self.policy_net2.parameters(),
                                lr=self.pars['lr'],
                                momentum=self.pars['momentum'])
    # self.optimizer1 = optim.Adam(self.policy_net1.parameters())
    # self.optimizer2 = optim.Adam(self.policy_net2.parameters())

    self.memory2 = ReplayMemory(10000)
    self.memory1 = ReplayMemory(10000)
    if self.pars['ppe'] == '1':
        self.memory1 = Memory(10000)
        self.memory2 = Memory(10000)
def build(self):
    self.policy_net = DQN2D(84, 84, self.pars, rec=self.pars['rec'] == 1).to(self.device)
    self.q_net = DQN2D(84, 84, self.pars).to(self.device)
    self.target_net = DQN2D(84, 84, self.pars).to(self.device)
    self.target_net.load_state_dict(self.q_net.state_dict())
    self.target_net.eval()

    if self.pars['momentum'] > 0:
        self.optimizer = optim.SGD(self.q_net.parameters(),
                                   lr=self.pars['lr'],
                                   momentum=self.pars['momentum'])
        self.policy_optimizer = optim.SGD(self.policy_net.parameters(),
                                          lr=self.pars['lr'],
                                          momentum=self.pars['momentum'])
    else:
        self.optimizer = optim.Adam(self.q_net.parameters())
        self.policy_optimizer = optim.Adam(self.policy_net.parameters())

    self.memory = ReplayMemory(10000)
    if self.pars['ppe'] == '1':
        self.memory = Memory(10000)
    self.eps_threshold = 0.01
def collect_batch(env: gym.Env, actor: torch.nn.Module, buffer: Memory,
                  batch_size: int, device: torch.device):
    # keep rolling out full episodes until the buffer holds at least batch_size timesteps
    while len(buffer) < batch_size:
        obs = env.reset()
        done = False
        obs = torch.tensor(obs, dtype=torch.float32, device=device)
        prev_idx = buffer.add_obs(obs)
        while not done:
            obs = torch.unsqueeze(obs, dim=0)
            action, action_logprobs = actor.act(obs)
            action = action.cpu().numpy()[0]
            obs, rew, done, _ = env.step(action)
            obs = torch.tensor(obs, dtype=torch.float32, device=device)
            next_idx = buffer.add_obs(obs)
            buffer.add_timestep(prev_idx, next_idx, action, action_logprobs,
                                rew, done)
            prev_idx = next_idx
        buffer.end_rollout()
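# A minimal sketch (an assumption, not the repo's actual Memory class) of the
# rollout-buffer interface that collect_batch() relies on above: add_obs() stores
# an observation and returns its index, add_timestep() links two observation
# indices with the action, log-probability, reward and done flag, end_rollout()
# marks an episode boundary, and len(buffer) reports the timesteps collected.
class RolloutBufferSketch:
    def __init__(self):
        self.observations = []   # tensors added via add_obs
        self.timesteps = []      # (prev_idx, next_idx, action, logprob, reward, done)
        self.rollout_ends = []   # indices into timesteps where episodes ended

    def add_obs(self, obs):
        self.observations.append(obs)
        return len(self.observations) - 1

    def add_timestep(self, prev_idx, next_idx, action, action_logprobs, rew, done):
        self.timesteps.append((prev_idx, next_idx, action, action_logprobs, rew, done))

    def end_rollout(self):
        self.rollout_ends.append(len(self.timesteps))

    def __len__(self):
        return len(self.timesteps)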
class Agent:
    def __init__(self, name, pars, nrenvs=1, job=None, experiment=None):
        self.job = job
        self.name = name
        self.experiment = experiment
        self.pars = pars
        self.envs = [GameEnv(pars['subhid']) for i in range(nrenvs)]
        for env in self.envs:
            env.reset()
        self.BATCH_SIZE = pars['bs']
        self.GAMMA = 0.999
        self.rnnB = 3
        self.EPS_START = 0.9
        self.EPS_END = 0.05
        self.alpha = pars['alpha']
        self.EPS_DECAY = 200
        self.TARGET_UPDATE = pars['tg']
        self.nrf = pars['nrf']
        self.capmem = 0
        self.prob = 0.5
        self.idC = 0
        if pars['comm'] == '0':
            self.prob = -1
        if pars['comm'] == '1':
            self.idC = 1
        self.nopr = False
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.build()
        if pars['results_path']:
            result_path = pars['results_path'] + name
            if not os.path.exists(result_path):
                os.makedirs(result_path)
            result_path = result_path + '/results_' + str(0) + '.csv'
            self.result_out = open(result_path, 'w')
            csv_meta = '#' + json.dumps(pars) + '\n'
            self.result_out.write(csv_meta)
            self.writer = csv.DictWriter(self.result_out,
                                         fieldnames=['episode', 'reward'])
            self.writer.writeheader()
        self.steps_done = 0
        self.num_episodes = pars['numep']
        self.lr = pars['lr']
        self.momentum = pars['momentum']
        self.maxR = 0
        self.dru = DRU(0.2, comm_narrow=True, hard=False, device=self.device)

    def build(self):
        self.policy_net = DQN(97, self.pars, rec=self.pars['rec'] == 1).to(self.device)
        self.target_net = DQN(97, self.pars, rec=self.pars['rec'] == 1).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        if self.pars['momentum'] > 0:
            self.optimizer = optim.SGD(self.policy_net.parameters(),
                                       lr=self.pars['lr'],
                                       momentum=self.pars['momentum'])
        else:
            self.optimizer = optim.Adam(self.policy_net.parameters())
        self.memory = ReplayMemory(10000)
        if 'ppe' in self.pars:
            self.memory = Memory(10000)
        if self.pars['load'] is not None:
            self.load(self.pars['load'])
            self.target_net.load_state_dict(self.policy_net.state_dict())
            print('loaded')

    def getComm(self, mes, policy_net, state1_batch):
        # with probability self.prob, replace the zero message with the policy's communication output
        return policy_net(state1_batch, 1, mes)[self.idC] if np.random.rand() < self.prob else mes

    def optimize_model(self, policy_net, target_net, memory, optimizer):
        if self.pars['ppe'] != '1' and len(memory) < self.BATCH_SIZE:
            return
        if self.pars['ppe'] == '1' and self.capmem < self.BATCH_SIZE:
            return
        if self.pars['ppe'] == '1':
            # each stored sample is [state1, action1, next_state1, reward1, state2, ...]
            tree_idx, batch, ISWeights_mb = memory.sample(self.BATCH_SIZE)
            non_final_next_states = torch.cat([i[2] for i in batch])
            state_batch = torch.cat([i[0] for i in batch])
            action_batch = torch.cat([i[1] for i in batch])
            reward_batch = torch.cat([i[3] for i in batch])
            state1_batch = torch.cat([i[4] for i in batch])
        else:
            transitions = memory.sample(self.BATCH_SIZE)
            batch = Transition(*zip(*transitions))
            non_final_next_states = torch.cat(batch.next_state)
            state_batch = torch.cat(batch.state)
            action_batch = torch.cat(batch.action)
            reward_batch = torch.cat(batch.reward)
            state1_batch = torch.cat(batch.agent_index)
        mes = torch.tensor([[0, 0, 0, 0] for i in range(self.BATCH_SIZE)],
                           device=self.device)
        if self.pars['att'] == 1:
            _, comm, att = policy_net(state1_batch, 1, mes)
            if np.random.rand() < 0.0001:
                print(att.cpu().data.numpy()[:10, 0])
        else:
            comm = self.getComm(mes, policy_net, state1_batch)
        if self.pars['dru'] > 0:
            comm = self.dru.forward(comm, True)
        if self.pars['comm'] == '2':
            comm = comm.detach()
        q, _ = policy_net(state_batch, 1, comm)[:2]
        state_action_values = q.gather(1, action_batch)
        next_state_values = target_net(non_final_next_states, 1, mes)[0].max(1)[0].detach()
        expected_state_action_values = (next_state_values * self.GAMMA) + reward_batch.float()
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))
        # loss = weighted_mse_loss(state_action_values,
        #                          expected_state_action_values.unsqueeze(1),
        #                          torch.tensor(ISWeights_mb, device=self.device))
        # print(torch.tensor(ISWeights_mb, device=self.device).size())
        if self.pars['ppe'] == '1':
            absolute_errors = (state_action_values -
                               expected_state_action_values.unsqueeze(1)
                               ).abs().cpu().data.numpy().reshape((-1))
            memory.batch_update(tree_idx, absolute_errors)
        # Optimize the model
        if self.pars['att'] == 1:
            loss = loss + att.mean() * 0.001
        if self.pars['commr'] == 1:
            comm1 = torch.flip(comm.detach(), [0])
            q1 = policy_net(state_batch, 1, comm1)[0]
            # print(comm.detach(), comm1)
            # F.smooth_l1_loss(comm.detach().float(), comm1.float())
            # F.smooth_l1_loss(q1, q)
            dc = 0.1 * ((comm.detach().float() - comm1.float())**2).mean(-1)
            # F.kl_div(comm.detach().float(), comm1.float())
            dq = ((q1 - q)**2).mean(-1)  # F.kl_div(q1, q)
            loss = loss + 0.01 * ((dc - dq)**2).mean()
            if np.random.rand() < 0.0005:
                print('difc', dc.cpu().data.numpy()[:10])
                print('difq', dq.cpu().data.numpy()[:10])
        optimizer.zero_grad()
        loss.backward()
        for param in policy_net.parameters():
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        optimizer.step()

    def select_action(self, state, comm, policy_net):
        sample = random.random()
        eps_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * math.exp(
            -1. * self.steps_done / self.EPS_DECAY)
        self.steps_done += 1
        if sample > eps_threshold:
            with torch.no_grad():
                return policy_net(state, 1, comm)[0].max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(4)]],
                                device=self.device, dtype=torch.long)

    def getaction(self, state1, state2, test=False):
        mes = torch.tensor([[0, 0, 0, 0]], device=self.device)
        if test:
            comm2 = self.policy_net(state2, 0, mes)[self.idC].detach() if 0 < self.prob else mes
            comm1 = self.policy_net(state1, 0, mes)[self.idC].detach() if 0 < self.prob else mes
            if self.pars['dru'] > 0:
                comm1 = self.dru.forward(comm1, False)
                comm2 = self.dru.forward(comm2, False)
            action1 = self.policy_net(state1, 1, comm2)[0].max(1)[1].view(1, 1)
            action2 = self.policy_net(state2, 1, comm1)[0].max(1)[1].view(1, 1)
        else:
            comm2 = self.policy_net(state2, 0, mes)[self.idC].detach() if np.random.rand() < self.prob else mes
            comm1 = self.policy_net(state1, 0, mes)[self.idC].detach() if np.random.rand() < self.prob else mes
            if self.pars['dru'] > 0:
                comm1 = self.dru.forward(comm1, True)
                comm2 = self.dru.forward(comm2, True)
            action1 = self.select_action(state1, comm2, self.policy_net)
            action2 = self.select_action(state2, comm1, self.policy_net)
        return action1, action2, [comm1, comm2]

    def getStates(self, env):
        screen1 = env.render_env_1d(0)  # .transpose((2, 0, 1))
        screen2 = env.render_env_1d(1)  # .transpose((2, 0, 1))
        return (torch.from_numpy(screen1).unsqueeze(0).to(self.device),
                torch.from_numpy(screen2).unsqueeze(0).to(self.device))

    def saveStates(self, state1, state2, action1, action2, next_state1,
                   next_state2, reward1, reward2, env_id, t=False):
        # t: end-of-episode flag passed by train(); optional because pretrain() does not supply it
        self.capmem += 2
        if self.pars['ppe'] != '1':
            self.memory.push(state2, action2, next_state2, reward2, state1)
            self.memory.push(state1, action1, next_state1, reward1, state2)
        else:
            self.rnnS1[-1].append(reward1)
            self.rnnS2[-1].append(reward2)
            if self.pars['rec'] == 1 and len(self.rnnS1) < self.rnnB:
                # always use full-length step windows
                self.rnnS1 = [self.rnnS1[0]] * (self.rnnB - len(self.rnnS1) + 1) + self.rnnS1
                self.rnnS2 = [self.rnnS2[0]] * (self.rnnB - len(self.rnnS2) + 1) + self.rnnS2
            # print(len(self.rnnS1), 111)
            # print(len(self.rnnS1[-self.rnnB:]))
            self.memory.store([state1, action1, next_state1, reward1, state2,
                               self.rnnS1[-self.rnnB:]])
            self.memory.store([state2, action2, next_state2, reward2, state1,
                               self.rnnS2[-self.rnnB:]])

    def optimize(self):
        self.optimize_model(self.policy_net, self.target_net, self.memory,
                            self.optimizer)

    def getDB(self):
        with open(self.pars['pretrain']) as f:
            data = json.load(f)
        # self.pars['pretrain'] = None
        self.nopr = True
        return data

    def pretrain(self):
        db = self.getDB()
        num_episodes = len(db)
        nr = len(db) - 1
        totalN = len(db)
        sc = 5
        for i_episode in range(num_episodes):
            if i_episode % 10:
                sc -= 1
            sc = max(1, sc)
            if self.job is not None and self.job.stopEx:
                return
            for env_id, env in enumerate(self.envs[:1]):
                for i in db[nr:]:
                    env.getFrom(i[0])
                    state1, state2 = self.getStates(env)
                    env.getFrom(i[3])
                    next_state1, next_state2 = self.getStates(env)
                    action1 = torch.tensor([[i[1][0]]], device=self.device)
                    action2 = torch.tensor([[i[1][1]]], device=self.device)
                    reward1 = torch.tensor([i[2][0] * 1.], device=self.device)
                    reward2 = torch.tensor([i[2][1] * 1.], device=self.device)
                    self.saveStates(state1, state2, action1, action2,
                                    next_state1, next_state2, reward1,
                                    reward2, env_id)
                for _ in range((totalN - nr) * sc):
                    self.bufs = [[] for i in range(len(self.envs) * 2)]
                    bs = 1
                    self.h2 = torch.zeros(1, bs, self.pars['en'], device=self.device)
                    self.h1 = torch.zeros(1, bs, self.pars['en'], device=self.device)
                    env.getFrom(db[nr][0])
                    self.buf1 = []
                    self.buf2 = []
                    state1, state2 = self.getStates(env)
                    rt = 0
                    ac = []
                    start_time = time.time()
                    buf1 = []
                    buf2 = []
                    ep1 = []
                    for t in range(totalN - nr):
                        action1, action2, _ = self.getaction(state1, state2)
                        reward1, reward2 = env.move(action1.item(), action2.item())  # multi envs??
                        rt += reward1 + reward2
                        ac.append(str(action1.item()))
                        # mix the raw rewards symmetrically between the two agents
                        r1, r2 = reward1, reward2
                        reward1 = torch.tensor([r1 * self.alpha + r2 * (1 - self.alpha)],
                                               device=self.device)
                        reward2 = torch.tensor([r2 * self.alpha + r1 * (1 - self.alpha)],
                                               device=self.device)
                        next_state1, next_state2 = self.getStates(env)
                        self.saveStates(state1, state2, action1, action2,
                                        next_state1, next_state2, reward1,
                                        reward2, env_id)
                        state1 = next_state1
                        state2 = next_state2
                        self.optimize()
                        self.updateTarget(i_episode, step=True)
            if i_episode % self.pars['show'] == 0:
                print('ep', i_episode, 'reward train', rt, 'time',
                      time.time() - start_time, ','.join(ac[:20]))
            self.updateTarget(i_episode)
            nr -= 1
            if nr < 0:
                nr = 0

    def getInitState(self):
        return torch.zeros(1, 1, self.pars['en'], device=self.device)

    def train(self, num_episodes):
        if self.pars['pretrain'] is not None and not self.nopr:
            self.pretrain()
        for i_episode in range(num_episodes):
            if self.job is not None and self.job.stopEx:
                return
            self.bufs = [[] for i in range(len(self.envs) * 2)]
            bs = 1
            self.h2 = torch.zeros(1, bs, self.pars['en'], device=self.device)
            self.h1 = torch.zeros(1, bs, self.pars['en'], device=self.device)
            for env_id, env in enumerate(self.envs):
                env.reset()
                self.buf1 = []
                self.buf2 = []
                state1, state2 = self.getStates(env)
                rt = 0
                ac = []
                start_time = time.time()
                buf1 = []
                buf2 = []
                ep1 = []
                self.rnnS1 = []
                self.rnnS2 = []
                self.reset_hx()
                for t in range(self.pars['epsteps']):
                    action1, action2, _ = self.getaction(state1, state2)
                    reward1, reward2 = env.move(action1.item(), action2.item())  # multi envs??
                    rt += reward1 + reward2
                    ac.append(str(action1.item()))
                    # mix the raw rewards symmetrically between the two agents
                    r1, r2 = reward1, reward2
                    reward1 = torch.tensor([r1 * self.alpha + r2 * (1 - self.alpha)],
                                           device=self.device)
                    reward2 = torch.tensor([r2 * self.alpha + r1 * (1 - self.alpha)],
                                           device=self.device)
                    next_state1, next_state2 = self.getStates(env)
                    self.saveStates(state1, state2, action1, action2,
                                    next_state1, next_state2, reward1, reward2,
                                    env_id, t == self.pars['epsteps'] - 1)
                    state1 = next_state1
                    state2 = next_state2
                    self.optimize()
                    self.updateTarget(i_episode, step=True)
            if i_episode % self.pars['show'] == 0:
                print('ep', i_episode, 'reward train', rt, 'time',
                      time.time() - start_time, ','.join(ac[:20]))
            self.updateTarget(i_episode)

    def test(self, tries=3, log=True, i_episode=-1):
        rs = []
        rt1 = 0
        ep = []
        # self.policy_net.eval()
        for i in range(tries):
            ep = []
            rt1 = 0
            self.envs[0].reset()
            bs = 1
            self.h2 = torch.zeros(1, bs, self.pars['en'], device=self.device)
            self.h1 = torch.zeros(1, bs, self.pars['en'], device=self.device)
            self.reset_hx()
            for t in range(100):  # sep function
                state1, state2 = self.getStates(self.envs[0])
                action1, action2, r = self.getaction(state1, state2, test=True)
                comm1, comm2 = r
                reward1, reward2 = self.envs[0].move(action1.item(), action2.item())
                ep.append([self.envs[0].render_env(),
                           [action1.item(), action2.item()],
                           [reward1, reward2],
                           [comm1.cpu().data.numpy()[0].tolist(),
                            comm2.cpu().data.numpy()[0].tolist()],
                           [comm1.max(1)[1].item(), comm2.max(1)[1].item()]])
                rt1 += reward1 + reward2
            rs.append(rt1)
        rm = np.mean(rs)
        if log and i_episode > 0:
            if self.job is not None:
                self.job.log({'reward' + self.name: rm, 'ep': i_episode})
            if self.experiment is not None:
                self.experiment.set_step(i_episode)
                self.experiment.log_metric("reward" + self.name, rm)
            save_episode_and_reward_to_csv(self.result_out, self.writer,
                                           i_episode, rt1, ep, self.name,
                                           self.pars)
            if rm > self.maxR:
                self.maxR = rm
                self.save()
                print('saved')
        print('reward test', rm, rs, 'com', comm1.cpu().data.numpy()[0].tolist())
        # self.policy_net.train()
        return rm

    def updateTarget(self, i_episode, step=False):
        # soft_update(self.target_net, self.policy_net, tau=0.01)
        if step:
            return
        if i_episode % self.TARGET_UPDATE == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())

    def reset_hx(self):
        pass

    def save(self):
        torch.save(self.policy_net.state_dict(),
                   self.pars['results_path'] + self.name + '/model')

    def load(self, PATH):
        # torch.cuda.is_available()
        self.policy_net.load_state_dict(
            torch.load(PATH,
                       map_location='cuda' if torch.cuda.is_available() else 'cpu'))

    def perturb_learning_rate(self, i_episode, nolast=True):
        if nolast:
            new_lr_factor = 10**np.random.normal(scale=1.0)
            new_momentum_delta = np.random.normal(scale=0.1)
            self.EPS_DECAY += np.random.normal(scale=50.0)
            if self.EPS_DECAY < 50:
                self.EPS_DECAY = 50
            if self.prob >= 0:
                self.prob += np.random.normal(scale=0.05) - 0.025
                self.prob = min(max(0, self.prob), 1)
        for param_group in self.optimizer.param_groups:
            if nolast:
                param_group['lr'] *= new_lr_factor
                param_group['momentum'] += new_momentum_delta
            self.momentum = param_group['momentum']
            self.lr = param_group['lr']
        with open(os.path.join(self.pars['results_path'] + self.name,
                               'hyper-{}.json').format(i_episode), 'w') as outfile:
            json.dump({'lr': self.lr,
                       'momentum': self.momentum,
                       'eps_decay': self.EPS_DECAY,
                       'prob': self.prob,
                       'i_episode': i_episode}, outfile)

    def clone(self, agent):
        state_dict = agent.policy_net.state_dict()
        self.policy_net.load_state_dict(state_dict)
        state_dict = agent.optimizer.state_dict()
        self.optimizer.load_state_dict(state_dict)
        self.EPS_DECAY = agent.EPS_DECAY
        self.prob = agent.prob
        self.target_net.load_state_dict(self.policy_net.state_dict())
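# Generic target-network update helpers, shown as a sketch of the standard
# technique that updateTarget() above applies (the hard copy every
# TARGET_UPDATE episodes) and that the commented-out soft_update call refers
# to. The repo's own soft_update helper is not shown here and may differ.
import torch


def hard_update(target_net: torch.nn.Module, policy_net: torch.nn.Module) -> None:
    # Copy the online parameters into the target network wholesale.
    target_net.load_state_dict(policy_net.state_dict())


def soft_update(target_net: torch.nn.Module, policy_net: torch.nn.Module,
                tau: float = 0.01) -> None:
    # Polyak averaging: theta_target <- tau * theta_online + (1 - tau) * theta_target.
    with torch.no_grad():
        for t_param, p_param in zip(target_net.parameters(), policy_net.parameters()):
            t_param.data.mul_(1.0 - tau).add_(tau * p_param.data)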
class DDPG_Agent():
    def __init__(self, args, env_params):
        self.s_dim = env_params['o_dim'] + env_params['g_dim']
        self.a_dim = env_params['a_dim']
        self.f_dim = args.f_dim
        self.action_bound = env_params['action_max']
        self.max_timestep = env_params['max_timestep']
        self.max_episode = args.max_episode
        self.evaluate_episode = args.evaluate_episode
        self.evaluate_interval = args.evaluate_interval
        self.log_interval = args.log_interval
        self.save_model_interval = args.save_model_interval
        self.save_model_start = args.save_model_start
        self.lr = args.lr
        self.lr_model = args.lr_model
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.eta = args.eta
        self.noise_eps = args.noise_eps
        self.device = torch.device(args.device)

        self.normalizer_s = Normalizer(size=self.s_dim, eps=1e-2, clip_range=1.)
        self.memory = Memory(size=args.memory_size, s_dim=self.s_dim, a_dim=self.a_dim)

        self.policy = Policy(s_dim=self.s_dim, a_dim=self.a_dim).to(self.device)
        self.policy_target = Policy(s_dim=self.s_dim, a_dim=self.a_dim).to(self.device)
        self.Q = QFunction(s_dim=self.s_dim, a_dim=self.a_dim).to(self.device)
        self.Q_target = QFunction(s_dim=self.s_dim, a_dim=self.a_dim).to(self.device)
        self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.optimizer_q = optim.Adam(self.Q.parameters(), lr=self.lr)

        self.encoder = StateEncoder(s_dim=self.s_dim, f_dim=self.f_dim).to(self.device)
        self.EnvForward = ForwardModel(f_dim=self.f_dim, a_dim=self.a_dim).to(self.device)
        self.EnvInverse = InverseModel(f_dim=self.f_dim, a_dim=self.a_dim).to(self.device)
        self.optimizer_forward = optim.Adam(
            [{'params': self.EnvForward.parameters()},
             {'params': self.encoder.parameters()}], lr=self.lr_model)
        self.optimizer_inverse = optim.Adam(
            [{'params': self.EnvInverse.parameters()},
             {'params': self.encoder.parameters()}], lr=self.lr_model)

        self.hard_update()
        self.update_num = 0

    def select_action(self, state, train_mode=True):
        s = self.normalize_input(state)
        s = torch.tensor(s, dtype=torch.float32).to(self.device)  # use the normalized state
        with torch.no_grad():
            action = self.policy(s).cpu().numpy()
        if train_mode:
            action += np.random.randn(self.a_dim) * self.noise_eps * self.action_bound  # Gaussian noise
        action = np.clip(action, a_min=-self.action_bound, a_max=self.action_bound)
        return action

    def get_intrisic_reward(self, s, a, s_):
        s, a, s_ = (torch.from_numpy(s).to(self.device).float(),
                    torch.from_numpy(a).to(self.device).float(),
                    torch.from_numpy(s_).to(self.device).float())
        with torch.no_grad():
            feature = self.encoder(s)
            next_feature_pred = self.EnvForward(feature, a)
            next_feature = self.encoder(s_)
            r_i = self.eta * torch.norm(next_feature_pred - next_feature)
            r_i = torch.clamp(r_i, min=-0.1, max=0.1)
        return r_i.cpu().detach().numpy()

    def train(self, env, logger=None):
        total_step = 0
        loss_pi, loss_q, loss_forward, loss_inverse = 0., 0., 0., 0.
        for i_episode in range(self.max_episode):
            obs = env.reset()
            s = get_state(obs)
            cumulative_r = 0.
            for i_step in range(self.max_timestep):
                a = self.select_action(s)
                obs_, r_e, done, info = env.step(a)
                s_ = get_state(obs_)
                r_i = self.get_intrisic_reward(s, a, s_)
                r = r_e + r_i
                self.memory.store(s, a, r, s_)
                s = s_
                if len(self.memory) > self.batch_size:
                    loss_pi, loss_q, loss_forward, loss_inverse = self.learn()
                cumulative_r += r_e
                total_step += 1
            print('i_episode: {} total step: {} cumulative reward: {:.4f} is_success: {} '
                  .format(i_episode, total_step, cumulative_r, info['is_success']))
            if logger is not None and i_episode % self.log_interval == 0:
                logger.add_scalar('Indicator/cumulative reward', cumulative_r, i_episode)
                logger.add_scalar('Loss/pi_loss', loss_pi, i_episode)
                logger.add_scalar('Loss/q_loss', loss_q, i_episode)
                logger.add_scalar('Loss/forward_loss', loss_forward, i_episode)
                logger.add_scalar('Loss/inverse_loss', loss_inverse, i_episode)
            if i_episode % self.evaluate_interval == 0:
                success_rate = self.evaluate(env)
                if logger is not None:
                    logger.add_scalar('Indicator/success rate', success_rate, i_episode)
            if i_episode > self.save_model_start and i_episode % self.save_model_interval == 0:
                self.save_model(remarks='{}_{}'.format(env.spec.id, i_episode))

    def evaluate(self, env, render=False):
        success_count = 0
        for i_episode in range(self.evaluate_episode):
            obs = env.reset()
            s = get_state(obs)
            for i_step in range(self.max_timestep):
                if render:
                    env.render()
                a = self.select_action(s, train_mode=False)
                obs_, r_e, done, info = env.step(a)
                s_ = get_state(obs_)
                s = s_
            success_count += info['is_success']
        return success_count / self.evaluate_episode

    def learn(self):
        s, a, r, s_ = self.memory.sample_batch(batch_size=self.batch_size)
        self.normalizer_s.update(s)
        s, s_ = self.normalize_input(s, s_)
        s = torch.from_numpy(s).to(self.device)
        a = torch.from_numpy(a).to(self.device)
        r = torch.from_numpy(r).to(self.device).unsqueeze(dim=1)
        s_ = torch.from_numpy(s_).to(self.device)

        # update policy and Q
        with torch.no_grad():
            a_next_tar = self.policy_target(s_)
            Q_next_tar = self.Q_target(s_, a_next_tar)
            loss_q_tar = r + self.gamma * Q_next_tar
        loss_q_pred = self.Q(s, a)
        loss_q = F.mse_loss(loss_q_pred, loss_q_tar.detach())
        self.optimizer_q.zero_grad()
        loss_q.backward()
        self.optimizer_q.step()

        loss_p = -self.Q(s, self.policy(s)).mean()
        self.optimizer_p.zero_grad()
        loss_p.backward()
        self.optimizer_p.step()
        self.soft_update()

        # update env model and encoder
        feature = self.encoder(s)
        next_feature = self.encoder(s_)
        a_pred = self.EnvInverse(feature, next_feature)
        loss_inverse = F.mse_loss(a_pred, a)
        next_feature_pred = self.EnvForward(feature, a)
        with torch.no_grad():
            next_feature_tar = self.encoder(s_)
        loss_forward = F.mse_loss(next_feature_pred, next_feature_tar.detach())
        self.optimizer_forward.zero_grad()
        self.optimizer_inverse.zero_grad()
        loss_forward.backward(retain_graph=True)
        loss_inverse.backward()
        self.optimizer_forward.step()
        self.optimizer_inverse.step()

        self.update_num += 1
        return (loss_p.cpu().detach().numpy(), loss_q.cpu().detach().numpy(),
                loss_forward.cpu().detach().numpy(), loss_inverse.cpu().detach().numpy())

    def update_normalizer(self, states):
        states = np.array(states, dtype=np.float32)
        self.normalizer_s.update(states)

    def hard_update(self):
        self.policy_target.load_state_dict(self.policy.state_dict())
        self.Q_target.load_state_dict(self.Q.state_dict())

    def soft_update(self):
        for param, param_target in zip(self.policy.parameters(),
                                       self.policy_target.parameters()):
            param_target.data.copy_(param.data * self.tau +
                                    param_target.data * (1 - self.tau))
        for param, param_target in zip(self.Q.parameters(),
                                       self.Q_target.parameters()):
            param_target.data.copy_(param.data * self.tau +
                                    param_target.data * (1 - self.tau))

    def normalize_input(self, s, s_=None):
        s = self.normalizer_s.normalize(s)
        if s_ is not None:
            s_ = self.normalizer_s.normalize(s_)
            return s, s_
        else:
            return s

    def save_model(self, remarks):
        if not os.path.exists('pretrained_models_DDPG/'):
            os.mkdir('pretrained_models_DDPG/')
        path = 'pretrained_models_DDPG/{}.pt'.format(remarks)
        print('Saving model to {}'.format(path))
        torch.save([self.normalizer_s.mean, self.normalizer_s.std,
                    self.policy.state_dict()], path)

    def load_model(self, remark):
        print('Loading models with remark {}'.format(remark))
        self.normalizer_s.mean, self.normalizer_s.std, policy_model = torch.load(
            'pretrained_models_DDPG/{}.pt'.format(remark),
            map_location=lambda storage, loc: storage)
        self.policy.load_state_dict(policy_model)
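# A self-contained sketch of the forward-model curiosity bonus that
# get_intrisic_reward() above computes: r_i = eta * || phi(s') - f(phi(s), a) ||,
# clipped to [-0.1, 0.1]. The small linear modules here are stand-ins for the
# repo's StateEncoder and ForwardModel, not the actual implementations.
import torch
import torch.nn as nn

s_dim, a_dim, f_dim, eta = 10, 4, 16, 0.1
encoder = nn.Linear(s_dim, f_dim)                # stand-in for StateEncoder
forward_model = nn.Linear(f_dim + a_dim, f_dim)  # stand-in for ForwardModel

s = torch.randn(s_dim)
a = torch.randn(a_dim)
s_next = torch.randn(s_dim)

with torch.no_grad():
    feature = encoder(s)
    next_feature_pred = forward_model(torch.cat([feature, a]))
    next_feature = encoder(s_next)
    r_i = eta * torch.norm(next_feature_pred - next_feature)
    r_i = torch.clamp(r_i, min=-0.1, max=0.1)
print('intrinsic reward:', r_i.item())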
class Agent():
    def __init__(self, name, n_o, n_a):
        self.agent_name = name  # name of the agent: 'predator' or 'prey'
        self.dtype = torch.float
        self.obs_dim = n_o  # observation space dimensions
        self.act_dim = n_a  # action space dimensions
        self.GAMMA = 0.99  # discount factor
        self.ALPHA = 1e-3  # learning rate
        self.BATCH_SIZE = 16  # number of samples in a batch
        self.BUFFER_SIZE = 10000
        self.TAU = 0.001
        self.buffer = Memory(self.BUFFER_SIZE)
        self.noise = OU_noise(dt=0.05)
        self.online_actor = Actor(self.obs_dim, self.act_dim)
        self.online_critic = Critic(self.obs_dim, 1)
        self.target_actor = Actor(self.obs_dim, self.act_dim)
        self.target_critic = Critic(self.obs_dim, 1)
        '''Copy online parameters to target parameters'''
        self.target_actor.policy.load_state_dict(
            self.online_actor.policy.state_dict())
        self.target_critic.value_func.load_state_dict(
            self.online_critic.value_func.state_dict())
        self.optim = torch.optim.Adam(
            self.online_critic.value_func.parameters(), lr=self.ALPHA)
        # temporary holding of variables before they are pushed to the replay buffer
        self.state = None  # 2D tensor [[state]]
        self.act = None    # 2D tensor [[action]]

    def agent_get_action(self, observation):
        assert type(observation) is torch.Tensor, \
            'The state is not a tensor! It is of type: %s' % (type(observation))
        self.state = observation
        # get the action from the online actor network and add noise for exploration
        action_d = self.online_actor.get_action(self.state) + self.noise.step()
        self.act = action_d.mean().unsqueeze(0).unsqueeze(1)
        # return the action to the world
        return action_d.detach().numpy()

    def agent_train(self, ns, r, done=False):
        # convert next state and reward to tensors
        # next_state_v = torch.tensor([next_state], dtype=dtype)
        # reward_v = torch.tensor([reward], dtype=dtype)
        # save the values in the replay buffer
        self.buffer.push(self.state, self.act, r, ns, done)
        # set the state to the next state to advance the agent
        self.state = ns
        # if there are enough samples in the replay buffer, perform network updates
        if len(self.buffer) >= self.BUFFER_SIZE:
            # get a mini-batch from the replay buffer and unpack it
            sample = self.buffer.sample(self.BATCH_SIZE)
            compressed_states, compressed_actions, compressed_next_states, compressed_rewards = utils.extract_data(sample)

            # critic network training
            # target: y_t = r(s_t, a_t) + gamma * Q(s_{t+1}, mu(s_{t+1}))
            na_from_tactor_a = self.target_actor.get_action(compressed_next_states)
            na_from_tactor = na_from_tactor_a.mean(dim=1).unsqueeze(-1)
            v_from_tcritic = self.target_critic.get_state_value(
                compressed_next_states, na_from_tactor)
            target_v = compressed_rewards.unsqueeze(1) + self.GAMMA * v_from_tcritic
            actual_v = self.online_critic.get_state_value(
                compressed_states, compressed_actions)
            loss = nn.MSELoss()
            output = loss(actual_v, target_v)
            self.optim.zero_grad()
            output.backward(retain_graph=True)
            self.optim.step()
            self.online_critic.value_func.zero_grad()

            # actor update: move the policy along the critic's gradient with respect to the action
            for s, a in zip(compressed_states.split(1), compressed_actions.split(1)):
                online_v = self.online_critic.get_state_value(s, a)
                grad_wrt_a = torch.autograd.grad(online_v, (s, a))
                action = self.online_actor.get_action(s)
                action.mean().backward(retain_graph=True)
                for param in self.online_actor.policy.parameters():
                    param.data += self.ALPHA * (param.grad * grad_wrt_a[1].item()) / self.BATCH_SIZE
                self.online_actor.policy.zero_grad()
                self.online_critic.value_func.zero_grad()

            # soft update of the target networks
            for param_o, param_t in zip(self.online_actor.policy.parameters(),
                                        self.target_actor.policy.parameters()):
                param_t.data = param_o.data * self.TAU + param_t.data * (1 - self.TAU)
            for param_o, param_t in zip(self.online_critic.value_func.parameters(),
                                        self.target_critic.value_func.parameters()):
                param_t.data = param_o.data * self.TAU + param_t.data * (1 - self.TAU)

            self.online_actor.policy.zero_grad()
            self.target_actor.policy.zero_grad()
            self.online_critic.value_func.zero_grad()
            self.target_critic.value_func.zero_grad()

            torch.save(self.target_actor.policy.state_dict(),
                       self.agent_name + 'target_actor_state_1.pt')
            torch.save(self.target_critic.value_func.state_dict(),
                       self.agent_name + 'target_critic_state_1.pt')
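# A sketch of an Ornstein-Uhlenbeck exploration-noise process like the OU_noise
# used in the agent above. Only dt=0.05 appears in the original; theta and sigma
# here are common defaults, not the repo's actual parameters.
import numpy as np


class OUNoiseSketch:
    def __init__(self, size=1, mu=0.0, theta=0.15, sigma=0.2, dt=0.05):
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt
        self.x = np.ones(size) * mu

    def step(self):
        # Euler-Maruyama update: dx = theta*(mu - x)*dt + sigma*sqrt(dt)*N(0, 1)
        dx = (self.theta * (self.mu - self.x) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.x.shape))
        self.x = self.x + dx
        return self.x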
def train_a2c(env_name=ENV_NAME,
              iterations=ITERATIONS,
              gamma=GAMMA,
              a_lr=A_LR,
              c_lr=C_LR,
              stats_freq=STATS_FREQ,
              batch_size=BATCH_SIZE,
              reward_done=REWARD_DONE,
              num_target_updates=NUM_TARGET_UPDATES,
              num_critic_updates=NUM_CRITIC_UPDATES,
              normalize_adv=NORMALIZE_ADV,
              filename=FILENAME,
              use_gpu=USE_GPU):
    if use_gpu and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    env = gym.make(env_name)
    discrete = isinstance(env.action_space, gym.spaces.Discrete)
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    actor = Actor(ob_dim, ac_dim, discrete)
    critic = Critic(ob_dim)
    actor.to(device)
    critic.to(device)
    actor_optimizer = torch.optim.Adam(actor.parameters(), lr=a_lr)
    critic_optimizer = torch.optim.Adam(critic.parameters(), lr=c_lr)

    logger = StatsLogger()
    stats = []
    time_list = []
    for i in range(iterations):
        time = datetime.datetime.now()
        buffer = Memory()
        collect_batch(env, actor, buffer, batch_size, device)
        advantages = update_critic(critic, critic_optimizer, buffer,
                                   gamma=gamma,
                                   num_target_updates=num_target_updates,
                                   num_critic_updates=num_critic_updates,
                                   device=device)
        update_actor(actor, actor_optimizer, advantages, buffer,
                     normalize_adv=normalize_adv)
        running_reward = logger.calc_running_reward(buffer)
        time_after = datetime.datetime.now()
        time_diff = time_after - time
        time_list.append(time_diff.total_seconds())
        if not i % stats_freq:
            logger.print_running_reward(i)
            stats.append([i, logger.running_reward])
            print('Average iteration is', sum(time_list) / len(time_list), 'seconds')
            time_list = []
        if reward_done is not None and running_reward >= reward_done:
            logger.task_done(i)
            break

    if filename is not None:
        with open(filename + '_logs.pkl', 'wb') as f:
            pkl.dump(stats, f)
        torch.save(actor.state_dict(), filename + '_model.pt')
        torch.save(critic.state_dict(), filename + '_critic_model.pt')
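# A sketch of the discounted-return and advantage computation that a critic
# update such as update_critic() typically bootstraps from (update_critic itself
# is not shown here, so this only illustrates the standard recursion
# G_t = r_t + gamma * G_{t+1}, reset at episode boundaries, plus the optional
# advantage normalization controlled by normalize_adv).
import torch


def discounted_returns(rewards, dones, gamma=0.99):
    # iterate backwards, zeroing the bootstrap whenever an episode ended
    returns = []
    g = 0.0
    for r, done in zip(reversed(rewards), reversed(dones)):
        if done:
            g = 0.0
        g = r + gamma * g
        returns.append(g)
    returns.reverse()
    return torch.tensor(returns, dtype=torch.float32)


def normalized_advantages(returns, values, eps=1e-8):
    # advantage = return - baseline, standardized to zero mean and unit variance
    adv = returns - values
    return (adv - adv.mean()) / (adv.std() + eps)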