# Common imports for the classes below. The project-local modules providing ActorNet,
# CriticNet, LearnerReplayMemory, ReplayMemory, env_cover, get_obs, calc_priority,
# invertical_vf and time_check are assumed to be importable from the repository itself.
import os
import random
from collections import deque
from copy import deepcopy
from time import sleep, time

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import visdom

from dm_control import suite  # used by the stand-alone walker-run variants below

vis = visdom.Visdom()  # assumed module-level Visdom client used for the loss/reward plots


# Learner for the shared-memory / shared-queue pipeline: experience arrives through
# shared_queue and updated weights are published back through shared_state.
class Learner:
    def __init__(self, learner_id, config, dev, shared_state, shared_queue):
        self.action_size = config['action_space']
        self.obs_size = config['obs_space']
        self.shared_queue = shared_queue
        self.shared_state = shared_state
        self.dev = dev
        self.id = learner_id
        self.burn_in_length = config['burn_in_length']  # 40-80
        self.learning_length = config['learning_length']
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = config['n_step']
        self.sequence = []
        self.recurrent_state = []
        self.priority = []
        self.td_loss = deque(maxlen=self.learning_length)
        self.gamma = config['gamma']
        # self.actor_parameter_update_interval = config['actor_parameter_update_interval']

        self.actor = ActorNet(dev, config).to(self.dev)
        self.target_actor = ActorNet(dev, config).to(self.dev)
        self.critic = CriticNet(dev, config).to(self.dev)
        self.target_critic = CriticNet(dev, config).to(self.dev)
        self.actor.load_state_dict(self.shared_state["actor"].state_dict())
        self.target_actor.load_state_dict(self.shared_state["target_actor"].state_dict())
        self.critic.load_state_dict(self.shared_state["critic"].state_dict())
        self.target_critic.load_state_dict(self.shared_state["target_critic"].state_dict())
        # self.actor.load_state_dict(self.shared_state["actor"])
        # self.target_actor.load_state_dict(self.shared_state["target_actor"])
        # self.critic.load_state_dict(self.shared_state["critic"])
        # self.target_critic.load_state_dict(self.shared_state["target_critic"])

        self.learner_actor_rate = config['learner_actor_rate']
        self.num_actors = learner_id  # the first constructor argument doubles as the actor count
        self.n_actions = 1
        self.max_frame = config['learner_max_frame']
        self.memory_sequence_size = config['memory_sequence_size']
        self.batch_size = config['batch_size']
        self.memory = LearnerReplayMemory(self.memory_sequence_size, config, dev)
        self.model_path = './'
        # self.memory_path = './memory_data/'
        # self.model_save_interval = 10  # 50
        self.learner_parameter_update_interval = config['learner_parameter_update_interval']  # 50
        self.target_update_interval = config['target_update_interval']  # 100
        self.actor_lr = config['actor_lr']
        self.critic_lr = config['critic_lr']
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
        self.actor_criterion = nn.MSELoss()
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.critic_lr)
        self.critic_criterion = nn.MSELoss()

    def __del__(self):
        self.shared_queue.close()
        self.shared_state.close()
        # self.save_model()

    def save_model(self):
        model_dict = {
            'actor': self.actor.state_dict(),
            'target_actor': self.target_actor.state_dict(),
            'critic': self.critic.state_dict(),
            'target_critic': self.target_critic.state_dict()
        }
        torch.save(model_dict, self.model_path + 'model.pt')

    def update_target_model(self):
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

    def run(self):
        time_check(-1)
        # Block until the replay memory holds at least one batch of sequences.
        while self.memory.size() < self.batch_size:
            self.memory.append(self.shared_queue.get(block=True))
            # self.memory.append(self.shared_queue.get())
            print('\rmem size: ', self.memory.size(), end='\r')
        time_check(1)

        count_mem = 0
        frame = 0
        win_v = vis.line(Y=torch.Tensor([0]), opts=dict(title='V_loss'))
        win_p = vis.line(Y=torch.Tensor([0]), opts=dict(title='P_loss'))

        while frame < self.max_frame:
            # sleep(0.0001)
            # Alternative (commented-out) queue-draining strategies kept from development:
            # if self.shared_queue.qsize() == 0 and count_mem < 0:
            #     self.memory.append(self.shared_queue.get(block=True))
            # for i in range(self.shared_queue.qsize()):
            #     self.memory.append(self.shared_queue.get(block=False))
            #     count_mem += self.learner_actor_rate
            # print('waiting shared q {}/{}'.format(self.memory.size(), self.batch_size))
            # self.shared_state['frame'][self.id] = frame
            # while self.shared_state['sleep'][self.id]:
            #     sleep(0.5)
            if self.shared_queue.qsize() != 0:
                self.memory.append(self.shared_queue.get(block=True))
            frame += 1
            count_mem -= 1

            (episode_index, sequence_index, obs_seq, action_seq, reward_seq, gamma_seq,
             a_state, ta_state, c_state, tc_state) = self.memory.sample()
            self.actor.set_state(a_state[0], a_state[1])
            self.target_actor.set_state(ta_state[0], ta_state[1])
            self.critic.set_state(c_state[0], c_state[1])
            self.target_critic.set_state(tc_state[0], tc_state[1])

            ### burn-in step ###
            _ = [self.actor(obs_seq[i]) for i in range(self.burn_in_length)]
            _ = [self.critic(obs_seq[i], action_seq[i])
                 for i in range(self.burn_in_length)]
            _ = [self.target_actor(obs_seq[i])
                 for i in range(self.burn_in_length + self.n_step)]
            _ = [self.target_critic(obs_seq[i], action_seq[i])
                 for i in range(self.burn_in_length + self.n_step)]

            ### learning steps ###
            # update critic
            q_value = torch.zeros(self.learning_length * self.batch_size,
                                  self.n_actions).to(self.dev)
            target_q_value = torch.zeros(self.learning_length * self.batch_size,
                                         self.n_actions).to(self.dev)
            for i in range(self.learning_length):
                obs_i = self.burn_in_length + i
                next_obs_i = self.burn_in_length + i + self.n_step
                q_value[i * self.batch_size:(i + 1) * self.batch_size] = \
                    self.critic(obs_seq[obs_i], action_seq[obs_i])
                with torch.no_grad():
                    next_q_value = self.target_critic(
                        obs_seq[next_obs_i], self.target_actor(obs_seq[next_obs_i]))
                target_q_val = reward_seq[obs_i] + \
                    (gamma_seq[next_obs_i] ** self.n_step) * next_q_value
                # target_q_val = invertical_vf(target_q_val)
                target_q_value[i * self.batch_size:(i + 1) * self.batch_size] = target_q_val
            critic_loss = self.critic_criterion(q_value, target_q_value.detach())
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # update actor
            self.actor.reset_state()
            self.critic.reset_state()
            actor_loss = torch.zeros(self.learning_length * self.batch_size,
                                     self.n_actions).to(self.dev)
            for i in range(self.learning_length):
                obs_i = i + self.burn_in_length
                # use the action computed here; calling the recurrent actor twice per step
                # would advance its hidden state twice
                action = self.actor(obs_seq[obs_i])
                actor_loss[i * self.batch_size:(i + 1) * self.batch_size] = \
                    -self.critic(obs_seq[obs_i], action)
            actor_loss = actor_loss.mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # update target networks
            if frame % self.target_update_interval == 0:
                self.update_target_model()

            print('#', frame, 'critic_loss:', critic_loss.item(),
                  ' actor_loss:', actor_loss.item(), ' count:', count_mem)
            win_p = vis.line(X=torch.Tensor([frame]),
                             Y=torch.Tensor([actor_loss.item()]),
                             win=win_p, update='append')
            win_v = vis.line(X=torch.Tensor([frame]),
                             Y=torch.Tensor([critic_loss.item()]),
                             win=win_v, update='append')

            # calc priority (a sketch of a typical calc_priority() follows this class)
            average_td_loss = ((q_value - target_q_value) ** 2).detach().to(self.dev)
            # average_td_loss = np.mean(((q_value - target_q_value)**2).detach().cpu().numpy(), axis=1)
            for i in range(len(episode_index)):
                td = average_td_loss[i:-1:self.batch_size]
                self.memory.priority[episode_index[i]][sequence_index[i]] = \
                    calc_priority(td).cpu().view(1, -1)
                self.memory.total_priority[episode_index[i]] = torch.cat(
                    self.memory.priority[episode_index[i]]).sum(0).view(1, -1)
                # self.memory.priority[episode_index[i]][sequence_index[i]] = calc_priority(td)
                # self.memory.total_priority[episode_index[i]] = sum(self.memory.priority[episode_index[i]])

            # if frame % self.model_save_interval == 0:
            #     self.save_model()

            if frame % self.learner_parameter_update_interval == 0:
                # Publish the latest weights to the actors through shared_state.
                # Earlier (commented-out) variants copied state dicts key by key:
                # for k, v in self.actor.state_dict().items():
                #     self.shared_state["actor"][k] = v.cpu()
                # (and likewise for target_actor, critic and target_critic)
                self.shared_state["actor"].load_state_dict(self.actor.state_dict())
                self.shared_state["critic"].load_state_dict(self.critic.state_dict())
                self.shared_state["target_actor"].load_state_dict(self.target_actor.state_dict())
                self.shared_state["target_critic"].load_state_dict(self.target_critic.state_dict())
                for i in range(self.num_actors):
                    self.shared_state["update"][i] = True
                print('learner_update', self.actor.policy_l0.weight.data[0][0])

            self.actor.reset_state()
            self.target_actor.reset_state()
            self.critic.reset_state()
            self.target_critic.reset_state()
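# --- Added sketch (not part of the original source) ---------------------------------
# The learner above turns per-sequence TD errors into replay priorities through a
# calc_priority() helper defined elsewhere in the project. A typical R2D2-style choice,
# shown here only as an assumption about what that helper does, is the mixture
# p = eta * max_t |delta_t| + (1 - eta) * mean_t |delta_t|:

def calc_priority_sketch(td_errors: torch.Tensor, eta: float = 0.9) -> torch.Tensor:
    """Hypothetical stand-in for the project's calc_priority(); returns a scalar tensor."""
    abs_td = td_errors.abs()  # note: the learner above already passes squared TD errors
    return eta * abs_td.max() + (1. - eta) * abs_td.mean()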
# Stand-alone Learner variant for the dm_control walker-run task. Experience and weights
# are exchanged with the actors through files (./memory_data/ and ./model_data/) rather
# than through shared memory.
class Learner:
    def __init__(self, n_actors):
        self.env = suite.load(domain_name="walker", task_name="run")
        self.n_actions = self.env.action_spec().shape[0]
        self.obs_size = get_obs(self.env.reset().observation).shape[1]
        self.n_actors = n_actors
        self.burn_in_length = 20  # 40-80
        self.learning_length = 40
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = 5
        self.memory_sequence_size = 5000000
        self.batch_size = 32
        self.memory = LearnerReplayMemory(
            memory_sequence_size=self.memory_sequence_size,
            batch_size=self.batch_size)
        self.model_path = './model_data/'
        self.memory_path = './memory_data/'
        self.actor = ActorNet(self.obs_size, self.n_actions, 0).cuda()
        self.target_actor = deepcopy(self.actor).eval()
        self.critic = CriticNet(self.obs_size, self.n_actions, 0).cuda()
        self.target_critic = deepcopy(self.critic).eval()
        self.model_save_interval = 50  # 50
        self.memory_update_interval = 50  # 50
        self.target_update_interval = 500  # 100
        self.gamma = 0.997
        self.actor_lr = 1e-4
        self.critic_lr = 1e-3
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
        self.actor_criterion = nn.MSELoss()
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.critic_lr)
        self.critic_criterion = nn.MSELoss()
        self.save_model()

    def save_model(self):
        model_dict = {
            'actor': self.actor.state_dict(),
            'target_actor': self.target_actor.state_dict(),
            'critic': self.critic.state_dict(),
            'target_critic': self.target_critic.state_dict()
        }
        torch.save(model_dict, self.model_path + 'model.pt')

    def update_target_model(self):
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

    def run(self):
        # Wait until the replay memory holds enough sequences.
        while self.memory.sequence_counter < self.batch_size * 100:
            for i in range(self.n_actors):
                is_memory = os.path.isfile(self.memory_path + '/memory{}.pt'.format(i))
                if is_memory:
                    self.memory.load(i)
                sleep(0.1)
            print('learner memory sequence size:', self.memory.sequence_counter)

        step = 0
        while True:
            if step % 100 == 0:
                print('learning step:', step)
            start = time()
            step += 1

            (episode_index, sequence_index, obs_seq, action_seq, reward_seq, terminal_seq,
             a_state, ta_state, c_state, tc_state) = self.memory.sample()
            self.actor.set_state(a_state[0], a_state[1])
            self.target_actor.set_state(ta_state[0], ta_state[1])
            self.critic.set_state(c_state[0], c_state[1])
            self.target_critic.set_state(tc_state[0], tc_state[1])

            ### burn-in step ###
            _ = [self.actor(obs) for obs in obs_seq[0:self.burn_in_length]]
            _ = [self.critic(obs, action)
                 for obs, action in zip(obs_seq[0:self.burn_in_length],
                                        action_seq[0:self.burn_in_length])]
            _ = [self.target_actor(obs)
                 for obs in obs_seq[0:self.burn_in_length + self.n_step]]
            _ = [self.target_critic(obs, action)
                 for obs, action in zip(obs_seq[0:self.burn_in_length + self.n_step],
                                        action_seq[0:self.burn_in_length + self.n_step])]

            ### learning steps ###
            # update critic
            q_value = torch.zeros(self.learning_length * self.batch_size,
                                  self.n_actions).cuda()
            target_q_value = torch.zeros(self.learning_length * self.batch_size,
                                         self.n_actions).cuda()
            for i in range(self.learning_length):
                obs_i = self.burn_in_length + i
                next_obs_i = self.burn_in_length + i + self.n_step
                q_value[i * self.batch_size:(i + 1) * self.batch_size] = \
                    self.critic(obs_seq[obs_i], action_seq[obs_i])
                next_q_value = self.target_critic(
                    obs_seq[next_obs_i], self.target_actor(obs_seq[next_obs_i]))
                target_q_val = reward_seq[obs_i] + (self.gamma ** self.n_step) * \
                    (1. - terminal_seq[next_obs_i - 1]) * next_q_value
                # value rescaling (see the sketch after this class)
                target_q_val = invertical_vf(target_q_val)
                target_q_value[i * self.batch_size:(i + 1) * self.batch_size] = target_q_val
            critic_loss = self.critic_criterion(q_value, target_q_value.detach())
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # update actor
            self.actor.reset_state()
            self.critic.reset_state()
            actor_loss = torch.zeros(self.learning_length * self.batch_size,
                                     self.n_actions).cuda()
            for i in range(self.learning_length):
                obs_i = i + self.burn_in_length
                # use the action computed here; calling the recurrent actor twice per step
                # would advance its hidden state twice
                action = self.actor(obs_seq[obs_i])
                actor_loss[i * self.batch_size:(i + 1) * self.batch_size] = \
                    -self.critic(obs_seq[obs_i], action)
            actor_loss = actor_loss.mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # update target networks
            if step % self.target_update_interval == 0:
                self.update_target_model()

            # calc priority
            average_td_loss = np.mean(
                (q_value - target_q_value).detach().cpu().numpy() ** 2., axis=1)
            for i in range(len(episode_index)):
                td = average_td_loss[i:-1:self.batch_size]
                self.memory.priority[episode_index[i]][sequence_index[i]] = calc_priority(td)
                self.memory.total_priority[episode_index[i]] = sum(
                    self.memory.priority[episode_index[i]])

            if step % self.model_save_interval == 0:
                self.save_model()

            if step % self.memory_update_interval == 0:
                for i in range(self.n_actors):
                    is_memory = os.path.isfile(self.memory_path + '/memory{}.pt'.format(i))
                    if is_memory:
                        self.memory.load(i)
                    sleep(0.1)

            self.actor.reset_state()
            self.target_actor.reset_state()
            self.critic.reset_state()
            self.target_critic.reset_state()
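# --- Added sketch (not part of the original source) ---------------------------------
# This learner passes its TD target through invertical_vf(), the project's value-function
# rescaling helper. Assuming it corresponds to the rescaling pair used in R2D2,
# h(x) = sign(x)(sqrt(|x| + 1) - 1) + eps * x, the two directions look like this
# (the actual definition lives elsewhere in the repository and may differ):

def vf_rescale(x: torch.Tensor, eps: float = 1e-2) -> torch.Tensor:
    """h(x): compresses large target magnitudes."""
    return torch.sign(x) * (torch.sqrt(torch.abs(x) + 1.) - 1.) + eps * x


def vf_rescale_inverse(x: torch.Tensor, eps: float = 1e-2) -> torch.Tensor:
    """h^{-1}(x): exact closed-form inverse of vf_rescale."""
    return torch.sign(x) * (
        ((torch.sqrt(1. + 4. * eps * (torch.abs(x) + 1. + eps)) - 1.) / (2. * eps)) ** 2 - 1.)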
# Actor for the shared-memory / shared-queue pipeline: rolls out a (possibly vectorised)
# environment wrapper, collects fixed-length sequences and pushes them to the learner
# through shared_queue, pulling fresh weights from shared_state when flagged.
# A launch sketch for this Learner/Actor pair follows this class.
class Actor:
    def __init__(self, actor_id, config, dev, shared_state, shared_queue, eps):
        # self.env = suite.load(domain_name="walker", task_name="run")
        # self.action_size = self.env.action_spec().shape[0]
        # self.obs_size = get_obs(self.env.reset().observation).shape[1]
        self.env = env_cover(config, dev)
        self.num_env = config['num_envs']
        self.shared_queue = shared_queue
        self.shared_state = shared_state
        self.dev = dev
        self.actor_id = actor_id
        self.burn_in_length = config['burn_in_length']  # 40-80
        self.learning_length = config['learning_length']
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = config['n_step']
        self.sequence = []
        self.recurrent_state = []
        self.priority = []
        self.td_loss = deque(maxlen=self.learning_length)
        # self.memory_sequence_size = 1000
        # self.memory = ReplayMemory(memory_sequence_size=self.memory_sequence_size)
        # self.memory_save_interval = 3
        self.max_frame = config['actor_max_frame']
        self.gamma = config['gamma']
        # self.actor_parameter_update_interval = config['actor_parameter_update_interval']
        self.max_shared_q_size = config['max_shared_q_size']
        self.model_path = './'
        self.memory_path = './'

        self.actor = ActorNet(dev, config).to(self.dev)
        self.target_actor = ActorNet(dev, config).to(self.dev)
        self.critic = CriticNet(dev, config).to(self.dev)
        self.target_critic = CriticNet(dev, config).to(self.dev)
        self.actor.load_state_dict(self.shared_state["actor"].state_dict())
        self.target_actor.load_state_dict(self.shared_state["target_actor"].state_dict())
        self.critic.load_state_dict(self.shared_state["critic"].state_dict())
        self.target_critic.load_state_dict(self.shared_state["target_critic"].state_dict())
        # self.actor.load_state_dict(self.shared_state["actor"])
        # self.target_actor.load_state_dict(self.shared_state["target_actor"])
        # self.critic.load_state_dict(self.shared_state["critic"])
        # self.target_critic.load_state_dict(self.shared_state["target_critic"])

        self.action_argmax = config['action_argmax']
        # self.load_model()
        self.epsilon = eps

    def __del__(self):
        self.env.close()

    def PrePro(self, obs):
        # Observations arrive already preprocessed; kept as a hook.
        return obs
        # return torch.from_numpy(obs).detach().float().reshape((1, self.obs_size)).to(self.dev)

    def save_memory(self):
        model_dict = {
            'sequence': self.sequence,
            'recurrent_state': self.recurrent_state,
            'priority': self.priority,
        }
        torch.save(model_dict, self.memory_path + 'memory.pt')
        # with open('outfile', 'wb') as fp:
        #     pickle.dump(itemlist, fp)
        # with open('outfile', 'rb') as fp:
        #     itemlist = pickle.load(fp)

    def load_model(self):
        if os.path.isfile(self.model_path + 'model.pt'):
            while True:
                try:
                    # TODO: Delete
                    # self.actor = ActorNet(self.obs_size, self.action_size, self.actor_id % 2 + 1).cuda().eval()
                    # self.target_actor = deepcopy(self.actor)
                    # self.critic = CriticNet(self.obs_size, self.action_size, self.actor_id % 2 + 1).cuda().eval()
                    # self.target_critic = deepcopy(self.critic)
                    # model_dict = torch.load(self.model_path + 'model.pt',
                    #                         map_location={'cuda:0': 'cuda:{}'.format(self.actor_id % 2 + 1)})
                    print('waiting model.pt')
                    model_dict = torch.load(self.model_path + 'model.pt')
                    self.actor.load_state_dict(model_dict['actor'])
                    self.target_actor.load_state_dict(model_dict['target_actor'])
                    self.critic.load_state_dict(model_dict['critic'])
                    self.target_critic.load_state_dict(model_dict['target_critic'])
                    self.actor.to(self.dev)
                    self.target_actor.to(self.dev)
                    self.critic.to(self.dev)
                    self.target_critic.to(self.dev)
                except:
                    sleep(np.random.rand() * 5 + 2)
                else:
                    break

    def calc_nstep_reward(self):
        for i in range(len(self.sequence) - self.n_step):
            self.sequence[i][2] = sum([
                self.sequence[i + j][2] * (self.sequence[i + j][3] ** j)
                for j in range(self.n_step)
            ])

    def calc_priorities(self):
        with torch.no_grad():
            self.actor.reset_state()
            self.critic.reset_state()
            self.target_actor.reset_state()
            self.target_critic.reset_state()
            # self.td_loss = deque(maxlen=self.learning_length)
            self.td_loss = []
            self.priority = []
            # This part advances the target networks by n_step before computing TD errors:
            # for i in range(self.n_step):
            #     next_obs = self.sequence[i][0]
            #     next_action = self.target_actor(self.PrePro(next_obs)).to(self.dev)
            #     next_q_value = self.target_critic(self.PrePro(next_obs), next_action)
            # Step through the sequence and estimate Q values.
            # Layout: sequence[t] = [0: state, 1: action, 2: reward, 3: term -> gamma]
            for i in range(len(self.sequence) - self.n_step):
                # Former per-step TD-error / priority computation, kept commented out:
                # obs = self.sequence[i][0]
                # action = self.sequence[i][1]
                # next_obs = self.sequence[i + self.n_step][0]
                # next_action = self.target_actor(self.PrePro(next_obs)).to(self.dev)
                # q_value = self.critic(self.PrePro(obs), action)
                # q_value = q_value.gather(1, action.view(1, -1))
                # reward = self.sequence[i][2]
                # gamma = self.sequence[i + self.n_step - 1][3]
                # next_q_value = self.target_critic(self.PrePro(next_obs), next_action).max(1)[0]
                # if i >= self.burn_in_length:
                #     target_q_value = reward + (gamma ** self.n_step) * next_q_value
                #     # target_q_value = invertical_vf(target_q_value)
                #     self.td_loss.append((q_value - target_q_value) ** 2)
                #     if len(self.td_loss) > self.learning_length:
                #         self.td_loss.pop(0)
                #     if i >= self.sequence_length:
                #         self.priority.append(calc_priority(self.td_loss))
                self.priority.append(torch.Tensor([0]))

    def run(self):
        # sleep(random.random() * 1)
        frame = 0
        # if self.actor_id % 3 == 0:
        win_r = vis.line(Y=torch.Tensor([0]),
                         opts=dict(title='reward' + str(self.epsilon)))
        reward_sum = 0
        while frame < self.max_frame:
            # self.shared_state['frame'][self.actor_id] = frame
            # while self.shared_state['sleep'][self.actor_id]:
            #     sleep(0.5)
            st, rt, dt = self.env.reset()
            self.actor.reset_state()
            self.critic.reset_state()
            self.target_actor.reset_state()
            self.target_critic.reset_state()
            self.sequence = []
            self.recurrent_state = []
            self.priority = []
            self.td_loss.clear()
            # if self.actor_id % 3 == 0:
            win_r = vis.line(X=torch.Tensor([frame]),
                             Y=torch.Tensor([reward_sum]),
                             win=win_r, update='append')
            qmin = 9999
            qmax = -9999
            pmin = 9999
            pmax = -9999
            reward_sum = 0
            count_step = 0
            sleep(0.01)

            while sum(dt) != self.num_env:
                frame += 1
                # get recurrent state
                action = self.actor(self.PrePro(st))
                Qv = self.critic(self.PrePro(st), action)
                qmax = max(qmax, Qv.max())
                qmin = min(qmin, Qv.min())
                pmax = max(pmax, action.max())
                pmin = min(pmin, action.min())
                # noise = torch.normal(mean=torch.zeros([self.num_env, 1]),
                #                      std=torch.ones([self.num_env, 1])).to(self.dev)
                # action = action.detach().item() + np.random.normal(0, self.epsilon, (self.action_size))
                # action = np.clip(action, -1, 1)
                # epsilon-greedy exploration over the discrete action
                action = Qv.argmax().view(1, -1)
                if self.epsilon > random.random():
                    action = torch.LongTensor([random.randint(0, 1)]).view(1, -1)
                # m = torch.distributions.MultivariateNormal(torch.zeros([1, 1]), torch.eye(1))
                # action = action + m.sample().to(self.dev) * self.epsilon
                # action = action.clamp(min=0, max=1)
                # if self.action_argmax:
                #     act = action.argmax(1).cpu().numpy().item()
                # else:
                #     act = action.cpu().numpy()
                # action = (action + noise * self.epsilon).clamp(min=-1, max=1)
                st_1, rt, dt = self.env.step(int(action.item()))
                reward_sum += rt
                count_step += 1
                gamma = torch.ones([self.num_env, 1]).to(self.dev) * self.gamma * (1 - dt)
                # gamma = self.gamma if not dt else 0.
                self.sequence.append([st, action, rt, gamma])
                st = st_1
                # self.recurrent_state.append([torch.cat([actor_hx, actor_cx]),
                #                              torch.cat([target_actor_hx, target_actor_cx]),
                #                              torch.cat([critic_hx, critic_cx]),
                #                              torch.cat([target_critic_hx, target_critic_cx])])

                # if True:
                if self.shared_state["update"][self.actor_id]:
                    self.actor.load_state_dict(self.shared_state["actor"].state_dict())
                    self.target_actor.load_state_dict(self.shared_state["target_actor"].state_dict())
                    self.critic.load_state_dict(self.shared_state["critic"].state_dict())
                    self.target_critic.load_state_dict(self.shared_state["target_critic"].state_dict())
                    self.shared_state["update"][self.actor_id] = False
                    # print('actor_update', self.actor.policy_l0.weight.data[0][0])
                    # self.load_model()

                if len(self.sequence) >= self.sequence_length:
                    # self.sequence.extend([(st, action, 0., 0.) for i in range(self.n_step)])
                    st, rt, dt = self.env.end_dummy()
                    self.sequence.extend([[st, action, rt, dt] for i in range(self.n_step)])
                    # self.calc_nstep_reward()
                    # self.calc_priorities()
                    for i in range(len(self.sequence)):
                        for j in range(4):
                            self.sequence[i][j] = self.sequence[i][j].cpu()
                    # for i in range(len(self.recurrent_state)):
                    #     for j in range(4):
                    #         self.recurrent_state[i][j] = self.recurrent_state[i][j].cpu()
                    for i in range(len(self.priority)):
                        self.priority[i] = self.priority[i].cpu()
                    blocking = self.shared_queue.qsize() > self.max_shared_q_size
                    self.shared_queue.put([self.sequence], block=blocking)

            # if self.actor_id == 0:
            print('\r#', self.actor_id, 'frame:', frame, 'step:', count_step,
                  'reward: {:.3f}'.format(reward_sum.item()),
                  'qmin,max :{:.3f},{:.3f}, pminmax : {:.3f},{:.3f}'.format(qmin, qmax, pmin, pmax),
                  end='\r')
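# --- Added sketch (not part of the original source) ---------------------------------
# One possible way to wire the shared-queue variants of Learner and Actor (the first
# Learner above and the Actor immediately above) together with torch.multiprocessing.
# The helper below and its choices (actor count, per-actor epsilon schedule, CPU device)
# are assumptions, not the repository's launcher; config, ActorNet and CriticNet are the
# project's own objects, and the shared-queue classes are assumed to be the ones in scope.
import torch.multiprocessing as mp


def _actor_worker(actor_id, config, dev, shared_state, shared_queue, eps):
    Actor(actor_id, config, dev, shared_state, shared_queue, eps).run()


def launch_sketch(config, num_actors=4):
    mp.set_start_method('spawn', force=True)
    dev = torch.device('cpu')

    # Networks live in shared memory so the learner's updates are visible to the actors.
    shared_state = {
        "actor": ActorNet(dev, config).share_memory(),
        "target_actor": ActorNet(dev, config).share_memory(),
        "critic": CriticNet(dev, config).share_memory(),
        "target_critic": CriticNet(dev, config).share_memory(),
        "update": mp.Manager().list([False] * num_actors),
    }
    shared_queue = mp.Queue()  # actors push sequences, the learner consumes them

    procs = [mp.Process(target=_actor_worker,
                        args=(i, config, dev, shared_state, shared_queue, 0.4 ** (1 + i)))
             for i in range(num_actors)]
    for p in procs:
        p.start()

    # The shared-queue Learner above stores its first argument as num_actors.
    Learner(num_actors, config, dev, shared_state, shared_queue).run()
    for p in procs:
        p.join()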
# Stand-alone Actor variant for the dm_control walker-run task. It reads the latest
# weights from ./model_data/model.pt and periodically saves collected sequences to disk
# for the file-based Learner above. A sketch of the assumed get_obs() helper follows
# this class.
class Actor:
    def __init__(self, actor_id):
        self.env = suite.load(domain_name="walker", task_name="run")
        self.action_size = self.env.action_spec().shape[0]
        self.obs_size = get_obs(self.env.reset().observation).shape[1]
        self.actor_id = actor_id
        self.burn_in_length = 20  # 40-80
        self.learning_length = 40
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = 5
        self.sequence = []
        self.recurrent_state = []
        self.priority = []
        self.td_loss = deque(maxlen=self.learning_length)
        self.memory_sequence_size = 1000
        self.memory = ReplayMemory(memory_sequence_size=self.memory_sequence_size)
        self.memory_save_interval = 3
        self.gamma = 0.997
        self.actor_parameter_update_interval = 500
        self.model_path = './model_data/'
        self.actor = ActorNet(self.obs_size, self.action_size,
                              cuda_id=self.actor_id % 2 + 1).cuda(self.actor_id % 2 + 1).eval()
        self.target_actor = deepcopy(self.actor)
        self.critic = CriticNet(self.obs_size, self.action_size,
                                cuda_id=self.actor_id % 2 + 1).cuda(self.actor_id % 2 + 1).eval()
        self.target_critic = deepcopy(self.critic)
        self.load_model()
        self.epsilon = 1
        self.last_obs = None

    def load_model(self):
        if os.path.isfile(self.model_path + 'model.pt'):
            while True:
                try:
                    # TODO: Delete
                    self.actor = ActorNet(self.obs_size, self.action_size,
                                          self.actor_id % 2 + 1).cuda().eval()
                    self.target_actor = deepcopy(self.actor)
                    self.critic = CriticNet(self.obs_size, self.action_size,
                                            self.actor_id % 2 + 1).cuda().eval()
                    self.target_critic = deepcopy(self.critic)
                    # model_dict = torch.load(self.model_path + 'model.pt',
                    #                         map_location={'cuda:0': 'cuda:{}'.format(self.actor_id % 2 + 1)})
                    model_dict = torch.load(self.model_path + 'model.pt')
                    self.actor.load_state_dict(model_dict['actor'])
                    self.target_actor.load_state_dict(model_dict['target_actor'])
                    self.critic.load_state_dict(model_dict['critic'])
                    self.target_critic.load_state_dict(model_dict['target_critic'])
                    self.actor.cuda(self.actor_id % 2 + 1)
                    self.target_actor.cuda(self.actor_id % 2 + 1)
                    self.critic.cuda(self.actor_id % 2 + 1)
                    self.target_critic.cuda(self.actor_id % 2 + 1)
                except:
                    sleep(np.random.rand() * 5 + 2)
                else:
                    break

    def calc_nstep_reward(self):
        for i in range(len(self.sequence) - self.n_step):
            self.sequence[i][2][0] = sum([
                self.sequence[i + j][2][0] * (self.gamma ** j)
                for j in range(self.n_step)
            ])

    def calc_priorities(self):
        self.actor.reset_state()
        self.critic.reset_state()
        self.target_actor.reset_state()
        self.target_critic.reset_state()
        self.td_loss = deque(maxlen=self.learning_length)
        self.priority = []
        # Advance the target networks over the first n_step observations.
        for i in range(self.n_step):
            next_obs = torch.from_numpy(self.sequence[i][0]).cuda(self.actor_id % 2 + 1).unsqueeze(0)
            next_action = self.target_actor(next_obs)
            next_q_value = self.target_critic(next_obs, next_action).detach().cpu().numpy()
        for i in range(len(self.sequence) - self.n_step):
            obs = torch.from_numpy(self.sequence[i][0]).cuda(self.actor_id % 2 + 1).unsqueeze(0)
            action = torch.from_numpy(self.sequence[i][1]).cuda(self.actor_id % 2 + 1).unsqueeze(0)
            next_obs = torch.from_numpy(self.sequence[i + self.n_step][0]).cuda(self.actor_id % 2 + 1).unsqueeze(0)
            next_action = self.target_actor(next_obs)
            q_value = self.critic(obs, action).detach().cpu().numpy()
            reward = self.sequence[i][2][0]
            terminal = self.sequence[i + self.n_step - 1][3][0]
            next_q_value = self.target_critic(next_obs, next_action).detach().cpu().numpy()
            if i >= self.burn_in_length:
                target_q_value = reward + (self.gamma ** self.n_step) * (1. - terminal) * next_q_value
                target_q_value = invertical_vf(
                    torch.tensor(target_q_value).cuda(self.actor_id % 2 + 1)).detach().cpu().numpy()
                self.td_loss.append((q_value - target_q_value).mean())
                if i >= self.sequence_length:
                    self.priority.append(
                        calc_priority(np.array(list(self.td_loss), dtype=np.float32) ** 2.))

    def run(self):
        episode = 0
        step = 0
        reward_sum = 0
        while True:
            time_step = self.env.reset()
            obs = get_obs(time_step.observation)
            self.actor.reset_state()
            self.critic.reset_state()
            self.target_actor.reset_state()
            self.target_critic.reset_state()
            self.sequence = []
            self.recurrent_state = []
            self.priority = []
            self.td_loss.clear()
            last_obs = None
            episode_step = 0
            done = False
            if self.actor_id == 0 and episode != 0:
                print('episode:', episode, 'step:', step, 'reward:', reward_sum)
            episode += 1
            reward_sum = 0

            while not time_step.last():
                # Capture the recurrent state before stepping the networks.
                actor_hx, actor_cx = self.actor.get_state()
                target_actor_hx, target_actor_cx = self.target_actor.get_state()
                critic_hx, critic_cx = self.critic.get_state()
                target_critic_hx, target_critic_cx = self.target_critic.get_state()

                action = self.actor(torch.from_numpy(obs).cuda(self.actor_id % 2 + 1))
                target_action = self.target_actor(torch.from_numpy(obs).cuda(self.actor_id % 2 + 1))
                _ = self.critic(torch.from_numpy(obs).cuda(self.actor_id % 2 + 1), action)
                _ = self.target_critic(torch.from_numpy(obs).cuda(self.actor_id % 2 + 1), target_action)

                action = action.detach().cpu().numpy()[0]
                action += np.random.normal(0, 0.3, (self.action_size))
                action = np.clip(action, -1, 1)

                reward = 0.
                sleep(0.01)
                # Repeat the chosen action for 4 environment frames.
                for i in range(4):
                    time_step = self.env.step(action)
                    next_obs = get_obs(time_step.observation)
                    reward += time_step.reward
                    if time_step.last():
                        break
                reward_sum += reward
                step += 1
                episode_step += 1
                terminal = 1. if time_step.last() else 0.
                self.sequence.append((obs[0], action, [reward], [terminal]))
                obs = next_obs.copy()
                self.recurrent_state.append([
                    [actor_hx[0], actor_cx[0]],
                    [target_actor_hx[0], target_actor_cx[0]],
                    [critic_hx[0], critic_cx[0]],
                    [target_critic_hx[0], target_critic_cx[0]],
                ])
                if step % self.actor_parameter_update_interval == 0:
                    self.load_model()

            if len(self.sequence) >= self.sequence_length:
                # Pad with n_step dummy transitions so n-step returns can be computed at the end.
                self.sequence.extend([(np.zeros((self.obs_size), dtype=np.float32),
                                       np.zeros((self.action_size), dtype=np.float32),
                                       [0.], [1.])
                                      for i in range(self.n_step)])
                self.calc_nstep_reward()
                self.calc_priorities()
                self.memory.add(self.sequence, self.recurrent_state, self.priority)
                if len(self.memory.memory) > self.memory_save_interval:
                    self.memory.save(self.actor_id)
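# --- Added sketch (not part of the original source) ---------------------------------
# The walker-run variants above rely on a get_obs() helper defined elsewhere in the
# project. From the way it is used (shape[1] gives obs_size, the result feeds
# torch.from_numpy), it presumably flattens the dm_control observation OrderedDict into
# a single (1, obs_size) float32 array; a minimal stand-in under that assumption:

def get_obs_sketch(observation) -> np.ndarray:
    """Hypothetical stand-in for the project's get_obs()."""
    parts = [np.asarray(v, dtype=np.float32).reshape(-1) for v in observation.values()]
    return np.concatenate(parts).reshape(1, -1)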