Example #1
class Actor:
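    """Distributed actor: rolls out a (possibly vectorized) environment with recurrent
    actor/critic networks, builds fixed-length sequences with n-step rewards and initial
    priorities, and pushes them to a shared replay queue."""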
    def __init__(self, actor_id, config, dev, shared_state, shared_queue, eps):
#        self.env = suite.load(domain_name="walker", task_name="run")
#        self.action_size = self.env.action_spec().shape[0]
#        self.obs_size = get_obs(self.env.reset().observation).shape[1]

        self.env = env_cover(config,dev)
        self.num_env = config['num_envs']
        self.shared_queue = shared_queue
        self.shared_state = shared_state
        self.dev = dev 

        self.actor_id = actor_id
        self.burn_in_length = config['burn_in_length'] # 40-80
        self.learning_length = config['learning_length']
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = config['n_step']
        self.sequence = []
        self.recurrent_state = []
        self.priority = []
        self.td_loss = deque(maxlen=self.learning_length)
#        self.memory_sequence_size = 1000
#        self.memory = ReplayMemory(memory_sequence_size=self.memory_sequence_size)
#        self.memory_save_interval = 3
        self.max_frame = config['actor_max_frame']
        self.gamma = config['gamma']
#        self.actor_parameter_update_interval = config['actor_parameter_update_interval']
        self.max_shared_q_size=config['max_shared_q_size']
        
        self.model_path = './'
        self.memory_path = './'
        
        self.actor = ActorNet(dev,config).to(self.dev)
        self.target_actor = ActorNet(dev,config).to(self.dev)
        self.critic = CriticNet(dev,config).to(self.dev)
        self.target_critic = CriticNet(dev,config).to(self.dev)
        
        self.actor.load_state_dict(self.shared_state["actor"].state_dict())
        self.target_actor.load_state_dict(self.shared_state["target_actor"].state_dict())
        self.critic.load_state_dict(self.shared_state["critic"].state_dict())
        self.target_critic.load_state_dict(self.shared_state["target_critic"].state_dict())

#        self.actor.load_state_dict(self.shared_state["actor"])
#        self.target_actor.load_state_dict(self.shared_state["target_actor"])
#        self.critic.load_state_dict(self.shared_state["critic"])
#        self.target_critic.load_state_dict(self.shared_state["target_critic"])
        self.action_argmax = config['action_argmax']
        
#        self.load_model()
        self.epsilon = eps 
    def __del__(self):
        self.env.close()
        
    def PrePro(self,obs):
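        """Observation preprocessing hook; currently a pass-through."""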
        return obs
#        return torch.from_numpy(obs).detach().float().reshape((1,self.obs_size)).to(self.dev)
   
    def save_memory(self):
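        """Save the collected sequence, recurrent states and priorities to memory.pt."""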
        
        model_dict = {'sequence': self.sequence,
                      'recurrent_state': self.recurrent_state,
                      'priority': self.priority,
                      }
        
        torch.save(model_dict, self.memory_path + 'memory.pt')
    
    
#    with open('outfile', 'wb') as fp:
#    pickle.dump(itemlist, fp)
#    
#    with open ('outfile', 'rb') as fp:
#    itemlist = pickle.load(fp)
    
    
    def load_model(self):
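        """Block until model.pt can be read, then reload all four networks from it."""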
        if os.path.isfile(self.model_path + 'model.pt'):
            while True:
                try:
                    # TODO: Delete
#                    self.actor = ActorNet(self.obs_size, self.action_size, self.actor_id%2+1).cuda().eval()
#                    self.target_actor = deepcopy(self.actor)
#                    self.critic = CriticNet(self.obs_size, self.action_size, self.actor_id%2+1).cuda().eval()
#                    self.target_critic = deepcopy(self.critic)
                    #model_dict = torch.load(self.model_path + 'model.pt', map_location={'cuda:0':'cuda:{}'.format(self.actor_id%2+1)})
                    print('waiting for model.pt')
                    model_dict = torch.load(self.model_path + 'model.pt')
                    self.actor.load_state_dict(model_dict['actor'])
                    self.target_actor.load_state_dict(model_dict['target_actor'])
                    self.critic.load_state_dict(model_dict['critic'])
                    self.target_critic.load_state_dict(model_dict['target_critic'])
                    self.actor.to(self.dev)
                    self.target_actor.to(self.dev)
                    self.critic.to(self.dev)
                    self.target_critic.to(self.dev)
                    
                except Exception:
                    # model.pt may still be mid-write by the learner; retry after a short delay.
                    sleep(np.random.rand() * 5 + 2)
                else:
                    break

    def calc_nstep_reward(self):
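        """Replace each stored reward with its n-step sum, discounted by the stored per-step gamma."""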
        for i in range(len(self.sequence) - self.n_step):
            self.sequence[i][2] = sum(self.sequence[i + j][2] * (self.sequence[i + j][3] ** j)
                                      for j in range(self.n_step))
            

    def calc_priorities(self):
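        """Re-run the networks over the stored sequence to compute TD errors and sequence priorities."""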
        with torch.no_grad():
            self.actor.reset_state()
            self.critic.reset_state()
            self.target_actor.reset_state()
            self.target_critic.reset_state()
#            self.td_loss = deque(maxlen=self.learning_length)
            self.td_loss = []
            self.priority = []
    
            # Warm up the target networks by stepping them through the first n_step observations.
            for i in range(self.n_step):
                next_obs = self.sequence[i][0]
                next_action = self.target_actor(self.PrePro(next_obs)).to(self.dev)
                next_q_value = self.target_critic(self.PrePro(next_obs), next_action)
    
            # Walk the sequence and predict Q-values n steps ahead.
            # sequence[i] = [0: state, 1: action, 2: reward, 3: terminal -> gamma]
            for i in range(len(self.sequence) - self.n_step):
    #            obs = torch.from_numpy(self.sequence[i][0]).unsqueeze(0)
                obs = self.sequence[i][0]
    #            action = self.sequence[i][1].unsqueeze(0)
                next_obs = self.sequence[i + self.n_step][0]

                action = self.sequence[i][1]
#                action = torch.Tensor(self.sequence[i][1]).view(1,-1).to(self.dev)
    #            next_obs = torch.from_numpy(self.sequence[i + self.n_step][0]).unsqueeze(0)
                next_action = self.target_actor(self.PrePro(next_obs)).to(self.dev)
    
                q_value = self.critic(self.PrePro(obs), action)
                reward = self.sequence[i][2]
                gamma = self.sequence[i + self.n_step - 1][3]
                next_q_value = self.target_critic(self.PrePro(next_obs), next_action)
                
                if i >= self.burn_in_length:
                    target_q_value = reward + (gamma ** self.n_step) * next_q_value
#                    target_q_value = invertical_vf(target_q_value)
                    self.td_loss.append((q_value - target_q_value) ** 2)
                    if len(self.td_loss) > self.learning_length:
                        self.td_loss.pop(0)

                if i >= self.sequence_length:
                    self.priority.append(calc_priority(self.td_loss))
            
            
            
    
    def run(self):
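        """Collect experience for up to actor_max_frame steps, syncing weights from
        shared_state when flagged and pushing finished sequences to the shared queue."""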
#        sleep(random.random()*1)
        frame = 0
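        # Actors with actor_id % 3 == 0 plot episode reward via a visdom instance `vis`
        # (assumed to exist at module level).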
        if self.actor_id%3 == 0:
            win_r = vis.line(Y=torch.Tensor([0]), opts=dict(title ='reward'+str(self.epsilon)))
        reward_sum = 0
        
        
        while frame < self.max_frame:
#            self.shared_state['frame'][self.actor_id]=frame
#            while self.shared_state['sleep'][self.actor_id] :
#                sleep(0.5)
            
            st, rt, dt  = self.env.reset()
            
            self.actor.reset_state()
            self.critic.reset_state()
            self.target_actor.reset_state()
            self.target_critic.reset_state()
            
            self.sequence = []
            self.recurrent_state = []
            self.priority = []
            
            self.td_loss.clear()
            if self.actor_id%3 == 0:
                win_r = vis.line(X=torch.Tensor([frame]), Y=torch.Tensor([reward_sum]), win= win_r , update ='append')
            
            reward_sum = 0
            count_step = 0     
            sleep(0.01)
            # Step until every vectorized sub-environment has reported done.
            while sum(dt) != self.num_env:
                
                frame+=1
                # get recurrent state
                actor_hx, actor_cx = self.actor.get_state()
                target_actor_hx, target_actor_cx = self.target_actor.get_state()
                critic_hx, critic_cx = self.critic.get_state()
                target_critic_hx, target_critic_cx = self.target_critic.get_state()
                
                action = self.actor(self.PrePro(st))
                target_action = self.target_actor(self.PrePro(st))
                _ = self.critic(self.PrePro(st), action)
                _ = self.target_critic(self.PrePro(st), target_action)

                noise = torch.normal(mean=torch.zeros([self.num_env,1]),std=torch.ones([self.num_env,1])).to(self.dev)
#                action = action.detach().item() +  np.random.normal(0, self.epsilon, (self.action_size))
#                action = np.clip(action, -1, 1)
                
                if self.action_argmax:
                    # Discrete control: take the greedy action index.
                    act = action.argmax(1).cpu().numpy().item()
                else:
                    # Continuous control: use the raw policy output (keeps `act` defined below).
                    action = action.cpu().numpy()
                    act = action
                if random.random() > self.epsilon:
                    act = random.randint(0, 1)
#                action = (action+noise*self.epsilon).clamp(min=-1,max=1)

                st_1, rt, dt = self.env.step(act)
    
                reward_sum += rt
                count_step += 1
                # Per-env discount: gamma where the env is still running, 0 where it is done.
                gamma = torch.ones([self.num_env, 1]).to(self.dev) * self.gamma * (1 - dt)
#                gamma = self.gamma if not dt else 0.
                self.sequence.append([st, action, rt, gamma])
                st = st_1

                self.recurrent_state.append([torch.cat([actor_hx, actor_cx]), torch.cat([target_actor_hx, target_actor_cx]), 
                                                torch.cat([critic_hx, critic_cx]), torch.cat([target_critic_hx, target_critic_cx])])

#                if True:
                if self.shared_state["update"][self.actor_id]:
                    
                    
                    self.actor.load_state_dict(self.shared_state["actor"].state_dict())
                    self.target_actor.load_state_dict(self.shared_state["target_actor"].state_dict())
                    self.critic.load_state_dict(self.shared_state["critic"].state_dict())
                    self.target_critic.load_state_dict(self.shared_state["target_critic"].state_dict())
                    self.shared_state["update"][self.actor_id]=False
                    print('actor_update',self.actor.policy_l0.weight.data[0][0])
#                    self.load_model()


            # Pad the finished episode with n_step dummy transitions so n-step targets are defined at the end.
            if len(self.sequence) >= self.sequence_length:
                #self.sequence.extend([(st, action, 0., 0.) for i in range(self.n_step)])
                st, rt, dt = self.env.end_dummy()
                self.sequence.extend([[st,action, rt, dt] for i in range(self.n_step)])

                self.calc_nstep_reward()
                self.calc_priorities()
                
                
                # Move all tensors to CPU before sending them through the inter-process queue.
                for i in range(len(self.sequence)):
                    for j in range(4):
                        self.sequence[i][j] = self.sequence[i][j].cpu()
                for i in range(len(self.recurrent_state)):
                    for j in range(4):
                        self.recurrent_state[i][j] = self.recurrent_state[i][j].cpu()
                for i in range(len(self.priority)):
                    self.priority[i] = self.priority[i].cpu()
                # Only block on put() when the queue is already over its size limit.
                blocking = self.shared_queue.qsize() > self.max_shared_q_size
                self.shared_queue.put([self.sequence, self.recurrent_state, self.priority], block=blocking)
                
#            if self.actor_id == 0:
            print('#',self.actor_id,'frame:', frame,'step:', count_step, 'reward:', reward_sum)
Example #2
class Actor:
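    """Actor for the dm_control walker-run task: collects action-repeated transitions with
    recurrent actor/critic networks, computes n-step rewards and priorities, and stores the
    sequences in a local ReplayMemory that is periodically saved to disk."""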
    def __init__(self, actor_id):
        self.env = suite.load(domain_name="walker", task_name="run")
        self.action_size = self.env.action_spec().shape[0]
        self.obs_size = get_obs(self.env.reset().observation).shape[1]

        self.actor_id = actor_id
        self.burn_in_length = 20  # 40-80
        self.learning_length = 40
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = 5
        self.sequence = []
        self.recurrent_state = []
        self.priority = []
        self.td_loss = deque(maxlen=self.learning_length)
        self.memory_sequence_size = 1000
        self.memory = ReplayMemory(
            memory_sequence_size=self.memory_sequence_size)
        self.memory_save_interval = 3

        self.gamma = 0.997
        self.actor_parameter_update_interval = 500
        self.model_path = './model_data/'
        self.actor = ActorNet(self.obs_size,
                              self.action_size,
                              cuda_id=self.actor_id % 2 +
                              1).cuda(self.actor_id % 2 + 1).eval()
        self.target_actor = deepcopy(self.actor)
        self.critic = CriticNet(self.obs_size,
                                self.action_size,
                                cuda_id=self.actor_id % 2 +
                                1).cuda(self.actor_id % 2 + 1).eval()
        self.target_critic = deepcopy(self.critic)
        self.load_model()
        self.epsilon = 1
        self.last_obs = None

    def load_model(self):
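        """Block until model.pt can be read, then rebuild and reload all four networks."""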
        if os.path.isfile(self.model_path + 'model.pt'):
            while True:
                try:
                    # TODO: Delete
                    self.actor = ActorNet(self.obs_size, self.action_size,
                                          self.actor_id % 2 + 1).cuda().eval()
                    self.target_actor = deepcopy(self.actor)
                    self.critic = CriticNet(self.obs_size, self.action_size,
                                            self.actor_id % 2 +
                                            1).cuda().eval()
                    self.target_critic = deepcopy(self.critic)
                    #model_dict = torch.load(self.model_path + 'model.pt', map_location={'cuda:0':'cuda:{}'.format(self.actor_id%2+1)})
                    model_dict = torch.load(self.model_path + 'model.pt')
                    self.actor.load_state_dict(model_dict['actor'])
                    self.target_actor.load_state_dict(
                        model_dict['target_actor'])
                    self.critic.load_state_dict(model_dict['critic'])
                    self.target_critic.load_state_dict(
                        model_dict['target_critic'])
                    self.actor.cuda(self.actor_id % 2 + 1)
                    self.target_actor.cuda(self.actor_id % 2 + 1)
                    self.critic.cuda(self.actor_id % 2 + 1)
                    self.target_critic.cuda(self.actor_id % 2 + 1)
                except Exception:
                    # model.pt may still be mid-write by the learner; retry after a short delay.
                    sleep(np.random.rand() * 5 + 2)
                else:
                    break

    def calc_nstep_reward(self):
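        """Replace each stored reward with its n-step discounted sum (weights gamma**j)."""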
        for i in range(len(self.sequence) - self.n_step):
            self.sequence[i][2][0] = sum([
                self.sequence[i + j][2][0] * (self.gamma**j)
                for j in range(self.n_step)
            ])

    def calc_priorities(self):
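        """Re-run the networks over the stored sequence to compute TD errors and sequence priorities."""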
        self.actor.reset_state()
        self.critic.reset_state()
        self.target_actor.reset_state()
        self.target_critic.reset_state()
        self.td_loss = deque(maxlen=self.learning_length)
        self.priority = []

        # Warm up the target networks on the first n_step observations.
        for i in range(self.n_step):
            next_obs = torch.from_numpy(
                self.sequence[i][0]).cuda(self.actor_id % 2 + 1).unsqueeze(0)
            next_action = self.target_actor(next_obs)
            next_q_value = self.target_critic(
                next_obs, next_action).detach().cpu().numpy()

        for i in range(len(self.sequence) - self.n_step):
            obs = torch.from_numpy(
                self.sequence[i][0]).cuda(self.actor_id % 2 + 1).unsqueeze(0)
            action = torch.from_numpy(
                self.sequence[i][1]).cuda(self.actor_id % 2 + 1).unsqueeze(0)
            next_obs = torch.from_numpy(
                self.sequence[i + self.n_step][0]).cuda(self.actor_id % 2 +
                                                        1).unsqueeze(0)
            next_action = self.target_actor(next_obs)

            q_value = self.critic(obs, action).detach().cpu().numpy()
            reward = self.sequence[i][2][0]
            terminal = self.sequence[i + self.n_step - 1][3][0]
            next_q_value = self.target_critic(
                next_obs, next_action).detach().cpu().numpy()

            if i >= self.burn_in_length:
                target_q_value = (reward + (self.gamma**self.n_step) *
                                  (1. - terminal) * next_q_value)
                target_q_value = invertical_vf(
                    torch.tensor(target_q_value).cuda(
                        self.actor_id % 2 + 1)).detach().cpu().numpy()
                self.td_loss.append((q_value - target_q_value).mean())
            if i >= self.sequence_length:
                self.priority.append(
                    calc_priority(
                        np.array(list(self.td_loss), dtype=np.float32)**2.))

    def run(self):
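        """Run episodes indefinitely, refreshing weights every actor_parameter_update_interval
        steps and adding finished sequences to the local replay memory."""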
        episode = 0
        step = 0
        reward_sum = 0

        while True:
            time_step = self.env.reset()
            obs = get_obs(time_step.observation)
            self.actor.reset_state()
            self.critic.reset_state()
            self.target_actor.reset_state()
            self.target_critic.reset_state()
            self.sequence = []
            self.recurrent_state = []
            self.priority = []
            self.td_loss.clear()
            last_obs = None
            episode_step = 0
            done = False
            if self.actor_id == 0 and episode != 0:
                print('episode:', episode, 'step:', step, 'reward:',
                      reward_sum)
            episode += 1
            reward_sum = 0

            while not time_step.last():

                # get recurrent state
                actor_hx, actor_cx = self.actor.get_state()
                target_actor_hx, target_actor_cx = self.target_actor.get_state()
                critic_hx, critic_cx = self.critic.get_state()
                target_critic_hx, target_critic_cx = self.target_critic.get_state()

                action = self.actor(
                    torch.from_numpy(obs).cuda(self.actor_id % 2 + 1))
                target_action = self.target_actor(
                    torch.from_numpy(obs).cuda(self.actor_id % 2 + 1))
                _ = self.critic(
                    torch.from_numpy(obs).cuda(self.actor_id % 2 + 1), action)
                _ = self.target_critic(
                    torch.from_numpy(obs).cuda(self.actor_id % 2 + 1),
                    target_action)

                action = action.detach().cpu().numpy()[0]
                action += np.random.normal(0, 0.3, (self.action_size))
                action = np.clip(action, -1, 1)

                reward = 0.
                sleep(0.01)
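                # Repeat the action for up to 4 environment steps (frame skip), accumulating reward.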
                for i in range(4):
                    time_step = self.env.step(action)
                    next_obs = get_obs(time_step.observation)
                    reward += time_step.reward
                    if time_step.last():
                        break

                reward_sum += reward
                step += 1
                episode_step += 1
                terminal = 1. if time_step.last() else 0.
                self.sequence.append((obs[0], action, [reward], [terminal]))
                obs = next_obs.copy()

                self.recurrent_state.append(
                    [[actor_hx[0], actor_cx[0]],
                     [target_actor_hx[0], target_actor_cx[0]],
                     [critic_hx[0], critic_cx[0]],
                     [target_critic_hx[0], target_critic_cx[0]]])

                # Periodically refresh the local networks from the saved model file.
                if step % self.actor_parameter_update_interval == 0:
                    self.load_model()

            # Pad with n_step terminal dummy transitions so n-step returns are defined at the end.
            if len(self.sequence) >= self.sequence_length:
                self.sequence.extend([(np.zeros((self.obs_size),
                                                dtype=np.float32),
                                       np.zeros((self.action_size),
                                                dtype=np.float32), [0.], [1.])
                                      for i in range(self.n_step)])
                self.calc_nstep_reward()
                self.calc_priorities()
                self.memory.add(self.sequence, self.recurrent_state,
                                self.priority)

            if len(self.memory.memory) > self.memory_save_interval:
                self.memory.save(self.actor_id)