Example #1
class Learner:
    def __init__(self, learner_id, config, dev, shared_state, shared_queue):

        self.action_size = config['action_space']
        self.obs_size = config['obs_space']

        self.shared_queue = shared_queue
        self.shared_state = shared_state

        self.dev = dev
        self.id = learner_id
        self.burn_in_length = config['burn_in_length']  # 40-80
        self.learning_length = config['learning_length']
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = config['n_step']
        self.sequence = []
        self.recurrent_state = []
        self.priority = []
        self.td_loss = deque(maxlen=self.learning_length)

        self.gamma = config['gamma']
        #        self.actor_parameter_update_interval = config['actor_parameter_update_interval']

        self.actor = ActorNet(dev, config).to(self.dev)
        self.target_actor = ActorNet(dev, config).to(self.dev)
        self.critic = CriticNet(dev, config).to(self.dev)
        self.target_critic = CriticNet(dev, config).to(self.dev)

        self.actor.load_state_dict(self.shared_state["actor"].state_dict())
        self.target_actor.load_state_dict(
            self.shared_state["target_actor"].state_dict())
        self.critic.load_state_dict(self.shared_state["critic"].state_dict())
        self.target_critic.load_state_dict(
            self.shared_state["target_critic"].state_dict())

        #        self.actor.load_state_dict(self.shared_state["actor"])
        #        self.target_actor.load_state_dict(self.shared_state["target_actor"])
        #        self.critic.load_state_dict(self.shared_state["critic"])
        #        self.target_critic.load_state_dict(self.shared_state["target_critic"])

        self.learner_actor_rate = config['learner_actor_rate']

        self.num_actors = learner_id
        self.n_actions = 1
        self.max_frame = config['learner_max_frame']

        self.memory_sequence_size = config['memory_sequence_size']
        self.batch_size = config['batch_size']
        self.memory = LearnerReplayMemory(self.memory_sequence_size, config,
                                          dev)

        self.model_path = './'
        #        self.memory_path = './memory_data/'
        #        self.model_save_interval = 10 # 50
        self.learner_parameter_update_interval = config[
            'learner_parameter_update_interval']  # 50
        self.target_update_interval = config['target_update_interval']  # 100

        self.gamma = config['gamma']
        self.actor_lr = config['actor_lr']
        self.critic_lr = config['critic_lr']
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.actor_lr)
        self.actor_criterion = nn.MSELoss()
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=self.critic_lr)
        self.critic_criterion = nn.MSELoss()

    def __del__(self):
        self.shared_queue.close()
        self.shared_state.close()

#        self.save_model()

    def save_model(self):
        model_dict = {
            'actor': self.actor.state_dict(),
            'target_actor': self.target_actor.state_dict(),
            'critic': self.critic.state_dict(),
            'target_critic': self.target_critic.state_dict()
        }
        torch.save(model_dict, self.model_path + 'model.pt')

    def update_target_model(self):
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

    def run(self):
        time_check(-1)
        while self.memory.size() < self.batch_size:
            self.memory.append(self.shared_queue.get(block=True))
            #            self.memory.append(self.shared_queue.get())
            print('\rmem size: ', self.memory.size(), end='\r')
        time_check(1)
        count_mem = 0
        frame = 0
        win_v = vis.line(Y=torch.Tensor([0]), opts=dict(title='V_loss'))
        win_p = vis.line(Y=torch.Tensor([0]), opts=dict(title='P_loss'))

        while frame < self.max_frame:
            #            sleep(0.0001)
            #            if self.shared_queue.qsize()==0 and count_mem <0:
            #                self.memory.append(self.shared_queue.get(block=True))
            #
            #            for i in range(self.shared_queue.qsize()):
            #                self.memory.append(self.shared_queue.get(block=False))
            #                count_mem += self.learner_actor_rate

            #            print('waiting  shared q {}/{}'.format(self.memory.size(),self.batch_size))

            #            self.shared_state['frame'][self.id]=frame
            #            while self.shared_state['sleep'][self.id] :
            #                sleep(0.5)
            #            if self.shared_queue.qsize()==0 and count_mem <0:
            #                self.memory.append(self.shared_queue.get(block=True))
            #                self.memory.append(self.shared_queue.get())

            #            for i in range(self.shared_queue.qsize()):
            ##                global_buf.append(self.shared_queue.get())
            #                self.memory.append(self.shared_queue.get())
            #                count_mem += self.learner_actor_rate

            if self.shared_queue.qsize() != 0:
                self.memory.append(self.shared_queue.get(block=True))

            frame += 1

            count_mem -= 1

            episode_index, sequence_index, obs_seq, action_seq, reward_seq, gamma_seq, a_state, ta_state, c_state, tc_state = self.memory.sample(
            )

            self.actor.set_state(a_state[0], a_state[1])
            self.target_actor.set_state(ta_state[0], ta_state[1])
            self.critic.set_state(c_state[0], c_state[1])
            self.target_critic.set_state(tc_state[0], tc_state[1])

            ### burn-in step ###
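            # Burn-in: unroll all four recurrent networks over the stored prefix so
            # their hidden states are warmed up before any loss is computed; the
            # target networks are advanced n_step extra frames to line up with the
            # n-step bootstrap targets below.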
            _ = [self.actor(obs_seq[i]) for i in range(self.burn_in_length)]
            _ = [
                self.critic(obs_seq[i], action_seq[i])
                for i in range(self.burn_in_length)
            ]
            _ = [
                self.target_actor(obs_seq[i])
                for i in range(self.burn_in_length + self.n_step)
            ]
            _ = [
                self.target_critic(obs_seq[i], action_seq[i])
                for i in range(self.burn_in_length + self.n_step)
            ]
            ### learning steps ###

            # update critic
            q_value = torch.zeros(self.learning_length * self.batch_size,
                                  self.n_actions).to(self.dev)

            target_q_value = torch.zeros(
                self.learning_length * self.batch_size,
                self.n_actions).to(self.dev)
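            # n-step TD targets: bootstrap from the target critic n_step frames
            # ahead, discounted by the stored per-step gamma raised to n_step.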
            for i in range(self.learning_length):
                obs_i = self.burn_in_length + i
                next_obs_i = self.burn_in_length + i + self.n_step
                q_value[i * self.batch_size:(i + 1) *
                        self.batch_size] = self.critic(obs_seq[obs_i],
                                                       action_seq[obs_i])
                with torch.no_grad():
                    next_q_value = self.target_critic(
                        obs_seq[next_obs_i],
                        self.target_actor(obs_seq[next_obs_i]))
                    target_q_val = reward_seq[obs_i] + (
                        gamma_seq[next_obs_i]**self.n_step) * next_q_value
                    #                target_q_val = invertical_vf(target_q_val)
                    target_q_value[i * self.batch_size:(i + 1) *
                                   self.batch_size] = target_q_val

            critic_loss = self.critic_criterion(q_value,
                                                target_q_value.detach())
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # update actor
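            # Deterministic policy gradient: train the actor to maximise the
            # critic's Q estimate by minimising its negation.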
            self.actor.reset_state()
            self.critic.reset_state()
            actor_loss = torch.zeros(self.learning_length * self.batch_size,
                                     self.n_actions).to(self.dev)
            for i in range(self.learning_length):
                obs_i = i + self.burn_in_length
                action = self.actor(obs_seq[obs_i])
                actor_loss[i * self.batch_size:(i + 1) *
                           self.batch_size] = -self.critic(
                               obs_seq[obs_i], action)
            actor_loss = actor_loss.mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # update target networks
            if frame % self.target_update_interval == 0:
                self.update_target_model()

            print('#', frame, 'critic_loss:', critic_loss.item(),
                  '  actor_loss:', actor_loss.item(), '  count:', count_mem)
            win_p = vis.line(X=torch.Tensor([frame]),
                             Y=torch.Tensor([actor_loss.item()]),
                             win=win_p,
                             update='append')
            win_v = vis.line(X=torch.Tensor([frame]),
                             Y=torch.Tensor([critic_loss.item()]),
                             win=win_v,
                             update='append')

            # calc priority
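            # Recompute priorities for the sampled sequences: gather each sequence's
            # squared TD errors (strided by batch_size), reduce them to a scalar
            # priority, and refresh the episode's total priority.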
            average_td_loss = ((q_value - target_q_value)**2).detach().to(
                self.dev)

            #            average_td_loss = np.mean(((q_value - target_q_value)**2).detach().cpu().numpy() , axis = 1)
            for i in range(len(episode_index)):
                td = average_td_loss[i:-1:self.batch_size]
                self.memory.priority[episode_index[i]][
                    sequence_index[i]] = calc_priority(td).cpu().view(1, -1)
                self.memory.total_priority[episode_index[i]] = torch.cat(
                    self.memory.priority[episode_index[i]]).sum(0).view(1, -1)


#                self.memory.priority[episode_index[i]][sequence_index[i]] = calc_priority(td)
#                self.memory.total_priority[episode_index[i]] = sum(self.memory.priority[episode_index[i]])

#            if frame % self.model_save_interval == 0:
#                self.save_model()

            if frame % self.learner_parameter_update_interval == 0:
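                # Copy the learner's current weights into the shared models and set
                # each actor's update flag so it reloads them on its next step.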
                #                print('learner update ')

                #                [self.shared_state["actor"][k] = v.cpu() for k,v in self.actor.state_dict().item() ]
                #                [self.shared_state["target_actor"][k] = v.cpu() for k,v in self.target_actor.state_dict().item() ]
                #                [self.shared_state["critic"][k] = v.cpu() for k,v in self.critic.state_dict().item() ]
                #                [self.shared_state["target_critic"][k] = v.cpu() for k,v in self.target_critic.state_dict().item() ]

                #
                #                for k,v in self.actor.state_dict().items():
                #                    self.shared_state["actor"][k] = v.cpu()
                #                for k,v in self.target_actor.state_dict().items():
                #                    self.shared_state["target_actor"][k] = v.cpu()
                #                for k,v in self.critic.state_dict().items():
                #                    self.shared_state["critic"][k] = v.cpu()
                #                for k,v in self.target_critic.state_dict().items():
                #                    self.shared_state["target_critic"][k] = v.cpu()

                #                self.shared_state["actor"] = self.actor.state_dict()
                #                self.shared_state["target_actor"] = self.target_actor.state_dict()
                #                self.shared_state["critic"] = self.critic.state_dict()
                #                self.shared_state["target_critic"] = self.target_critic.state_dict()

                self.shared_state["actor"].load_state_dict(
                    self.actor.state_dict())
                self.shared_state["critic"].load_state_dict(
                    self.critic.state_dict())
                self.shared_state["target_actor"].load_state_dict(
                    self.target_actor.state_dict())
                self.shared_state["target_critic"].load_state_dict(
                    self.target_critic.state_dict())
                for i in range(self.num_actors):
                    self.shared_state["update"][i] = True

                print('learner_update', self.actor.policy_l0.weight.data[0][0])

            self.actor.reset_state()
            self.target_actor.reset_state()
            self.critic.reset_state()
            self.target_critic.reset_state()
Example #2
class Learner:
    def __init__(self, n_actors):
        self.env = suite.load(domain_name="walker", task_name="run")
        self.n_actions = self.env.action_spec().shape[0]
        self.obs_size = get_obs(self.env.reset().observation).shape[1]

        self.n_actors = n_actors
        self.burn_in_length = 20  # 40-80
        self.learning_length = 40
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = 5
        self.memory_sequence_size = 5000000
        self.batch_size = 32
        self.memory = LearnerReplayMemory(
            memory_sequence_size=self.memory_sequence_size,
            batch_size=self.batch_size)

        self.model_path = './model_data/'
        self.memory_path = './memory_data/'
        self.actor = ActorNet(self.obs_size, self.n_actions, 0).cuda()
        self.target_actor = deepcopy(self.actor).eval()
        self.critic = CriticNet(self.obs_size, self.n_actions, 0).cuda()
        self.target_critic = deepcopy(self.critic).eval()
        self.model_save_interval = 50  # 50
        self.memory_update_interval = 50  # 50
        self.target_update_interval = 500  # 100

        self.gamma = 0.997
        self.actor_lr = 1e-4
        self.critic_lr = 1e-3
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.actor_lr)
        self.actor_criterion = nn.MSELoss()
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=self.critic_lr)
        self.critic_criterion = nn.MSELoss()
        self.save_model()

    def save_model(self):
        model_dict = {
            'actor': self.actor.state_dict(),
            'target_actor': self.target_actor.state_dict(),
            'critic': self.critic.state_dict(),
            'target_critic': self.target_critic.state_dict()
        }
        torch.save(model_dict, self.model_path + 'model.pt')

    def update_target_model(self):
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

    def run(self):
        # Wait until enough sequences have been loaded before learning starts.
        while self.memory.sequence_counter < self.batch_size * 100:
            for i in range(self.n_actors):
                is_memory = os.path.isfile(self.memory_path +
                                           '/memory{}.pt'.format(i))
                if is_memory:
                    self.memory.load(i)
                sleep(0.1)
            print('learner memory sequence size:',
                  self.memory.sequence_counter)

        step = 0
        while True:
            if step % 100 == 0:
                print('learning step:', step)
            start = time()
            step += 1

            episode_index, sequence_index, obs_seq, action_seq, reward_seq, terminal_seq, a_state, ta_state, c_state, tc_state = self.memory.sample(
            )

            self.actor.set_state(a_state[0], a_state[1])
            self.target_actor.set_state(ta_state[0], ta_state[1])
            self.critic.set_state(c_state[0], c_state[1])
            self.target_critic.set_state(tc_state[0], tc_state[1])

            ### burn-in step ###
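            # Burn-in: warm up the recurrent state of every network on the stored
            # prefix (the target networks run n_step frames further, matching the
            # n-step bootstrap used below).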
            _ = [self.actor(obs) for obs in obs_seq[0:self.burn_in_length]]
            _ = [
                self.critic(obs, action)
                for obs, action in zip(obs_seq[0:self.burn_in_length],
                                       action_seq[0:self.burn_in_length])
            ]
            _ = [
                self.target_actor(obs)
                for obs in obs_seq[0:self.burn_in_length + self.n_step]
            ]
            _ = [
                self.target_critic(obs, action) for obs, action in zip(
                    obs_seq[0:self.burn_in_length +
                            self.n_step], action_seq[0:self.burn_in_length +
                                                     self.n_step])
            ]

            ### learning steps ###

            # update critic
            q_value = torch.zeros(self.learning_length * self.batch_size,
                                  self.n_actions).cuda()
            target_q_value = torch.zeros(
                self.learning_length * self.batch_size, self.n_actions).cuda()
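            # n-step TD targets: bootstrap from the target critic n_step frames
            # ahead, zero the bootstrap term once the sequence has terminated, and
            # rescale the target with invertical_vf.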
            for i in range(self.learning_length):
                obs_i = self.burn_in_length + i
                next_obs_i = self.burn_in_length + i + self.n_step
                q_value[i * self.batch_size:(i + 1) *
                        self.batch_size] = self.critic(obs_seq[obs_i],
                                                       action_seq[obs_i])
                next_q_value = self.target_critic(
                    obs_seq[next_obs_i],
                    self.target_actor(obs_seq[next_obs_i]))
                target_q_val = reward_seq[obs_i] + (
                    self.gamma**self.n_step) * (
                        1. - terminal_seq[next_obs_i - 1]) * next_q_value
                target_q_val = invertical_vf(target_q_val)
                target_q_value[i * self.batch_size:(i + 1) *
                               self.batch_size] = target_q_val

            critic_loss = self.critic_criterion(q_value,
                                                 target_q_value.detach())
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # update actor
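            # Actor update (deterministic policy gradient): minimise -Q(s, pi(s)).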
            self.actor.reset_state()
            self.critic.reset_state()
            actor_loss = torch.zeros(self.learning_length * self.batch_size,
                                     self.n_actions).cuda()
            for i in range(self.learning_length):
                obs_i = i + self.burn_in_length
                action = self.actor(obs_seq[obs_i])
                actor_loss[i * self.batch_size:(i + 1) *
                           self.batch_size] = -self.critic(
                               obs_seq[obs_i], action)
            actor_loss = actor_loss.mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # update target networks
            if step % self.target_update_interval == 0:
                self.update_target_model()

            # calc priority
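            # Refresh the priorities of the sampled sequences from their squared
            # TD errors and update each episode's total priority.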
            average_td_loss = np.mean(
                (q_value - target_q_value).detach().cpu().numpy()**2., axis=1)
            for i in range(len(episode_index)):
                td = average_td_loss[i:-1:self.batch_size]
                self.memory.priority[episode_index[i]][
                    sequence_index[i]] = calc_priority(td)
                self.memory.total_priority[episode_index[i]] = sum(
                    self.memory.priority[episode_index[i]])

            if step % self.model_save_interval == 0:
                self.save_model()

            if step % self.memory_update_interval == 0:
                for i in range(self.n_actors):
                    is_memory = os.path.isfile(self.memory_path +
                                               '/memory{}.pt'.format(i))
                    if is_memory:
                        self.memory.load(i)
                    sleep(0.1)

            self.actor.reset_state()
            self.target_actor.reset_state()
            self.critic.reset_state()
            self.target_critic.reset_state()
Example #3
class Actor:
    def __init__(self, actor_id, config, dev, shared_state, shared_queue, eps):
        #        self.env = suite.load(domain_name="walker", task_name="run")
        #        self.action_size = self.env.action_spec().shape[0]
        #        self.obs_size = get_obs(self.env.reset().observation).shape[1]

        self.env = env_cover(config, dev)
        self.num_env = config['num_envs']
        self.shared_queue = shared_queue
        self.shared_state = shared_state
        self.dev = dev

        self.actor_id = actor_id
        self.burn_in_length = config['burn_in_length']  # 40-80
        self.learning_length = config['learning_length']
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = config['n_step']
        self.sequence = []
        self.recurrent_state = []
        self.priority = []
        self.td_loss = deque(maxlen=self.learning_length)
        #        self.memory_sequence_size = 1000
        #        self.memory = ReplayMemory(memory_sequence_size=self.memory_sequence_size)
        #        self.memory_save_interval = 3
        self.max_frame = config['actor_max_frame']
        self.gamma = config['gamma']
        #        self.actor_parameter_update_interval = config['actor_parameter_update_interval']
        self.max_shared_q_size = config['max_shared_q_size']

        self.model_path = './'
        self.memory_path = './'

        self.actor = ActorNet(dev, config).to(self.dev)
        self.target_actor = ActorNet(dev, config).to(self.dev)
        self.critic = CriticNet(dev, config).to(self.dev)
        self.target_critic = CriticNet(dev, config).to(self.dev)

        self.actor.load_state_dict(self.shared_state["actor"].state_dict())
        self.target_actor.load_state_dict(
            self.shared_state["target_actor"].state_dict())
        self.critic.load_state_dict(self.shared_state["critic"].state_dict())
        self.target_critic.load_state_dict(
            self.shared_state["target_critic"].state_dict())

        #        self.actor.load_state_dict(self.shared_state["actor"])
        #        self.target_actor.load_state_dict(self.shared_state["target_actor"])
        #        self.critic.load_state_dict(self.shared_state["critic"])
        #        self.target_critic.load_state_dict(self.shared_state["target_critic"])
        self.action_argmax = config['action_argmax']

        #        self.load_model()
        self.epsilon = eps

    def __del__(self):
        self.env.close()

    def PrePro(self, obs):
        return obs
#        return torch.from_numpy(obs).detach().float().reshape((1,self.obs_size)).to(self.dev)

    def save_memory(self):

        model_dict = {
            'sequence': self.sequence,
            'recurrent_state': self.recurrent_state,
            'priority': self.priority,
        }

        torch.save(model_dict, self.memory_path + 'memory.pt')

#    with open('outfile', 'wb') as fp:
#    pickle.dump(itemlist, fp)
#
#    with open ('outfile', 'rb') as fp:
#    itemlist = pickle.load(fp)

    def load_model(self):
        if os.path.isfile(self.model_path + 'model.pt'):
            while True:
                try:
                    # TODO: Delete
                    #                    self.actor = ActorNet(self.obs_size, self.action_size, self.actor_id%2+1).cuda().eval()
                    #                    self.target_actor = deepcopy(self.actor)
                    #                    self.critic = CriticNet(self.obs_size, self.action_size, self.actor_id%2+1).cuda().eval()
                    #                    self.target_critic = deepcopy(self.critic)
                    #model_dict = torch.load(self.model_path + 'model.pt', map_location={'cuda:0':'cuda:{}'.format(self.actor_id%2+1)})
                    print('waiting for model.pt')
                    model_dict = torch.load(self.model_path + 'model.pt')
                    self.actor.load_state_dict(model_dict['actor'])
                    self.target_actor.load_state_dict(
                        model_dict['target_actor'])
                    self.critic.load_state_dict(model_dict['critic'])
                    self.target_critic.load_state_dict(
                        model_dict['target_critic'])
                    self.actor.to(self.dev)
                    self.target_actor.to(self.dev)
                    self.critic.to(self.dev)
                    self.target_critic.to(self.dev)

                except Exception:
                    sleep(np.random.rand() * 5 + 2)
                else:
                    break

    def calc_nstep_reward(self):
        for i in range(len(self.sequence) - self.n_step):
            self.sequence[i][2] = sum([
                self.sequence[i + j][2] * (self.sequence[i + j][3]**j)
                for j in range(self.n_step)
            ])

    def calc_priorities(self):
        with torch.no_grad():
            self.actor.reset_state()
            self.critic.reset_state()
            self.target_actor.reset_state()
            self.target_critic.reset_state()
            #            self.td_loss = deque(maxlen=self.learning_length)
            self.td_loss = []
            self.priority = []

            # This part just advances the target networks n_step frames ahead.
            #            for i in range(self.n_step):
            #                next_obs = self.sequence[i][0]
            #                next_action = self.target_actor(self.PrePro(next_obs)).to(self.dev)
            #                next_q_value = self.target_critic(self.PrePro(next_obs), next_action)

            # Predict Q values while stepping n steps ahead.  seq[i] = [0: state, 1: action, 2: reward, 3: terminal -> gamma]
            for i in range(len(self.sequence) - self.n_step):
                #            obs = torch.from_numpy(self.sequence[i][0]).unsqueeze(0)
                #                obs = self.sequence[i][0]
                #    #            action = self.sequence[i][1].unsqueeze(0)
                #                next_obs = self.sequence[i + self.n_step][0]
                #
                #                action = self.sequence[i][1]
                ##                action = torch.Tensor(self.sequence[i][1]).view(1,-1).to(self.dev)
                #    #            next_obs = torch.from_numpy(self.sequence[i + self.n_step][0]).unsqueeze(0)
                #                next_action = self.target_actor(self.PrePro(next_obs)).to(self.dev)
                #
                #                q_value = self.critic(self.PrePro(obs), action)
                #                q_value = q_value.gather(1,action.view(1,-1))
                #                reward = self.sequence[i][2]
                #                gamma = self.sequence[i + self.n_step - 1][3]
                #                next_q_value = self.target_critic(self.PrePro(next_obs),next_action).max(1)[0]
                #
                #                if i >= self.burn_in_length:
                #                    target_q_value = (reward + (gamma ** self.n_step)) * next_q_value
                ##                    target_q_value = invertical_vf(target_q_value)
                #                    self.td_loss.append(((q_value - target_q_value)**2))
                #                    if len(self.td_loss) > self.learning_length:
                #                        self.td_loss.pop(0)

                #                if i >= self.sequence_length:
                #                    self.priority.append(calc_priority(self.td_loss))
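                # The TD-based priority computation above is disabled; a zero
                # placeholder priority is appended for every transition instead.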
                self.priority.append(torch.Tensor([0]))

    def run(self):
        #        sleep(random.random()*1)
        frame = 0
        #        if self.actor_id%3 == 0:
        win_r = vis.line(Y=torch.Tensor([0]),
                         opts=dict(title='reward' + str(self.epsilon)))
        reward_sum = 0

        while frame < self.max_frame:
            #            self.shared_state['frame'][self.actor_id]=frame
            #            while self.shared_state['sleep'][self.actor_id] :
            #                sleep(0.5)

            st, rt, dt = self.env.reset()

            self.actor.reset_state()
            self.critic.reset_state()
            self.target_actor.reset_state()
            self.target_critic.reset_state()

            self.sequence = []
            self.recurrent_state = []
            self.priority = []

            self.td_loss.clear()
            #            if self.actor_id%3 == 0:
            win_r = vis.line(X=torch.Tensor([frame]),
                             Y=torch.Tensor([reward_sum]),
                             win=win_r,
                             update='append')
            qmin = 9999
            qmax = -9999
            pmin = 9999
            pmax = -9999

            reward_sum = 0
            count_step = 0
            sleep(0.01)
            while sum(dt) != self.num_env:

                frame += 1
                # get recurrent state

                action = self.actor(self.PrePro(st))
                Qv = self.critic(self.PrePro(st), action)
                qmax = max(qmax, Qv.max())
                qmin = min(qmin, Qv.min())
                pmax = max(pmax, action.max())
                pmin = min(pmin, action.min())

                #                noise = torch.normal(mean=torch.zeros([self.num_env,1]),std=torch.ones([self.num_env,1])).to(self.dev)
                #                action = action.detach().item() +  np.random.normal(0, self.epsilon, (self.action_size))
                #                action = np.clip(action, -1, 1)
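                # Discrete action selection: act greedily on the critic's Q values,
                # or take a random action with probability epsilon.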
                action = Qv.argmax().view(1, -1)
                if self.epsilon > random.random():
                    action = torch.LongTensor([random.randint(0,
                                                              1)]).view(1, -1)

#                m = torch.distributions.MultivariateNormal(torch.zeros([1,1]), torch.eye(1))
#                action = action + m.sample().to(self.dev)*self.epsilon
##                action  = action.clamp(min=0,max=1)
#
#                if self.action_argmax:
#                    act = action.argmax(1).cpu().numpy().item()
#                else:
#                    act = action.cpu().numpy()

#                action = (action+noise*self.epsilon).clamp(min=-1,max=1)

                st_1, rt, dt = self.env.step(int(action.item()))

                reward_sum += rt
                count_step += 1
                gamma = torch.ones([self.num_env, 1]).to(
                    self.dev) * self.gamma * (1 - dt)
                #                gamma = self.gamma if not dt else 0.
                self.sequence.append([st, action, rt, gamma])
                st = st_1

                #                self.recurrent_state.append([torch.cat([actor_hx, actor_cx]), torch.cat([target_actor_hx, target_actor_cx]),
                #                                                torch.cat([critic_hx, critic_cx]), torch.cat([target_critic_hx, target_critic_cx])])

                #                if True:
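                # Reload the latest weights from the shared models when the learner
                # has flagged this actor, then clear the flag.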
                if self.shared_state["update"][self.actor_id]:

                    self.actor.load_state_dict(
                        self.shared_state["actor"].state_dict())
                    self.target_actor.load_state_dict(
                        self.shared_state["target_actor"].state_dict())
                    self.critic.load_state_dict(
                        self.shared_state["critic"].state_dict())
                    self.target_critic.load_state_dict(
                        self.shared_state["target_critic"].state_dict())
                    self.shared_state["update"][self.actor_id] = False
#                    print('actor_update',self.actor.policy_l0.weight.data[0][0])
#                    self.load_model()

            if len(self.sequence) >= self.sequence_length:
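                # Pad the finished episode with n_step dummy transitions, move the
                # tensors to the CPU, and push the sequence to the shared queue.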
                #                self.sequence.extend([(st, action, 0., 0.) for i in range(self.n_step)])
                #                st, rt, dt = self.env.end_dummy()
                #                self.sequence.extend([[st,action, rt, dt] for i in range(self.n_step)])
                st, rt, dt = self.env.end_dummy()
                self.sequence.extend([[st, action, rt, dt]
                                      for i in range(self.n_step)])

                #                self.calc_nstep_reward()
                #                self.calc_priorities()

                for i in range(len(self.sequence)):
                    for j in range(4):
                        self.sequence[i][j] = self.sequence[i][j].cpu()
#                for i in range(len(self.recurrent_state)):
#                    for j in range(4):
#                        self.recurrent_state[i][j] = self.recurrent_state[i][j].cpu()
                for i in range(len(self.priority)):
                    self.priority[i] = self.priority[i].cpu()
                blocking = self.shared_queue.qsize() > self.max_shared_q_size
                self.shared_queue.put([self.sequence], block=blocking)

#            if self.actor_id == 0:
            print('\r#',
                  self.actor_id,
                  'frame:',
                  frame,
                  'step:',
                  count_step,
                  'reward: {:.3f}'.format(reward_sum.item()),
                  'qmin,max :{:.3f},{:.3f},  pminmax : {:.3f},{:.3f}'.format(
                      qmin, qmax, pmin, pmax),
                  end='\r')
Example #4
class Actor:
    def __init__(self, actor_id):
        self.env = suite.load(domain_name="walker", task_name="run")
        self.action_size = self.env.action_spec().shape[0]
        self.obs_size = get_obs(self.env.reset().observation).shape[1]

        self.actor_id = actor_id
        self.burn_in_length = 20  # 40-80
        self.learning_length = 40
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = 5
        self.sequence = []
        self.recurrent_state = []
        self.priority = []
        self.td_loss = deque(maxlen=self.learning_length)
        self.memory_sequence_size = 1000
        self.memory = ReplayMemory(
            memory_sequence_size=self.memory_sequence_size)
        self.memory_save_interval = 3

        self.gamma = 0.997
        self.actor_parameter_update_interval = 500
        self.model_path = './model_data/'
        self.actor = ActorNet(self.obs_size,
                              self.action_size,
                              cuda_id=self.actor_id % 2 +
                              1).cuda(self.actor_id % 2 + 1).eval()
        self.target_actor = deepcopy(self.actor)
        self.critic = CriticNet(self.obs_size,
                                self.action_size,
                                cuda_id=self.actor_id % 2 +
                                1).cuda(self.actor_id % 2 + 1).eval()
        self.target_critic = deepcopy(self.critic)
        self.load_model()
        self.epsilon = 1
        self.last_obs = None

    def load_model(self):
        if os.path.isfile(self.model_path + 'model.pt'):
            while True:
                try:
                    # TODO: Delete
                    self.actor = ActorNet(self.obs_size, self.action_size,
                                          self.actor_id % 2 + 1).cuda().eval()
                    self.target_actor = deepcopy(self.actor)
                    self.critic = CriticNet(self.obs_size, self.action_size,
                                            self.actor_id % 2 +
                                            1).cuda().eval()
                    self.target_critic = deepcopy(self.critic)
                    #model_dict = torch.load(self.model_path + 'model.pt', map_location={'cuda:0':'cuda:{}'.format(self.actor_id%2+1)})
                    model_dict = torch.load(self.model_path + 'model.pt')
                    self.actor.load_state_dict(model_dict['actor'])
                    self.target_actor.load_state_dict(
                        model_dict['target_actor'])
                    self.critic.load_state_dict(model_dict['critic'])
                    self.target_critic.load_state_dict(
                        model_dict['target_critic'])
                    self.actor.cuda(self.actor_id % 2 + 1)
                    self.target_actor.cuda(self.actor_id % 2 + 1)
                    self.critic.cuda(self.actor_id % 2 + 1)
                    self.target_critic.cuda(self.actor_id % 2 + 1)
                except Exception:
                    sleep(np.random.rand() * 5 + 2)
                else:
                    break

    def calc_nstep_reward(self):
        for i in range(len(self.sequence) - self.n_step):
            self.sequence[i][2][0] = sum([
                self.sequence[i + j][2][0] * (self.gamma**j)
                for j in range(self.n_step)
            ])

    def calc_priorities(self):
        self.actor.reset_state()
        self.critic.reset_state()
        self.target_actor.reset_state()
        self.target_critic.reset_state()
        self.td_loss = deque(maxlen=self.learning_length)
        self.priority = []
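        # Advance the target networks' recurrent state over the first n_step frames
        # so it leads the online networks when bootstrapping below.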

        for i in range(self.n_step):
            next_obs = torch.from_numpy(
                self.sequence[i][0]).cuda(self.actor_id % 2 + 1).unsqueeze(0)
            next_action = self.target_actor(next_obs)
            next_q_value = self.target_critic(
                next_obs, next_action).detach().cpu().numpy()

        for i in range(len(self.sequence) - self.n_step):
            obs = torch.from_numpy(
                self.sequence[i][0]).cuda(self.actor_id % 2 + 1).unsqueeze(0)
            action = torch.from_numpy(
                self.sequence[i][1]).cuda(self.actor_id % 2 + 1).unsqueeze(0)
            next_obs = torch.from_numpy(
                self.sequence[i + self.n_step][0]).cuda(self.actor_id % 2 +
                                                        1).unsqueeze(0)
            next_action = self.target_actor(next_obs)

            q_value = self.critic(obs, action).detach().cpu().numpy()
            reward = self.sequence[i][2][0]
            terminal = self.sequence[i + self.n_step - 1][3][0]
            next_q_value = self.target_critic(
                next_obs, next_action).detach().cpu().numpy()

            if i >= self.burn_in_length:
                target_q_value = (reward + (self.gamma**self.n_step) *
                                  (1. - terminal) * next_q_value)
                target_q_value = invertical_vf(
                    torch.tensor(target_q_value).cuda(
                        self.actor_id % 2 + 1)).detach().cpu().numpy()
                self.td_loss.append((q_value - target_q_value).mean())
            if i >= self.sequence_length:
                self.priority.append(
                    calc_priority(
                        np.array(list(self.td_loss), dtype=np.float32)**2.))

    def run(self):
        episode = 0
        step = 0
        reward_sum = 0

        while True:
            time_step = self.env.reset()
            obs = get_obs(time_step.observation)
            self.actor.reset_state()
            self.critic.reset_state()
            self.target_actor.reset_state()
            self.target_critic.reset_state()
            self.sequence = []
            self.recurrent_state = []
            self.priority = []
            self.td_loss.clear()
            last_obs = None
            episode_step = 0
            done = False
            if self.actor_id == 0 and episode != 0:
                print('episode:', episode, 'step:', step, 'reward:',
                      reward_sum)
            episode += 1
            reward_sum = 0

            while not time_step.last():

                # get recurrent state
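                # Snapshot every network's recurrent state (hx, cx) before acting so
                # the learner can later replay this sequence from the same state.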
                actor_hx, actor_cx = self.actor.get_state()
                target_actor_hx, target_actor_cx = self.target_actor.get_state(
                )
                critic_hx, critic_cx = self.critic.get_state()
                target_critic_hx, target_critic_cx = self.target_critic.get_state(
                )

                action = self.actor(
                    torch.from_numpy(obs).cuda(self.actor_id % 2 + 1))
                target_action = self.target_actor(
                    torch.from_numpy(obs).cuda(self.actor_id % 2 + 1))
                _ = self.critic(
                    torch.from_numpy(obs).cuda(self.actor_id % 2 + 1), action)
                _ = self.target_critic(
                    torch.from_numpy(obs).cuda(self.actor_id % 2 + 1),
                    target_action)

                action = action.detach().cpu().numpy()[0]
                action += np.random.normal(0, 0.3, (self.action_size))
                action = np.clip(action, -1, 1)

                reward = 0.
                sleep(0.01)
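                # Repeat the chosen action for up to four environment steps (frame
                # skip), accumulating the reward and stopping early at episode end.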
                for i in range(4):
                    time_step = self.env.step(action)
                    next_obs = get_obs(time_step.observation)
                    reward += time_step.reward
                    if time_step.last():
                        break

                reward_sum += reward
                step += 1
                episode_step += 1
                terminal = 1. if time_step.last() else 0.
                self.sequence.append((obs[0], action, [reward], [terminal]))
                obs = next_obs.copy()

                self.recurrent_state.append(
                    [[actor_hx[0], actor_cx[0]],
                     [target_actor_hx[0], target_actor_cx[0]],
                     [critic_hx[0], critic_cx[0]],
                     [target_critic_hx[0], target_critic_cx[0]]])

                if step % self.actor_parameter_update_interval == 0:
                    self.load_model()

            if len(self.sequence) >= self.sequence_length:
                self.sequence.extend([(np.zeros((self.obs_size),
                                                dtype=np.float32),
                                       np.zeros((self.action_size),
                                                dtype=np.float32), [0.], [1.])
                                      for i in range(self.n_step)])
                self.calc_nstep_reward()
                self.calc_priorities()
                self.memory.add(self.sequence, self.recurrent_state,
                                self.priority)

            if len(self.memory.memory) > self.memory_save_interval:
                self.memory.save(self.actor_id)