Beispiel #1
0
class A3CLocal:
    def __init__(self, config):
        self.config = config

        # 리플레이메모리
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)

        # 정책신경망 생성
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)

        # 가치신경망 생성
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)

    # 정책신경망의 출력을 받아 확률적으로 행동을 선택
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(device)
        policy = self.actor(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # 리플에이메모리 추가
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # 리플에이메모리 조회 및 클리어
    def get_replay(self):
        # 히스토리를 배열 형태로 정렬
        replay_memory = np.array(self.replay_memory)
        self.replay_memory.clear()
        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])

        return states, actions, rewards, next_states

    # 글로벌신병망의 wegith를 로컬신경망으로 복사
    def update_local_model(self, actor_dict, critic_dict):
        self.actor.load_state_dict(actor_dict)
        self.critic.load_state_dict(critic_dict)

    # GPU 메모리 반납
    def close(self):
        del self.actor
        del self.critic
Beispiel #2
0
class A3CGlobal:
    def __init__(self, config):
        self.config = config

        # 정책신경망 생성
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.actor_lr)

        # 가치신경망 생성
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.critic_lr)

    # 리턴값 계산
    def get_returns(self, rewards, done, next_value):
        returns = torch.zeros(len(rewards),
                              dtype=torch.float).to(self.config.device)
        R = 0 if done else next_value
        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R
        return returns

    # 각 타임스텝마다 정책신경망과 가치신경망을 업데이트
    def train_model(self, states, actions, rewards, next_states, done):
        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions,
                               dtype=torch.float).to(self.config.device)
        next_states = torch.tensor(next_states,
                                   dtype=torch.float).to(self.config.device)

        next_values = self.critic(next_states).view(-1)

        # 리턴값 계산
        returns = self.get_returns(rewards, done, next_values[-1])

        values = self.critic(states).view(-1)

        # 가치신경망 학습
        critic_loss = self.train_critic(values, returns)
        # 정책신경망 학습
        actor_loss = self.train_actor(states, actions, returns - values)

        return actor_loss, critic_loss

    # 정책신경망을 업데이트하는 함수
    def train_actor(self, states, actions, advantages):
        policy = self.actor(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * advantages.detach()
        actor_loss = -torch.mean(cross_entropy)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss.item()

    # 가치신경망을 업데이트하는 states
    def train_critic(self, values, targets):
        critic_loss = torch.mean(torch.pow(targets - values, 2))

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        return critic_loss.item()

    # GPU 메모리 반납
    def close(self):
        del self.actor
        del self.critic
Beispiel #3
0
class PGAgent:
    def __init__(self, config):
        self.config = config

        # replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)

        # 정책신경망 생성
        self.model = PolicyNet(self.config.n_state, self.config.n_action)
        self.model.to(device)
        self.model_optimizer = torch.optim.Adam(self.model.parameters(),
                                                lr=self.config.learning_rate)

    # 정책신경망의 출력을 받아 확률적으로 행동을 선택
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(device)
        policy = self.model(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # 히스토리 추가
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # 리턴값 계산
    def get_returns(self, rewards):
        returns = torch.zeros(len(rewards),
                              dtype=torch.float).to(self.config.device)
        R = 0
        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R
        if 1 < len(returns):
            returns -= torch.mean(returns)
            returns /= (torch.std(returns) + 1.e-7)
        return returns

    # 각 타임스텝마다 정책신경망과 가치신경망을 업데이트
    def train_model(self):
        # 히스토리를 배열 형태로 정렬
        replay_memory = np.array(self.replay_memory)
        self.replay_memory.clear()
        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])

        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions,
                               dtype=torch.float).to(self.config.device)

        # 리턴값 계산
        returns = self.get_returns(rewards)

        loss = self.train_policy(states, actions, returns)

        return loss

    # 정책신경망을 업데이트하는 함수
    def train_policy(self, states, actions, returns):
        policy = self.model(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * returns
        loss = -torch.mean(cross_entropy)

        self.model_optimizer.zero_grad()
        loss.backward()
        self.model_optimizer.step()

        return loss.item()

    # model의 weight를 파일로 저장
    def save(self):
        torch.save(self.model.state_dict(), self.config.save_file)

    # 파일로 부터 model의 weight를 읽어 옴
    def load(self):
        self.model.load_state_dict(torch.load(self.config.save_file))

    # GPU 메모리 반납
    def close(self):
        del self.model
Beispiel #4
0
class A2CAgent:
    def __init__(self, config):
        self.config = config

        # replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)

        # 정책신경망 생성
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.actor_lr)

        # 가치신경망 생성
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.critic_lr)

    # 정책신경망의 출력을 받아 확률적으로 행동을 선택
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(device)
        policy = self.actor(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # 히스토리 추가
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # 리턴값 계산
    def get_returns(self, rewards, done, next_value):
        returns = torch.zeros(len(rewards), dtype=torch.float).to(self.config.device)
        R = 0 if done else next_value
        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R
        return returns

    # 각 타임스텝마다 정책신경망과 가치신경망을 업데이트
    def train_model(self, done):
        # 히스토리를 배열 형태로 정렬
        replay_memory = np.array(self.replay_memory)
        self.replay_memory.clear()
        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])

        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.config.device)
        next_states = torch.tensor(next_states, dtype=torch.float).to(self.config.device)

        next_values = self.critic(next_states).view(-1)

        # 리턴값 계산
        returns = self.get_returns(rewards, done, next_values[-1])

        values = self.critic(states).view(-1)

        # 가치신경망 학습
        critic_loss = self.train_critic(values, returns)
        # 정책신경망 학습
        actor_loss = self.train_actor(states, actions, returns - values)

        return actor_loss, critic_loss
    
    # 정책신경망을 업데이트하는 함수
    def train_actor(self, states, actions, advantages):
        policy = self.actor(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * advantages.detach()
        actor_loss = -torch.mean(cross_entropy)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss.item()
    
    # 가치신경망을 업데이트하는 states
    def train_critic(self, values, targets):
        critic_loss = torch.mean(torch.pow(targets - values, 2))

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        return critic_loss.item()

    # model의 weight를 파일로 저장
    def save(self):
        torch.save(self.actor.state_dict(), self.config.save_file + ".actor")
        torch.save(self.critic.state_dict(), self.config.save_file + ".critic")
    
    # 파일로 부터 model의 weight를 읽어 옴
    def load(self):
        self.actor.load_state_dict(torch.load(self.config.save_file + ".actor"))
        self.critic.load_state_dict(torch.load(self.config.save_file + ".critic"))
    
    # GPU 메모리 반납
    def close(self):
        del self.actor
        del self.critic