Example #1
class Brain:
    def __init__(self, num_actions, Double, Dueling, PER):
        self.num_actions = num_actions  # get the number of actions (2)
        self.Double = Double
        self.Dueling = Dueling
        self.PER = PER

        # create a memory object to store transitions
        self.memory = ReplayMemory(CAPACITY)

        # build the neural networks
        n_out = num_actions
        self.main_q_network = Net_CNN(n_out, Dueling)  # uses the Net class
        self.target_q_network = Net_CNN(n_out, Dueling)  # uses the Net class
        print(self.main_q_network)  # print the network architecture

        # choose the optimization method
        self.optimizer = optim.Adam(self.main_q_network.parameters(),
                                    lr=0.0001)

        # PER - create a memory object to store TD errors
        if self.PER:
            self.td_error_memory = TDerrorMemory(CAPACITY)

    def replay(self, episode=0):
        ''' Train the network weights with Experience Replay '''

        # 1. check the number of stored transitions
        if len(self.memory) < BATCH_SIZE:
            return

        # 2. build a mini-batch
        if self.PER:
            self.batch, self.state_batch, self.action_batch, self.reward_batch, self.non_final_next_states = self.make_minibatch(
                episode)
        else:
            self.batch, self.state_batch, self.action_batch, self.reward_batch, self.non_final_next_states = self.make_minibatch(
            )

        # 3. compute Q(s_t, a_t) to use as the training target
        self.expected_state_action_values = self.get_expected_state_action_values(
        )

        # 4. update the network weights
        self.update_main_q_network()

    def decide_action(self, state, episode):
        '''Decide an action based on the current state'''
        # epsilon-greedy: gradually increase the share of greedy actions
        epsilon = 0.5 * (1 / (episode + 1))

        if epsilon <= np.random.uniform(0, 1):
            self.main_q_network.eval()  # switch the network to inference mode
            with torch.no_grad():
                action = self.main_q_network(state).max(1)[1].view(1, 1)
            # max(1)[1] returns the index of the maximum network output
            # .view(1, 1) reshapes a [torch.LongTensor of size 1] into size 1x1

        else:
            # return a random action (0 or 1)
            action = torch.LongTensor(
                [[random.randrange(self.num_actions)]])
            # action becomes a [torch.LongTensor of size 1x1]

        return action

    def make_minibatch(self, episode=0):
        '''2. Build a mini-batch'''

        if self.PER:
            # 2.1 PER - sample a mini-batch from the memory object
            if episode < 30:
                transitions = self.memory.sample(BATCH_SIZE)
            else:
                # sample the mini-batch according to the TD errors
                indexes = self.td_error_memory.get_prioritized_indexes(
                    BATCH_SIZE)
                transitions = [self.memory.memory[n] for n in indexes]
        else:
            # 2.1 sample a mini-batch from the memory object
            transitions = self.memory.sample(BATCH_SIZE)

        # 2.2 reshape each variable into a form suitable for a mini-batch
        # transitions stores BATCH_SIZE tuples of (state, action, state_next, reward),
        # i.e. it has the form (state, action, state_next, reward) * BATCH_SIZE.
        # To turn this into a mini-batch, it is converted into the form
        # (state * BATCH_SIZE, action * BATCH_SIZE, state_next * BATCH_SIZE, reward * BATCH_SIZE).

        batch = Transition(*zip(*transitions))

        # 2.3 reshape the elements of each variable to match the mini-batch and
        # wrap them so the network can handle them.
        # For state, BATCH_SIZE elements of shape [torch.FloatTensor of size 1x4]
        # are converted into a torch.FloatTensor of size BATCH_SIZE x 4.
        # Create the mini-batch tensors of states, actions, rewards and non-final states.
        # cat stands for concatenate.
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])

        return batch, state_batch, action_batch, reward_batch, non_final_next_states

    def get_expected_state_action_values(self):
        '''Compute Q(s_t, a_t) to use as the training target'''

        # 3.1 switch the networks to inference mode
        self.main_q_network.eval()
        self.target_q_network.eval()

        # 3.2 compute Q(s_t, a_t) with the network
        # self.main_q_network(state_batch) outputs the Q-values for left and right
        # as a [torch.FloatTensor of size BATCH_SIZE x 2].
        # We want the Q-value of the action a_t that was actually taken, so we take
        # the index of that action from action_batch and collect the matching
        # Q-values with the gather method.
        self.state_action_values = self.main_q_network(
            self.state_batch).gather(1, self.action_batch)

        # 3.3 compute max{Q(s_t+1, a)}, taking care over whether a next state exists

        # build an index mask for transitions where cartpole is not done and a
        # next_state exists
        non_final_mask = torch.ByteTensor(
            tuple(map(lambda s: s is not None, self.batch.next_state)))

        # initialize everything to 0 first
        next_state_values = torch.zeros(BATCH_SIZE)

        # Double DQN
        if self.Double:
            a_m = torch.zeros(BATCH_SIZE).type(torch.LongTensor)

            # use the main Q-network to find the action a_m that gives the maximum
            # Q-value in the next state; the trailing [1] returns the action index
            a_m[non_final_mask] = self.main_q_network(
                self.non_final_next_states).detach().max(1)[1]

            # keep only the entries that have a next state and reshape size 32 into 32x1
            a_m_non_final_next_states = a_m[non_final_mask].view(-1, 1)

            # for indexes with a next state, evaluate the Q-value of action a_m with
            # the target Q-network; detach() takes the values out of the graph and
            # squeeze() turns size [minibatch x 1] into [minibatch]
            next_state_values[non_final_mask] = self.target_q_network(
                self.non_final_next_states).gather(
                    1, a_m_non_final_next_states).detach().squeeze()
        else:
            # for indexes with a next state, take the maximum Q-value:
            # max(1) over the network output returns [values, indexes] along the
            # action dimension, so take the values (index 0) and detach them
            next_state_values[non_final_mask] = self.target_q_network(
                self.non_final_next_states).max(1)[0].detach()

        # 3.4 compute the target Q(s_t, a_t) from the Q-learning formula
        expected_state_action_values = self.reward_batch + GAMMA * next_state_values

        return expected_state_action_values

    def update_main_q_network(self):
        ''' 4. Update the network weights '''

        # 4.1 switch the network to training mode
        self.main_q_network.train()

        # 4.2 compute the loss (smooth_l1_loss is the Huber loss)
        # expected_state_action_values has size [minibatch], so unsqueeze it to [minibatch x 1]
        loss = F.smooth_l1_loss(self.state_action_values,
                                self.expected_state_action_values.unsqueeze(1))

        # 4.3 update the weights
        self.optimizer.zero_grad()  # reset the gradients
        loss.backward()  # backpropagation
        self.optimizer.step()  # update the weights

    def update_target_q_network(self):  # added for DDQN
        ''' Sync the target Q-network with the main Q-network '''
        self.target_q_network.load_state_dict(self.main_q_network.state_dict())

    def update_td_error_memory(self):  # added for Prioritized Experience Replay
        ''' Update the TD errors stored in the TD error memory '''

        # switch the networks to inference mode
        self.main_q_network.eval()
        self.target_q_network.eval()

        # build a mini-batch from all stored transitions
        transitions = self.memory.memory
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])

        # compute the network output Q(s_t, a_t)
        state_action_values = self.main_q_network(state_batch).gather(
            1, action_batch)

        # build an index mask for transitions where cartpole is not done and a next_state exists
        non_final_mask = torch.ByteTensor(
            tuple(map(lambda s: s is not None, batch.next_state)))

        # initialize everything to 0 first, sized to the number of stored transitions
        next_state_values = torch.zeros(len(self.memory))
        a_m = torch.zeros(len(self.memory)).type(torch.LongTensor)

        # use the main Q-network to find the action a_m that gives the maximum
        # Q-value in the next state; the trailing [1] returns the action index
        a_m[non_final_mask] = self.main_q_network(
            non_final_next_states).detach().max(1)[1]

        # keep only the entries that have a next state and reshape size 32 into 32x1
        a_m_non_final_next_states = a_m[non_final_mask].view(-1, 1)

        # for indexes with a next state, evaluate the Q-value of action a_m with
        # the target Q-network; detach() takes the values out of the graph and
        # squeeze() turns size [minibatch x 1] into [minibatch]
        next_state_values[non_final_mask] = self.target_q_network(
            non_final_next_states).gather(
                1, a_m_non_final_next_states).detach().squeeze()

        # compute the TD errors
        td_errors = (reward_batch + GAMMA *
                     next_state_values) - state_action_values.squeeze()
        # state_action_values has size [minibatch x 1], so squeeze it to size [minibatch]

        # update the TD error memory: detach the tensor, convert it to a NumPy
        # array and then to a Python list
        self.td_error_memory.memory = td_errors.detach().numpy().tolist()
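
This example assumes several pieces that are not shown: the Transition named tuple, the ReplayMemory and TDerrorMemory classes, the Net_CNN model, and constants such as CAPACITY, BATCH_SIZE and GAMMA. Below is a minimal sketch of the memory classes and constants, inferred only from how they are used above; the names, values and details are assumptions, not the original code.

import random
from collections import namedtuple

import numpy as np

# assumed hyper-parameters
CAPACITY = 10000
BATCH_SIZE = 32
GAMMA = 0.99

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    '''Fixed-size circular buffer of transitions (sketch).'''

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []  # Brain accesses this list directly as memory.memory
        self.index = 0

    def push(self, state, action, state_next, reward):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.index] = Transition(state, action, state_next, reward)
        self.index = (self.index + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


class TDerrorMemory:
    '''Stores one TD error per stored transition, for prioritized sampling (sketch).'''

    TD_ERROR_EPSILON = 0.0001  # keeps every priority strictly positive

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.index = 0

    def push(self, td_error):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.index] = td_error
        self.index = (self.index + 1) % self.capacity

    def get_prioritized_indexes(self, batch_size):
        # sample indexes with probability proportional to |TD error|
        abs_td = np.abs(np.array(self.memory, dtype=np.float64)) + self.TD_ERROR_EPSILON
        probs = abs_td / abs_td.sum()
        return np.random.choice(len(self.memory), batch_size, p=probs)

    def __len__(self):
        return len(self.memory)
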
Example #2
class Learner():
    def __init__(self, sess, s_size, a_size, scope, queues, trainer):
        self.queue = queues[0]
        self.param_queue = queues[1]
        self.replaymemory = ReplayMemory(100000)
        self.sess = sess
        self.learner_net = network(s_size, a_size, scope, 20)

        self.q = self.learner_net.q
        self.Q = self.learner_net.Q

        self.actions_q = tf.placeholder(shape=[None, a_size, N],
                                        dtype=tf.float32)
        self.q_target = tf.placeholder(shape=[None, N], dtype=tf.float32)
        self.ISWeights = tf.placeholder(shape=[None, N], dtype=tf.float32)

        self.q_actiona = tf.multiply(self.q, self.actions_q)
        self.q_action = tf.reduce_sum(self.q_actiona, axis=1)
        self.u = tf.abs(self.q_target - self.q_action)
        self.loss = tf.reduce_mean(
            tf.reduce_sum(tf.square(self.u) * self.ISWeights, axis=1))

        self.local_vars = self.learner_net.local_vars  #tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        self.gradients = tf.gradients(self.loss, self.local_vars)
        #grads,self.grad_norms = tf.clip_by_norm(self.gradients,40.0)
        self.apply_grads = trainer.apply_gradients(
            zip(self.gradients, self.local_vars))
        self.sess.run(tf.global_variables_initializer())

    def run(self, gamma, s_size, a_size, batch_size, env):
        print('start learning')
        step, train1 = 0, False
        epi_q = []
        self.env = env
        while True:
            while not self.queue.empty():
                t_error = self.queue.get()
                step += 1
                self.replaymemory.add(t_error)

            if self.param_queue.empty():
                params = self.sess.run(self.local_vars)
                self.param_queue.put(params)

            if step >= 10000:
                train1 = True
                step = 0

            if train1 == True:
                episode_buffer, tree_idx, ISWeights = self.replaymemory.sample(
                    batch_size)
                episode_buffer = np.array(episode_buffer)
                #print episode_buffer
                observations = episode_buffer[:, 0]
                actions = episode_buffer[:, 1]
                rewards = episode_buffer[:, 2]
                observations_next = episode_buffer[:, 3]
                dones = episode_buffer[:, 4]
                Q_target = self.sess.run(self.Q,
                                         feed_dict={
                                             self.learner_net.inputs:
                                             np.vstack(observations_next)
                                         })

                actions_ = np.argmax(Q_target, axis=1)

                action = np.zeros((batch_size, a_size))
                action_ = np.zeros((batch_size, a_size))
                for i in range(batch_size):
                    action[i][actions[i]] = 1
                    action_[i][actions_[i]] = 1
                action_now = np.zeros((batch_size, a_size, N))
                action_next = np.zeros((batch_size, a_size, N))
                for i in range(batch_size):
                    for j in range(a_size):
                        for k in range(N):
                            action_now[i][j][k] = action[i][j]
                            action_next[i][j][k] = action_[i][j]
                q_target = self.sess.run(self.q_action,
                                         feed_dict={
                                             self.learner_net.inputs:
                                             np.vstack(observations_next),
                                             self.actions_q:
                                             action_next
                                         })

                q_target_batch = []
                for i in range(len(q_target)):
                    qi = q_target[i]
                    z_target_step = []
                    for j in range(len(qi)):
                        z_target_step.append(gamma * qi[j] * (1 - dones[i]) +
                                             rewards[i])
                    q_target_batch.append(z_target_step)
                q_target_batch = np.array(q_target_batch)

                isweight = np.zeros((batch_size, N))
                for i in range(batch_size):
                    for j in range(N):
                        isweight[i, j] = ISWeights[i]
                feed_dict = {
                    self.q_target: q_target_batch,
                    self.learner_net.inputs: np.vstack(observations),
                    self.actions_q: action_now,
                    self.ISWeights: isweight
                }

                l, abs_errors, _ = self.sess.run(
                    [self.loss, self.u, self.apply_grads], feed_dict=feed_dict)
                #print abs_errors
                abs_errors = np.mean(abs_errors, axis=1) + 1e-6

                self.replaymemory.update_priorities(tree_idx, abs_errors)
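
The Learner above depends on a prioritized ReplayMemory whose sample() returns (batch, tree_idx, ISWeights) and on a global N (the number of quantiles per action in the distributional loss). A rough sketch of a proportional prioritized buffer with that interface follows; it is an assumption for illustration (the original most likely uses a sum tree), and the alpha/beta values are placeholders.

import numpy as np

N = 20  # assumed number of quantiles per action


class ReplayMemory:
    '''Proportional prioritized replay buffer (sketch of the interface used above).'''

    def __init__(self, capacity, alpha=0.6, beta=0.4):
        self.capacity = capacity
        self.alpha = alpha  # how strongly priorities shape the sampling distribution
        self.beta = beta    # importance-sampling correction strength
        self.data = []
        self.priorities = []
        self.index = 0

    def add(self, transition):
        # new transitions get the current maximum priority so they are seen at least once
        max_p = max(self.priorities, default=1.0)
        if len(self.data) < self.capacity:
            self.data.append(transition)
            self.priorities.append(max_p)
        else:
            self.data[self.index] = transition
            self.priorities[self.index] = max_p
        self.index = (self.index + 1) % self.capacity

    def sample(self, batch_size):
        probs = np.array(self.priorities) ** self.alpha
        probs /= probs.sum()
        idx = np.random.choice(len(self.data), batch_size, p=probs)
        weights = (len(self.data) * probs[idx]) ** (-self.beta)
        weights /= weights.max()  # normalize so the largest weight is 1
        return [self.data[i] for i in idx], idx, weights

    def update_priorities(self, indexes, priorities):
        for i, p in zip(indexes, priorities):
            self.priorities[i] = float(p)
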
Example #3
class Agent:
    def __init__(self, policy_net, target_net, durability, optimizer, name,
                 constants):
        """An agent class that takes action on the environment and optimizes
        the action based on the reward.

        Parameters
        ----------
        policy_net : DQN
            The network being trained and used to select actions
        target_net : DQN
            The target network, periodically synced from policy_net
        durability : int
            The agent's initial durability (see reduce_durability / heal_durability)
        optimizer : torch.optim.Optimizer
            The optimizer used to update policy_net
        name : str
            The name of agent
        constants: Constants
            The hyper-parameters from Constants class
        """
        self.CONSTANTS = constants
        self.policy_net = policy_net
        self.target_net = target_net
        self.target_net.load_state_dict(policy_net.state_dict())
        self.durability = durability
        self.optimizer = optimizer
        self.name = name
        self.memory = ReplayMemory(self.CONSTANTS.MEMORY_SIZE)
        self.steps_done = 0
        self.total_reward = 0.0
        self.reward = 0.0
        self.obtained_reward = 0.0
        self.n_best = 0
        self.policy_net_flag = False

    def select_action(self, state, is_first=False):
        sample = random.random()
        eps_threshold = self.CONSTANTS.EPS_END + (self.CONSTANTS.EPS_START - self.CONSTANTS.EPS_END) * \
                        math.exp(-1. * self.steps_done / self.CONSTANTS.EPS_DECAY)
        self.steps_done += 1
        if is_first:
            self.writer.add_graph(self.policy_net,
                                  input_to_model=state.to(
                                      self.CONSTANTS.DEVICE),
                                  profile_with_cuda=True)
        if sample > eps_threshold:
            with torch.no_grad():
                self.policy_net_flag = True
                return self.policy_net(state.to(
                    self.CONSTANTS.DEVICE)).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.CONSTANTS.N_ACTIONS)]],
                                device=self.CONSTANTS.DEVICE,
                                dtype=torch.long)

    def select_core_action(self, best_agent_state, flag, best_agent_action):
        self.steps_done += 1
        if flag:
            with torch.no_grad():
                if best_agent_state is None:
                    return self.policy_net(self.state.to(
                        self.CONSTANTS.DEVICE)).max(1)[1].view(1, 1)
                else:
                    return self.policy_net(
                        best_agent_state.to(
                            self.CONSTANTS.DEVICE)).max(1)[1].view(1, 1)
        else:
            return best_agent_action

    def optimize_model(self):
        if len(self.memory) < self.CONSTANTS.BATCH_SIZE:
            return
        transitions = self.memory.sample(self.CONSTANTS.BATCH_SIZE)

        # zip(*transitions) unzips the transitions into
        # Transition(*) creates new named tuple
        # batch.state - tuple of all the states (each state is a tensor)
        # batch.next_state - tuple of all the next states (each state is a tensor)
        # batch.reward - tuple of all the rewards (each reward is a float)
        # batch.action - tuple of all the actions (each action is an int)

        # Transition = ReplayMemory.get_transition()
        transition = self.CONSTANTS.TRANSITION
        batch = transition(*zip(*transitions))

        actions = tuple(
            (map(lambda a: torch.tensor([[a]], device=self.CONSTANTS.DEVICE),
                 batch.action)))
        rewards = tuple(
            (map(lambda r: torch.tensor([r], device=self.CONSTANTS.DEVICE),
                 batch.reward)))

        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=utils.get_device(),
                                      dtype=torch.bool)

        non_final_next_states = torch.cat([
            s for s in batch.next_state if s is not None
        ]).to(self.CONSTANTS.DEVICE)

        state_batch = torch.cat(batch.state).to(self.CONSTANTS.DEVICE)
        action_batch = torch.cat(actions)
        reward_batch = torch.cat(rewards)

        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        next_state_values = torch.zeros(self.CONSTANTS.BATCH_SIZE,
                                        device=self.CONSTANTS.DEVICE)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.CONSTANTS.GAMMA) + reward_batch

        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def set_tf_writer(self, path):
        self.writer = self._set_tf_writer(path)

    def _set_tf_writer(self, path):
        if self.name == "core":
            writer = SummaryWriter(log_dir="{}/tf-board/core/".format(path))
        else:
            writer = SummaryWriter(
                log_dir="{}/tf-board/{}".format(path, self.name))
        return writer

    def get_state(self):
        return self.state

    def get_next_state(self):
        return self.next_state

    def get_init_state(self):
        return self.init_state

    def get_name(self):
        return self.name

    def get_policy_net_flag(self):
        return self.policy_net_flag

    def set_init_state(self, state):
        self.init_state = state

    def set_state(self, state):
        self.state = state
        self.next_state = state

    def set_env(self, env):
        self.env = env

    def get_env(self):
        return self.env

    def set_action(self, action):
        self.action = action

    def get_action(self):
        return self.action

    def get_durability(self):
        return self.durability

    def get_policy_net(self):
        return self.policy_net

    def reduce_durability(self, value):
        self.durability = self.durability - value

    def heal_durability(self, value):
        self.durability = self.durability + value

    def set_done_state(self, done):
        self.done = done

    def set_total_reward(self, reward):
        self.reward = reward
        if reward > 0.0:
            self.obtained_reward += reward
        self.total_reward += reward

    def reset_total_reward(self):
        self.total_reward = 0.0
        self.obtained_reward = 0.0

    def get_reward(self):
        return self.reward

    def get_obtained_reward(self):
        return self.obtained_reward

    def best_counter(self):
        self.n_best += 1

    def get_n_best(self):
        return self.n_best

    def get_total_reward(self):
        return self.total_reward

    def set_step_retrun_value(self, obs, done, info):
        self.obs = obs
        self.done = done
        self.info = info

    def is_done(self):
        return self.done
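
The Agent above pulls all hyper-parameters from a Constants object. A sketch of what that container might hold, listing only the attributes the class actually references; the concrete values here are placeholders, not the original configuration.

from collections import namedtuple

import torch

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class Constants:
    '''Hyper-parameter container assumed by Agent (values are illustrative).'''
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    MEMORY_SIZE = 10000       # replay buffer capacity
    BATCH_SIZE = 128
    GAMMA = 0.999             # discount factor
    EPS_START = 0.9           # initial epsilon of the epsilon-greedy schedule
    EPS_END = 0.05            # final epsilon
    EPS_DECAY = 200           # time constant of the exponential decay
    N_ACTIONS = 2             # size of the discrete action space
    TRANSITION = Transition   # named tuple used by optimize_model()
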
Example #4
class Worker():
    def __init__(self, env, name, s_size, a_size, trainer, model_path,
                 global_episodes):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        #Create the local copy of the network and the tensorflow op to copy global parameters to the local network
        self.local_Q = Q_Network(s_size, a_size, self.name, trainer)
        self.update_local_ops = update_target_graph('global', self.name)
        self.env = env
        self.replaymemory = ReplayMemory(max_memory)

    def train(self, rollout, sess, gamma, ISWeights):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        next_observations = rollout[:, 3]
        dones = rollout[:, 4]

        Q_target = sess.run(
            self.local_Q.Q,
            feed_dict={self.local_Q.inputs: np.vstack(next_observations)})
        actions_ = np.argmax(Q_target, axis=1)

        action = np.zeros((batch_size, a_size))
        action_ = np.zeros((batch_size, a_size))
        for i in range(batch_size):
            action[i][actions[i]] = 1
            action_[i][actions_[i]] = 1
        action_now = np.zeros((batch_size, a_size, N))
        action_next = np.zeros((batch_size, a_size, N))
        for i in range(batch_size):
            for j in range(a_size):
                for k in range(N):
                    action_now[i][j][k] = action[i][j]
                    action_next[i][j][k] = action_[i][j]

        q_target = sess.run(self.local_Q.q_action,
                            feed_dict={
                                self.local_Q.inputs:
                                np.vstack(next_observations),
                                self.local_Q.actions_q: action_next
                            })
        q_target_batch = []
        for i in range(len(q_target)):
            qi = q_target[i]  # * (1 - dones[i])
            z_target_step = []
            for j in range(len(qi)):
                z_target_step.append(gamma * qi[j] + rewards[i])
            q_target_batch.append(z_target_step)
        q_target_batch = np.array(q_target_batch)
        #print q_target_batch
        isweight = np.zeros((batch_size, N))
        for i in range(batch_size):
            for j in range(N):
                isweight[i, j] = ISWeights[i]
        feed_dict = {
            self.local_Q.inputs: np.vstack(observations),
            self.local_Q.actions_q: action_now,
            self.local_Q.q_target: q_target_batch,
            self.local_Q.ISWeights: isweight
        }
        u, l, g_n, v_n, _ = sess.run([
            self.local_Q.u, self.local_Q.loss, self.local_Q.grad_norms,
            self.local_Q.var_norms, self.local_Q.apply_grads
        ],
                                     feed_dict=feed_dict)
        return l / len(rollout), g_n, v_n, Q_target, u

    def work(self, gamma, sess, coord, saver):
        global GLOBAL_STEP
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        epsilon = 0.2

        print("Starting worker " + str(self.number))
        best_mean_episode_reward = -float('inf')
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                sess.run(self.update_local_ops)
                #episode_buffer = []
                episode_reward = 0
                episode_step_count = 0
                d = False
                s = self.env.reset()
                s = process_frame(s)
                if epsilon > 0.01:
                    epsilon = epsilon * 0.997
                while not d:
                    #self.env.render()
                    GLOBAL_STEP += 1
                    #Take an action using probabilities from policy network output.
                    if random.random() > epsilon:
                        a_dist_list = sess.run(
                            self.local_Q.Q,
                            feed_dict={self.local_Q.inputs: [s]})
                        a_dist = a_dist_list[0]
                        a = np.argmax(a_dist)
                    else:
                        a = random.randint(0, 5)

                    s1, r, d, _ = self.env.step(a)
                    if d == False:
                        s1 = process_frame(s1)
                    else:
                        s1 = s
                    self.replaymemory.add([s, a, r, s1, d])
                    episode_reward += r
                    s = s1
                    total_steps += 1
                    episode_step_count += 1
                    if total_steps % 2 == 0 and d != True and total_steps > 50000:
                        episode_buffer, tree_idx, ISWeights = self.replaymemory.sample(
                            batch_size)
                        l, g_n, v_n, Q_target, u = self.train(
                            episode_buffer, sess, gamma, ISWeights)
                        u = np.mean(u, axis=1) + 1e-6
                        self.replaymemory.update_priorities(tree_idx, u)
                        #sess.run(self.update_local_ops)
                    if d == True:
                        break
                sess.run(self.update_local_ops)
                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)

                # Periodically save gifs of episodes, model parameters, and summary statistics.
                if episode_count % 5 == 0 and episode_count != 0 and total_steps > max_memory:
                    if self.name == 'worker_0' and episode_count % 5 == 0:
                        print('\n episode: ', episode_count, 'global_step:', \
                              GLOBAL_STEP, 'mean episode reward: ', np.mean(self.episode_rewards[-10:]), \
                              'epsilon: ', epsilon)

                    print('loss', l, 'Qtargetmean', np.mean(Q_target))
                    #print 'p_target', p_target
                    if episode_count % 100 == 0 and self.name == 'worker_0' and total_steps > 10000:
                        saver.save(
                            sess, self.model_path + '/qr-dqn-' +
                            str(episode_count) + '.cptk')
                        print("Saved Model")

                    mean_reward = np.mean(self.episode_rewards[-100:])
                    if episode_count > 20 and best_mean_episode_reward < mean_reward:
                        best_mean_episode_reward = mean_reward

                if self.name == 'worker_0':
                    sess.run(self.increment)
                    #if episode_count%1==0:
                    #print('\r {} {}'.format(episode_count, episode_reward),end=' ')
                episode_count += 1
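
The Worker relies on globals (batch_size, N, max_memory, GLOBAL_STEP), a process_frame preprocessing function, and an update_target_graph helper that copies the global network's parameters into the worker's local copy. A common TF1-style version of that helper is sketched below; it is an assumption, since the original implementation is not shown.

import tensorflow as tf  # TF1 graph-mode API


def update_target_graph(from_scope, to_scope):
    '''Return ops that assign every trainable variable in from_scope to to_scope.'''
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    return [to_var.assign(from_var)
            for from_var, to_var in zip(from_vars, to_vars)]
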
Example #6
class DQNAgent(GymAgent):
    """
    an agent for running the DQN algorithm (Minh et al 2013)
    """
    def __init__(self, env, mode, pre_trained_model, tensorboard_writer=None):
        super(DQNAgent, self).__init__(env, mode, tensorboard_writer)
        self.agent_name = 'DQN' + str(self.agent_no)
        self.memory = ReplayMemory()

        self.network = DeepQNetwork(self.obs_space[0], self.action_space)

        if self.mode == 'play':
            self.network.load_params(pre_trained_model)
            self.network.eval()

        elif self.mode == 'train':

            self.eval_network = DeepQNetwork(self.obs_space[0],
                                             self.action_space)
            self.eval_network.eval()

            if pre_trained_model:
                self.eval_network.load_params(pre_trained_model)

            self.optimizer = optim.RMSprop(self.network.parameters(), lr=LR)
            self.loss_func = SmoothL1Loss()
        else:
            raise ValueError(
                'Please set a valid mode for the agent (play or train)')

    def interact(self, state, action):
        """
        returns:
        state, reward, done, info
        """
        return self.env.step(action, state)

    def select_action(self, state):
        if self.mode == 'play':
            return self.network(prep_exploitation(state)).max(1)[1].view(1, 1)
        # epsilon-greedy policy
        eps_threshold = EPS_START * EPS_DECAY**self.no_training_steps if EPS_DECAY > EPS_END else EPS_END

        self.no_training_steps += 1

        if random.random() > eps_threshold:
            with torch.no_grad():
                return self.network(prep_exploitation(state)).max(1)[1].view(
                    1, 1)
        else:
            return prep_exploration(self.action_space)

    def optimize(self):
        sum_loss = 0

        if len(self.memory) < BATCH_SIZE:
            batch_size = len(self.memory)
        else:
            batch_size = BATCH_SIZE

        s, a, _s, r = prep_mem_batch(self.memory.sample(batch_size))

        non_final_next = torch.cat([sa for sa in _s if sa is not None])
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, _s)))
        state_action_values = self.network(s).gather(1, a.long().unsqueeze(1))

        next_state_values = torch.zeros(batch_size)
        next_state_values[non_final_mask] = self.eval_network(
            non_final_next).detach().max(1)[0]

        expected_q = prep_q(next_state_values, r)
        loss = self.loss_func(state_action_values, expected_q.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()

        self.optimizer.step()

        return loss.item()

    def train(self, num_episodes, render=False, lr_decay=False):

        end_state = np.zeros(self.obs_space)
        state = end_state

        for episode in range(1, num_episodes + 1):
            done = False
            timesteps = 0
            rewards = []
            sum_rewards = []
            loss = 0
            times_alive = []

            while not done:
                if state is end_state:
                    state = self.env.initialize()

                if render: self.env.render()
                action = self.select_action(state)
                _state, reward, done, _ = self.interact(action.item(), state)
                rewards.append(reward)

                timesteps += 1

                if done:
                    _state = end_state

                    sum_reward = np.sum(rewards)
                    sum_rewards.append(sum_reward)

                    mean_loss = loss / timesteps
                    times_alive.append(timesteps)

                    if self.writer:
                        self.writer.add_scalar(
                            self.agent_name + 'duration of episode', timesteps,
                            episode)
                        self.writer.add_scalar(
                            self.agent_name + 'mean reward of episode',
                            sum_reward, episode)
                        self.writer.add_scalar(
                            self.agent_name + 'mean loss of episode',
                            mean_loss, episode)

                self.memory.push(state, action,
                                 _state if _state is not None else end_state,
                                 reward)

                state = _state
                episode_loss = self.optimize()
                loss += episode_loss

            if lr_decay:
                for g in self.optimizer.param_groups:
                    g['lr'] = g['lr'] / (1 + (episode / LR_DECAY))

            if episode % TARGET_UPDATE == 0:
                if self.env.goal(times_alive):
                    print('goal reached your computer is smart :)')
                    self.eval_network.save_params(self.agent_name,
                                                  self.env.env_name)
                    break
                else:
                    times_alive = []

                self.eval_network.update_params(self.network)
                print('episode ', episode, 'loss ', mean_loss, 'reward ',
                      np.mean(sum_rewards))
                #add your custom goals

    def play(self, num_episodes):
        for episode in range(1, num_episodes + 1):
            done = False
            state = self.env.initialize()
            while not done:
                self.env.render()
                action = self.select_action(state)
                _state, reward, done, _ = self.interact(action.item(), state)
                if done:
                    state = self.env.initialize()
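
DQNAgent assumes hyper-parameter constants (BATCH_SIZE, GAMMA, EPS_START, EPS_END, EPS_DECAY, LR, LR_DECAY, TARGET_UPDATE) and a set of prep_* helpers. A sketch of what those helpers could look like, inferred only from how they are called above, is given below; everything here is an assumption (including that the replay buffer stores raw observations, integer actions and float rewards), not the library's actual code.

import random

import torch

# assumed constants
BATCH_SIZE = 32
GAMMA = 0.99
EPS_START, EPS_END, EPS_DECAY = 1.0, 0.05, 0.995
LR, LR_DECAY, TARGET_UPDATE = 1e-3, 1000, 10


def prep_exploitation(state):
    # single observation -> float tensor with a batch dimension
    return torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)


def prep_exploration(action_space):
    # uniformly random action as a 1x1 long tensor
    return torch.tensor([[random.randrange(action_space)]], dtype=torch.long)


def prep_mem_batch(transitions):
    # unzip (state, action, next_state, reward) tuples into batched tensors;
    # next states stay a list so the caller can mask the terminal ones
    states, actions, next_states, rewards = zip(*transitions)
    s = torch.cat([torch.as_tensor(x, dtype=torch.float32).unsqueeze(0) for x in states])
    a = torch.tensor(actions)
    _s = [None if x is None else torch.as_tensor(x, dtype=torch.float32).unsqueeze(0)
          for x in next_states]
    r = torch.tensor(rewards, dtype=torch.float32)
    return s, a, _s, r


def prep_q(next_state_values, rewards):
    # Bellman target: r + gamma * max_a' Q_target(s', a')
    return rewards + GAMMA * next_state_values
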
Example #7
class Agent:
    '''Interact with and learn from the environment.'''
    def __init__(self, state_size, action_size, seed, is_double_q=False):
        '''Initialize an Agent.

        Params
        ======
            state_size (int): the dimension of the state
            action_size (int): the number of actions
            seed (int): random seed
            is_double_q (bool): whether to use Double DQN targets
        '''

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0  # Initialize time step (for tracking LEARN_EVERY_STEP and UPDATE_EVERY_STEP)
        self.running_loss = 0
        self.training_cnt = 0

        self.is_double_q = is_double_q

        self.qnetwork_local = QNetwork(self.state_size, self.action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(self.state_size, self.action_size,
                                        seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.replay_memory = ReplayMemory(BATCH_SIZE, BUFFER_SIZE, seed)

    def act(self, state, mode, epsilon=None):
        '''Returns actions for given state as per current policy.
        
        Params
        ======
            state (array-like): current state
            mode (string): train or test
            epsilon (float): for epsilon-greedy action selection

        '''
        state = torch.from_numpy(state).float().unsqueeze(0).to(
            device)  # shape of state (1, state)

        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local.forward(state)
        self.qnetwork_local.train()

        if mode == 'test':
            action = np.argmax(action_values.cpu().data.numpy()
                               )  # pull action values from gpu to local cpu

        elif mode == 'train':
            if random.random() <= epsilon:  # random action
                action = random.choice(np.arange(self.action_size))
            else:  # greedy action
                action = np.argmax(action_values.cpu().data.numpy(
                ))  # pull action values from gpu to local cpu

        return action

    def step(self, state, action, reward, next_state, done):
        # add new experience in memory
        self.replay_memory.add(state, action, reward, next_state, done)

        # activate learning every few steps
        self.t_step = self.t_step + 1
        if self.t_step % LEARN_EVERY_STEP == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.replay_memory) >= BUFFER_SIZE:
                experiences = self.replay_memory.sample(device)
                self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor

        """

        # compute and minimize the loss
        states, actions, rewards, next_states, dones = experiences

        q_local_chosen_action_values = self.qnetwork_local.forward(
            states).gather(1, actions)
        q_target_action_values = self.qnetwork_target.forward(
            next_states).detach()  # detach from graph, don't backpropagate

        if self.is_double_q == True:
            q_local_next_actions = self.qnetwork_local.forward(
                next_states).detach().max(1)[1].unsqueeze(
                    1)  # shape (batch_size, 1)
            q_target_best_action_values = q_target_action_values.gather(
                1, q_local_next_actions)  # Double DQN

        elif self.is_double_q == False:
            q_target_best_action_values = q_target_action_values.max(
                1)[0].unsqueeze(1)  # shape (batch_size, 1)

        q_target_values = rewards + gamma * q_target_best_action_values * (
            1 - dones)  # zero value for terminal state

        td_errors = q_target_values - q_local_chosen_action_values

        loss = (td_errors**2).mean()

        self.running_loss += float(loss.cpu().data.numpy())
        self.training_cnt += 1

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        if self.t_step % UPDATE_EVERY_STEP == 0:
            self.update(self.qnetwork_local, self.qnetwork_target)

    def update(self, local_network, target_network):
        """Hard update of the model parameters, as indicated in the original paper.

        Params
        ======
            local_network (PyTorch model): weights will be copied from
            target_network (PyTorch model): weights will be copied to
        """
        for local_param, target_param in zip(local_network.parameters(),
                                             target_network.parameters()):
            target_param.data.copy_(local_param.data)
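
The Agent above expects a ReplayMemory(batch_size, buffer_size, seed) that supports add(), len(), and sample(device) returning batched tensors (states, actions, rewards, next_states, dones). A minimal sketch consistent with that interface follows; it is an assumption based only on the usage in learn(), not the original buffer.

import random
from collections import deque, namedtuple

import numpy as np
import torch

Experience = namedtuple('Experience',
                        ('state', 'action', 'reward', 'next_state', 'done'))


class ReplayMemory:
    '''Uniform replay buffer returning batched tensors (sketch).'''

    def __init__(self, batch_size, buffer_size, seed):
        self.batch_size = batch_size
        self.buffer = deque(maxlen=buffer_size)
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append(Experience(state, action, reward, next_state, done))

    def sample(self, device):
        batch = random.sample(self.buffer, self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in batch])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)
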
Example #8
                writer.add_scalar('episode_return/episode', int(ep_return),
                                  int(ep))
            break

        global_t += 1
        ep_t += 1
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        if global_t % N_TIMESTEPS_PER_UPDATE == 0 and len(
                replay_memory) > N_SAMPLES:

            # training loop
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

            for _ in range(N_EPOCHS):
                sample = replay_memory.sample(n_samples=N_SAMPLES,
                                              sample_length=SAMPLE_LENGTH)
                batch = episodes_to_batch(sample)

                values = value_net(batch.states)
                target_values = target_value_net(batch.states)
                shifted_values = torch.cat(
                    (target_values[:, 1:],
                     tensor(torch.zeros(target_values.shape[0], 1))),
                    dim=-1)

                deltas = (-values + batch.rewards +
                          GAMMA * shifted_values.detach())
                advantages = tensor_forward_sum(deltas, GAMMA * LAMBDA)

                value_net_loss = (advantages**2).mean()
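
The fragment calls tensor_forward_sum(deltas, GAMMA * LAMBDA) to turn the one-step TD residuals into GAE-style advantages, but the helper itself is not shown. Below is a sketch of a discounted forward sum over the time axis, assuming deltas has shape (batch, time); the name and shape convention are taken from the call site, the implementation is an assumption.

import torch


def tensor_forward_sum(deltas, discount):
    # advantages[:, t] = sum_k discount**k * deltas[:, t + k]
    advantages = torch.zeros_like(deltas)
    running = torch.zeros(deltas.shape[0], dtype=deltas.dtype)
    for t in reversed(range(deltas.shape[1])):
        running = deltas[:, t] + discount * running
        advantages[:, t] = running
    return advantages
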