Example #1
    def act(self, obs):
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            action_value = self.model(
                self.batch_states([obs], self.xp, self.phi))

            embedding = self.model.embedding

            # Mix parametric and non-parametric Q-values when the value buffer is non-empty
            if len(self.value_buffer) > 0:
                q_np = self.value_buffer.compute_q(embedding=embedding)
                q_theta = action_value.q_values.array
                q = Variable((1 - self.lamda) * q_theta + self.lamda * q_np)
                q = DiscreteActionValue(q)
                q = float(q.max.array)

            else:
                q = float(action_value.max.array)

        action = cuda.to_cpu(action_value.greedy_actions.array)[0]

        # Update stats
        self.average_q *= self.average_q_decay
        self.average_q += (1 - self.average_q_decay) * q

        self.logger.debug('t:%s q:%s action_value:%s', self.t, q, action_value)

        self.eval_t += 1
        self._backup_if_necessary(self.eval_t, embedding)

        return action
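For reference, the λ-weighted mixing performed in act() above reduces to a few lines of NumPy; the values below are illustrative only, and note that the examples in this listing do not all agree on which term receives the λ weight (compare Example #13).

# Minimal sketch of mixing parametric (q_theta) and non-parametric (q_np) Q-values,
# following the convention used in act() above. Values are illustrative only.
import numpy as np

lamda = 0.4  # illustrative; the agent stores this as self.lamda
q_theta = np.array([[1.0, 2.0, 0.5]], dtype=np.float32)  # from the Q-network
q_np = np.array([[0.8, 1.5, 2.0]], dtype=np.float32)     # from the value buffer

q_mixed = (1 - lamda) * q_theta + lamda * q_np
q = float(q_mixed.max())  # scalar tracked by the running average statistic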
Example #2
 def __call__(self, x, eva=False, test=False):
     q = self.q_func(x)
     self.embedding = self.get_embedding()
     if not eva or self.lambdas == 0 or self.lambdas == 1:
         return q
     qnp = self.non_q.get_q(self.embedding.array)
     qout = self.lambdas * q.q_values + (1 - self.lambdas) * qnp
     return DiscreteActionValue(qout)
Example #3
 def __call__(self, x, eva=False, test=False):
     q = self.q_func(x)
     self.embedding = self.get_embedding()
     if not eva or self.lambdas == 0 or self.lambdas == 1:
         return q
     # Non-parametric Q-values from the value buffer
     qnp = self.non_q.get_q(self.embedding.array)
     # Mix the parametric and non-parametric Q-values
     qout = self.lambdas * q.q_values + (1 - self.lambdas) * qnp
     return DiscreteActionValue(qout)
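The value buffer queried above (self.non_q.get_q in Examples #2-#4, value_buffer.compute_q in Examples #1 and #13) is not part of this listing. The sketch below is a hypothetical, simplified stand-in, assuming a k-nearest-neighbour average over stored embeddings; it is not the implementation used by these repositories.

# Hypothetical, simplified value buffer for illustration only: stores (embedding, Q)
# pairs and returns the mean Q-values of the k nearest stored embeddings for a
# single 1-D query embedding. Not the actual class used in the examples above.
import numpy as np

class SimpleValueBuffer(object):

    def __init__(self, k=5):
        self.k = k
        self.embeddings = []  # list of 1-D embedding vectors
        self.q_values = []    # list of per-action Q-value arrays

    def __len__(self):
        return len(self.embeddings)

    def add(self, embedding, q):
        self.embeddings.append(np.asarray(embedding, dtype=np.float32))
        self.q_values.append(np.asarray(q, dtype=np.float32))

    def get_q(self, embedding):
        # Euclidean distances from the query to every stored embedding
        dists = np.linalg.norm(np.stack(self.embeddings) - embedding, axis=1)
        nearest = np.argsort(dists)[:self.k]
        # Average the stored Q-values of the k nearest neighbours
        return np.mean(np.stack([self.q_values[i] for i in nearest]), axis=0)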
Example #4
 def __call__(self, x, eva=False, test=False):
     """TODO: stateを受け取って, Q値を返す"""
     q = self.q_func(x)
     self.embedding = self.get_embedding()
     if not eva or self.lambdas == 0:
         return q
     # Non-parametric Q-values from the value buffer
     qnp = self.non_q.get_q(self.embedding.array)
     # Mix the parametric and non-parametric Q-values
     qout = self.lambdas * q.q_values + (1 - self.lambdas) * qnp
     return DiscreteActionValue(qout)
Example #5
 def forward(self, x):
     """Compute Q-values of actions for given observations."""
     # Split the observation into a spatial map (channel 0) and a flat feature vector (channel 1)
     x1 = x[:, 0, :, :].reshape((-1, 1, obs_size * 2 + 1, obs_size * 2 + 1))
     x2 = x[:, 1, :, :].reshape((-1, (obs_size * 2 + 1) ** 2))
     if x2.shape[0] == 1:
         x2 = np.tile(x2, (minibatch_size, 1))
     # Convolutional feature extraction from the spatial channel
     h = F.relu(self.bn1(self.conv1(x1)))
     h = F.relu(self.bn2(self.conv2(h)))
     h = F.relu(self.bn3(self.conv3(h)))
     # Flatten and concatenate the flat feature vector before the output layer
     h = F.concat((F.reshape(h, (h.shape[0], -1)), x2), axis=1)
     h = self.l(h)
     return DiscreteActionValue(h)
Example #6
    def __call__(self, x, test=False):
        self.embedding = self.hout(x)
        activation = F.relu(self.embedding)
        batch_size = x.shape[0]
        # Advantage stream; subtract the mean advantage (dueling aggregation)
        ya = self.a_stream(activation)
        mean = F.reshape(F.sum(ya, axis=1) / self.num_actions, (batch_size, 1))
        ya, mean = F.broadcast(ya, mean)
        ya -= mean

        # State-value stream, broadcast over actions and added to the centred advantages
        ys = self.v_stream(activation)
        ya, ys = F.broadcast(ya, ys)
        q = ya + ys
        return DiscreteActionValue(q)
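For reference, the dueling aggregation computed above is Q(s, a) = V(s) + A(s, a) - mean_a A(s, a); a minimal NumPy sketch on toy numbers:

# Dueling aggregation on toy numbers: centre the advantages, then add the state value.
import numpy as np

ya = np.array([[1.0, 3.0, 2.0]], dtype=np.float32)  # advantage stream output
ys = np.array([[0.5]], dtype=np.float32)            # state-value stream output

q = ys + ya - ya.mean(axis=1, keepdims=True)
# q == [[-0.5, 1.5, 0.5]]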
Example #7
 def __call__(self, x):
     h = self.model(x)
     return DiscreteActionValue(h)
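Example #7 is the simplest wrapper. As a standalone illustration (not taken from any of the listed repositories), DiscreteActionValue can also be constructed directly from a batch of Q-values and queried for the greedy action and maximum value:

# Standalone usage sketch of chainerrl's DiscreteActionValue on a toy Q-value batch.
import numpy as np
import chainer
from chainerrl.action_value import DiscreteActionValue

q_values = chainer.Variable(np.array([[0.1, 0.5, -0.2]], dtype=np.float32))
action_value = DiscreteActionValue(q_values)

print(action_value.greedy_actions.array)  # [1]
print(float(action_value.max.array))      # 0.5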
Example #8
 def __call__(self, x):
     h = F.relu(self.fc(x))
     h = self.lstm(h)
     return DiscreteActionValue(self.out(h))
Example #9
 def __call__(self, x, test=False):
     h = F.relu(self.fc(x, test=test))
     h = self.lstm(h)
     return DiscreteActionValue(self.out(h))
Example #10
 def __call__(self, x, test=False):
     h = self.model(x, test=test)
     return DiscreteActionValue(h)
Example #11
 def __call__(self, x, test=False):
     self.embedding = self.hout(x)
     return DiscreteActionValue(self.qout(F.relu(self.embedding)))
Example #12
    def __call__(self, x):
        if self.use_tuple:
            batch_size = x[0].shape[0]
            h = x[0]
        else:
            batch_size = x.shape[0]
            h = x

        for l in self.conv_layers:
            h = self.activation(l(h))

        if self.use_tuple:
            h = F.reshape(h, (batch_size, -1))

            # concatenate additional observations
            h = F.concat((h, x[1]))

        # State value, shared across all branches
        v = self.v_stream(h)

        # One dueling head per action branch: Q = V + A - mean(A)
        branches = []
        for i, branch_size in enumerate(self.branch_sizes):
            a = getattr(self, 'a_branch_{}'.format(i + 1))(h)
            mean_a = F.reshape(F.sum(a, axis=1) / branch_size, (batch_size, 1))
            mean_a = F.broadcast_to(mean_a, a.shape)
            q = F.broadcast_to(v, a.shape) + a - mean_a
            branches.append(DiscreteActionValue(q))

        return BranchedActionValue(branches)
Example #13
    def act_and_train(self, obs, reward):

        with chainer.using_config('train', False), chainer.no_backprop_mode():
            action_value = self.model(
                # The 84*84 observation is reshaped to 1*84*84,
                # since the model expects a batched input;
                # wrapping obs in [] adds that extra dimension.
                self.batch_states([obs], self.xp, self.phi))

            # Get the embedding computed by the model
            embedding = self.model.embedding

            # Mix the parametric and non-parametric Q-values
            if len(self.value_buffer) > 0:
                q_np = self.value_buffer.compute_q(embedding=embedding)
                q_theta = action_value.q_values.array
                q = Variable(self.lamda * q_theta + (1 - self.lamda) * q_np)
                q = DiscreteActionValue(q)
                q = float(q.max.array)
                #q_mixed = (1-self.lamda)*q_theta + self.lamda*q_np
                #q = float(q_mixed.max())
                #greedy_action = cuda.to_cpu(q_mixed.argmax())

            else:
                q = float(action_value.max.array)

        greedy_action = cuda.to_cpu(action_value.greedy_actions.array)[0]

        # Update stats
        self.average_q *= self.average_q_decay
        self.average_q += (1 - self.average_q_decay) * q

        self.logger.debug('t:%s q:%s action_value:%s', self.t, q, action_value)

        action = self.explorer.select_action(self.t,
                                             lambda: greedy_action,
                                             action_value=action_value)
        self.t += 1

        # Update the target network
        if self.t % self.target_update_interval == 0:
            self.sync_target_network()

        if self.last_state is not None:
            assert self.last_action is not None
            assert self.last_embedding is not None
            # Add a transition to the replay buffer
            self.replay_buffer.add(state=self.last_state,
                                   action=self.last_action,
                                   reward=reward,
                                   embedding=self.last_embedding,
                                   next_state=obs,
                                   next_action=action,
                                   is_state_terminal=False)

        self._backup_if_necessary(self.t, embedding)

        self.last_state = obs
        self.last_action = action
        self.last_embedding = embedding

        self.replay_updater.update_if_necessary(self.t)

        self.logger.debug('t:%s r:%s a:%s', self.t, reward, action)

        return self.last_action