Exemple #1
0
 def __init__(self, logits, beta=1.0, min_prob=0.0):
     """Wrap each entry of ``logits`` in its own ``SoftmaxDistribution``.

     Args:
         logits: Iterable of logit arrays; one distribution is built per
             element, in iteration order.
         beta: Forwarded unchanged to every ``SoftmaxDistribution``.
         min_prob: Forwarded unchanged to every ``SoftmaxDistribution``.
     """
     self.distributions = []
     self.beta = beta
     self.min_prob = min_prob
     self.logits = logits
     # Every per-logit distribution shares the same beta / min_prob.
     self.distributions.extend(
         SoftmaxDistribution(entry, beta, min_prob) for entry in logits)
    def compute_policy(self, h):
        """Build one softmax distribution per action slot and sample from each.

        The input is average-pooled (window 3), reshaped to
        ``(len(h), 1, self.L_stages + 12)``, passed through five
        ``dcN -> dbnN -> self.f`` stages and a final ``dc6`` layer. Slice
        ``i`` along axis 1 of the result parameterizes the i-th distribution.

        Args:
            h: Feature array fed to ``F.average_pooling_2d``.

        Returns:
            ``(probs, acts)``: two lists of length ``self.action_size``
            holding the ``SoftmaxDistribution`` objects and one sample drawn
            from each, in index order.
        """
        pooled = F.average_pooling_2d(h, 3)
        x = pooled.reshape((len(pooled), 1, self.L_stages+12))

        # Five identical stages: layer, then its normalizer, then activation.
        stages = (
            (self.dc1, self.dbn1),
            (self.dc2, self.dbn2),
            (self.dc3, self.dbn3),
            (self.dc4, self.dbn4),
            (self.dc5, self.dbn5),
        )
        for layer, norm in stages:
            x = self.f(norm(layer(x)))
        logits = self.dc6(x)  # final layer has no normalizer / activation

        probs = [SoftmaxDistribution(logits[:, i, :])
                 for i in range(self.action_size)]
        # Samples are drawn in index order, one per distribution.
        acts = [dist.sample() for dist in probs]

        return probs, acts
Exemple #3
0
    def __call__(self, z):
        """Decode ``z`` into two chained softmax distributions and samples.

        The first distribution comes from reshaping ``z`` to
        ``(1, 64, 2, 2)`` and running it through ``l1_c1`` .. ``l1_c5``.
        Its sample is embedded by ``l1_l1``, concatenated with ``z`` and
        passed through ``l1_l2`` / ``l2_l1`` to produce the second one.

        Args:
            z: Latent input; must be reshapeable to ``(1, 64, 2, 2)`` and
                concatenable along axis 1 with the ``l1_l1`` output.

        Returns:
            ``(probs, acts)`` where ``probs == [p1, p2]`` and
            ``acts == [a1, a2]``.
        """
        # decode location
        feat = F.reshape(z, (1, 64, 2, 2))
        for conv in (self.l1_c1, self.l1_c2, self.l1_c3, self.l1_c4):
            feat = self.f(conv(feat))
        # The last layer's output is flattened and given a leading axis of 1.
        loc_logits = F.expand_dims(F.flatten(self.l1_c5(feat)), 0)
        p1 = SoftmaxDistribution(loc_logits)
        a1 = p1.sample()

        # decode prob
        # The sampled index is cast to float32 so it can be fed to l1_l1.
        a1_input = np.expand_dims(a1.data, 0).astype(np.float32)
        h_a1 = F.concat((z, self.f(self.l1_l1(a1_input))), axis=1)

        z2 = self.f(self.l1_l2(h_a1))
        p2 = SoftmaxDistribution(self.l2_l1(z2))
        a2 = p2.sample()

        return [p1, p2], [a1, a2]
Exemple #4
0
    def __call__(self, obs, conditional_input):
        """Encode an observation triple and emit two action distributions.

        Args:
            obs: Triple ``(o_c, o_a1, o_a2)``. ``o_c`` goes through
                ``e1_c1``; ``o_a1`` and ``o_a2`` go through ``e1_a1`` and
                ``e1_a2``.
            conditional_input: Concatenated onto ``o_c`` along axis 1, but
                only when ``self.conditional`` is truthy; otherwise unused.

        Returns:
            ``(probs, acts)``: ``probs`` holds the two
            ``SoftmaxDistribution`` heads, ``acts`` one sample from each,
            in the same order.
        """
        o_c, o_a1, o_a2 = obs

        if self.conditional:
            # concat image obs and conditional image input
            o_c = F.concat((o_c, conditional_input), axis=1)

        # encoder part: two action embeddings joined on axis 1, summed with
        # the embedding of o_c, then a linear layer and the LSTM.
        action_feats = F.concat(
            (self.f(self.e1_a1(o_a1)), self.f(self.e1_a2(o_a2))), axis=1)
        image_feat = self.f(self.e1_c1(o_c))
        hidden = self.lstm(self.f(self.e2_l1(image_feat + action_feats)))

        # decoder part: both heads read the same LSTM output.
        heads = (
            (self.d_a1_l1, self.d_a1_l2),
            (self.d_a2_l1, self.d_a2_l2),
        )
        probs = [SoftmaxDistribution(out_layer(self.f(hid_layer(hidden))))
                 for hid_layer, out_layer in heads]
        acts = [dist.sample() for dist in probs]

        return probs, acts
 def __call__(
         self,
         states: Any,  # (b, n_input_channel, ROW, COLUMN)
 ) -> Any:
     """Map board states to a softmax distribution over moves.

     ``states`` is passed through ``layer1`` .. ``layer10`` in order. The
     output of ``self.valid_moves(states)`` is then added to the result
     before it is wrapped in a ``SoftmaxDistribution``.

     Args:
         states: Input batch (shape per the signature comment).

     Returns:
         The ``SoftmaxDistribution`` built from the masked policy logits.
     """
     hidden_layers = (
         self.layer1, self.layer2, self.layer3,
         self.layer4, self.layer5, self.layer6,
         self.layer7, self.layer8, self.layer9,
     )
     h = states
     for layer in hidden_layers:
         h = layer(h)
     policies = self.layer10(h)  # (b, ROW * COLUMN + 1)
     # The mask is additive on the logits.
     policies += self.valid_moves(states)
     # Note: Categorical is not differentiable.
     return SoftmaxDistribution(policies)
Exemple #6
0
 def __call__(self, x):
     """Wrap ``self.get_raw_value(x)`` in the matching distribution type.

     Returns a ``SoftmaxDistribution`` (with ``min_prob=0.0``) when
     ``self.action_wrapper`` equals ``'discrete'``; any other value yields
     a ``ContinuousDeterministicDistribution``.
     """
     if self.action_wrapper != 'discrete':
         return ContinuousDeterministicDistribution(self.get_raw_value(x))
     return SoftmaxDistribution(self.get_raw_value(x), min_prob=0.0)