def __init__(self, logits, beta=1.0, min_prob=0.0):
    """Build one ``SoftmaxDistribution`` per entry of ``logits``.

    Args:
        logits: Iterable of per-component logits. It is stored unchanged on
            ``self.logits`` and iterated once, in order, to build
            ``self.distributions``.
        beta: Passed as the second positional argument to every
            ``SoftmaxDistribution`` (presumably an inverse temperature;
            confirm against that class).
        min_prob: Passed as the third positional argument to every
            ``SoftmaxDistribution`` (presumably a probability floor;
            confirm against that class).
    """
    self.logits = logits
    self.beta = beta
    self.min_prob = min_prob
    # One distribution per logit entry, sharing the same beta / min_prob.
    self.distributions = [
        SoftmaxDistribution(entry, beta, min_prob) for entry in logits
    ]
def compute_policy(self, h):
    """Turn a feature map into per-slot softmax policies and sampled actions.

    The input is average-pooled with a window of 3, reshaped to
    ``(len(pooled), 1, self.L_stages + 12)``, passed through five
    ``dc*`` -> ``dbn*`` -> ``self.f`` stages and a final ``dc6`` layer with
    no activation. Slice ``[:, i, :]`` of that output is used as the logits
    of slot ``i``.

    Args:
        h: Feature map accepted by ``F.average_pooling_2d``. After pooling
            it must hold exactly ``len(h) * (self.L_stages + 12)`` elements
            for the reshape to succeed.

    Returns:
        tuple: ``(probs, acts)``; two lists of length ``self.action_size``
        holding, for each slot, the ``SoftmaxDistribution`` and the value
        returned by its ``sample()``.
    """
    pooled = F.average_pooling_2d(h, 3)
    x = pooled.reshape((len(pooled), 1, self.L_stages + 12))

    # Hidden stages: layer -> its normalisation layer -> activation.
    hidden_stages = (
        (self.dc1, self.dbn1),
        (self.dc2, self.dbn2),
        (self.dc3, self.dbn3),
        (self.dc4, self.dbn4),
        (self.dc5, self.dbn5),
    )
    for layer, norm in hidden_stages:
        x = self.f(norm(layer(x)))

    # The output layer is left un-activated: it produces raw logits.
    logits = self.dc6(x)

    probs, acts = [], []
    # Build and sample slot by slot so draws happen in slot order.
    for slot in range(self.action_size):
        dist = SoftmaxDistribution(logits[:, slot, :])
        probs.append(dist)
        acts.append(dist.sample())
    return probs, acts
def __call__(self, z):
    """Decode ``z`` into two chained categorical choices.

    The first choice (labelled "location" by the original author) is decoded
    from ``z`` alone. The second (labelled "prob") is decoded from ``z``
    concatenated with an embedding of the sampled first action.

    Args:
        z: Latent input. It is reshaped to ``(1, 64, 2, 2)``, so it must
            contain exactly 256 elements, and it is concatenated along
            axis 1 with the action embedding, so it must be at least 2-D.
            NOTE(review): this looks like it assumes a batch of one;
            confirm with callers.

    Returns:
        tuple: ``([p1, p2], [a1, a2])``; the two ``SoftmaxDistribution``
        objects and the actions sampled from them, in decoding order.
    """
    # --- decode location ---
    feat = F.reshape(z, (1, 64, 2, 2))
    for conv in (self.l1_c1, self.l1_c2, self.l1_c3, self.l1_c4):
        feat = self.f(conv(feat))
    # Last layer has no activation; flatten its output into a single row
    # of logits.
    loc_logits = F.expand_dims(F.flatten(self.l1_c5(feat)), 0)
    p1 = SoftmaxDistribution(loc_logits)
    a1 = p1.sample()

    # --- decode prob ---
    # Feed the raw sampled action back in as a float32 array with a new
    # leading axis.
    a1_input = np.expand_dims(a1.data, 0).astype(np.float32)
    a1_embedding = self.f(self.l1_l1(a1_input))
    conditioned = F.concat((z, a1_embedding), axis=1)
    z2 = self.f(self.l1_l2(conditioned))
    p2 = SoftmaxDistribution(self.l2_l1(z2))
    a2 = p2.sample()

    return [p1, p2], [a1, a2]
def __call__(self, obs, conditional_input):
    """Encode an observation triple and emit two softmax action heads.

    Args:
        obs: Sequence unpacked into exactly three parts
            ``(o_c, o_a1, o_a2)``: an image observation and two further
            inputs that each go through their own encoder layer
            (``e1_a1`` / ``e1_a2``).
        conditional_input: Conditional image input. It is only used when
            ``self.conditional`` is truthy, in which case it is
            concatenated onto the image observation along axis 1.

    Returns:
        tuple: ``([p1, p2], [a1, a2])``; one ``SoftmaxDistribution`` per
        head and one sample drawn from each (``p1`` first, then ``p2``).
    """
    image_obs, obs_a1, obs_a2 = obs
    if self.conditional:
        # concat image obs and conditional image input
        image_obs = F.concat((image_obs, conditional_input), axis=1)

    # Encoder: the two auxiliary inputs are embedded separately, joined
    # along axis 1, then summed with the image embedding.
    aux_embedding = F.concat(
        (self.f(self.e1_a1(obs_a1)), self.f(self.e1_a2(obs_a2))),
        axis=1,
    )
    image_embedding = self.f(self.e1_c1(image_obs))
    fused = self.f(self.e2_l1(image_embedding + aux_embedding))
    # NOTE(review): ``self.lstm`` presumably carries recurrent state between
    # calls; confirm where it is reset.
    h = self.lstm(fused)

    # Decoder: both heads read the same hidden state ``h``.
    p1 = SoftmaxDistribution(self.d_a1_l2(self.f(self.d_a1_l1(h))))
    p2 = SoftmaxDistribution(self.d_a2_l2(self.f(self.d_a2_l1(h))))

    a1 = p1.sample()
    a2 = p2.sample()
    return [p1, p2], [a1, a2]
def __call__(
    self,
    states: Any,  # (b, n_input_channel, ROW, COLUMN)
) -> Any:
    """Compute the action distribution for a batch of board states.

    ``states`` is passed through ``layer1`` .. ``layer9`` in order, then
    ``layer10`` produces the policy logits. The output of
    ``self.valid_moves(states)`` is added to those logits before they are
    wrapped in a ``SoftmaxDistribution``.

    Args:
        states: Batch of states; per the original author's annotation,
            shaped ``(b, n_input_channel, ROW, COLUMN)``.

    Returns:
        A ``SoftmaxDistribution`` over the mask-adjusted logits; per the
        original annotation the logits are ``(b, ROW * COLUMN + 1)``.
    """
    trunk = (
        self.layer1,
        self.layer2,
        self.layer3,
        self.layer4,
        self.layer5,
        self.layer6,
        self.layer7,
        self.layer8,
        self.layer9,
    )
    h = states
    for layer in trunk:
        h = layer(h)

    policies = self.layer10(h)  # (b, ROW * COLUMN + 1)
    # Mask is additive on the logits; presumably strongly negative for
    # illegal moves. Confirm against ``valid_moves``.
    policies += self.valid_moves(states)

    # Note: Categorical is not differentiable.
    return SoftmaxDistribution(policies)
def __call__(self, x):
    """Wrap the raw network output for ``x`` in an action distribution.

    Args:
        x: Input forwarded unchanged to ``self.get_raw_value``.

    Returns:
        A ``SoftmaxDistribution`` (built with ``min_prob=0.0``) when
        ``self.action_wrapper`` equals ``'discrete'``; otherwise a
        ``ContinuousDeterministicDistribution`` over the same raw value.
    """
    # Evaluate the mode first, then the raw value, as the original did.
    is_discrete = self.action_wrapper == 'discrete'
    raw_value = self.get_raw_value(x)
    if not is_discrete:
        return ContinuousDeterministicDistribution(raw_value)
    return SoftmaxDistribution(raw_value, min_prob=0.0)