def _feature_repl(hs_flatten, pairs, ckeys, lengths):
    xp = chainer.cuda.get_array_module(hs_flatten)
    begins, ends = pairs.T
    begins_ = xp.asarray(begins)
    ends_ = xp.asarray(ends)
    ckeys_ = xp.asarray(ckeys)

    h_b = F.embed_id(begins_, hs_flatten)
    h_b_pre = F.embed_id(begins_ - 1, hs_flatten, ignore_label=-1)
    out_of_span = np.insert(lengths[:-1].cumsum(), 0, 0) - 1
    is_out_of_span = np.isin(begins - 1, out_of_span)
    h_b_pre = F.where(
        xp.asarray(is_out_of_span)[:, None],
        xp.zeros_like(h_b_pre.data), h_b_pre)

    h_e = F.embed_id(ends_, hs_flatten)
    h_e_post = F.embed_id(ends_ + 1, hs_flatten, hs_flatten.shape[0])
    out_of_span = lengths.cumsum()
    is_out_of_span = np.isin(ends + 1, out_of_span)
    h_e_post = F.where(
        xp.asarray(is_out_of_span)[:, None],
        xp.zeros_like(h_e_post.data), h_e_post)

    h_k_pre = F.embed_id(ckeys_ - 1, hs_flatten)
    h_k_post = F.embed_id(ckeys_ + 1, hs_flatten)

    repl1 = F.absolute(h_b_pre * (h_b - h_k_post))
    repl2 = F.absolute(h_e_post * (h_e - h_k_pre))
    return repl1, repl2
def __call__(self, y, y_label, c_pre, h_pre, train=True):
    # input word embedding
    e = F.tanh(self.ye(y))
    e_l = F.tanh(self.le(y_label))
    # LSTM
    c_tmp, h_tmp = F.lstm(
        c_pre,
        F.dropout(self.eh(F.concat((e, e_l))), ratio=0.2, train=train)
        + self.hh(h_pre))
    enable = chainer.Variable(
        chainer.Variable(y.data != -1).data.reshape(len(y), 1))
    c_next = F.where(enable, c_tmp, c_pre)
    h_next = F.where(enable, h_tmp, h_pre)
    # output using the gate at
    at = F.sigmoid(self.vt(h_next))
    pg_pre = self.wg(h_next)
    pg = pg_pre * F.broadcast_to(
        (1 - at), shape=(pg_pre.data.shape[0], pg_pre.data.shape[1]))
    pe_pre = self.we(h_next)
    pe = pe_pre * F.broadcast_to(
        at, shape=(pe_pre.data.shape[0], pe_pre.data.shape[1]))
    # version without broadcast:
    # pg = chainer.Variable(self.wg(h_next).data * (1 - at).data)
    # pe = chainer.Variable(self.we(h_next).data * at.data)
    return F.concat((pg, pe)), at, c_next, h_next
def __call__(self, w, train=True, dpratio=0.5):
    x = self.embed(w)
    self.maybe_init_state(len(x.data), x.data.dtype)
    for i in range(self.num_layers):
        if self.ignore_label is not None:
            enable = (x.data != 0)
        c = F.dropout(self.get_c(i), train=train, ratio=dpratio)
        h = F.dropout(self.get_h(i), train=train, ratio=dpratio)
        x = F.dropout(x, train=train, ratio=dpratio)
        c, h = self.get_l(i)(c, h, x)
        if self.ignore_label is not None:
            self.set_c(i, F.where(enable, c, self.get_c(i)))
            self.set_h(i, F.where(enable, h, self.get_h(i)))
        else:
            self.set_c(i, c)
            self.set_h(i, h)
        x = self.get_h(i)
    x = F.dropout(x, train=train, ratio=dpratio)
    return self.hy(x)
def __call__(self, fp, y):
    mean_activation = F.mean(fp, axis=0)
    rho = 0.01
    zero_array = chainer.Variable(
        numpy.zeros(mean_activation.shape, dtype=numpy.float32))
    small_array = zero_array + 0.001
    cond = (mean_activation.data != 0)
    cond = chainer.Variable(cond)
    mean_activation = F.where(cond, mean_activation, small_array)
    self.kl_div = rho * F.sum(
        F.where(
            cond,
            self.p * F.log(self.p / mean_activation) +
            (1 - self.p) * F.log((1 - self.p) / (1 - mean_activation)),
            zero_array))
    # sampling z
    eps = numpy.random.uniform(0.0, 1.0, fp.data.shape).astype(numpy.float32)
    eps = chainer.Variable(eps)
    if self.train:
        z = self.logistic_func(fp - eps)
        # z = fp
    else:
        z = fp
    h = F.relu(self.l1(z))
    h = F.relu(self.l2(h))
    h = self.l3(h)
    self.rec_loss = F.sigmoid_cross_entropy(h, y)
    self.accuracy = F.binary_accuracy(h, y)
    self.loss = self.rec_loss + self.kl_div
    return self.loss, self.accuracy
def __call__(self, x, mask):
    # h = self.c(x) - self.b
    self.m.W.data = self.xp.array(self.maskW)  # mask windows are set to 1
    h = self.c(x * mask)  # (B, C, H, W)
    B, C, H, W = h.shape
    b = F.transpose(F.broadcast_to(self.c.b, (B, H, W, C)), (0, 3, 1, 2))
    h = h - b
    mask_sums = self.m(mask)
    mask_new = (self.xp.sign(mask_sums.data - 0.5) + 1.0) * 0.5
    mask_new_b = mask_new.astype("bool")
    mask_sums = F.where(
        mask_new_b, mask_sums,
        0.01 * Variable(self.xp.ones(mask_sums.shape).astype("f")))
    h = h / mask_sums + b
    mask_new = Variable(mask_new)
    h = F.where(mask_new_b, h, Variable(self.xp.zeros(h.shape).astype("f")))
    # elif self.sample == "up":
    #     h = F.unpooling_2d(x, 2, 2, 0, cover_all=False)
    #     h = self.c(h)
    # else:
    #     print("unknown sample method %s" % self.sample)
    if self.bn:
        h = self.batchnorm(h)
    if self.noise:
        h = add_noise(h)
    if self.dropout:
        h = F.dropout(h)
    if self.activation is not None:
        h = self.activation(h)
    return h, mask_new
def __call__(self, w, train=True, dpratio=0.2):
    x = self.embed(w)
    self.maybe_init_state(len(x.data), x.data.dtype)
    for i in range(self.num_layers):
        c = F.dropout(self.cs[i], train=train, ratio=dpratio)
        h = (self.xhs[i](F.dropout(x, train=train, ratio=dpratio)) +
             self.hhs[i](F.dropout(self.hs[i], train=train, ratio=dpratio)))
        assert c.data.shape == (len(x.data), self.hidden_size)
        assert h.data.shape == (len(x.data), 4 * self.hidden_size)
        c, h = F.lstm(c, h)
        assert c.data.shape == (len(x.data), self.hidden_size)
        assert h.data.shape == (len(x.data), self.hidden_size)
        if self.ignore_label is not None:
            enable = (x.data != 0)
            self.cs[i] = F.where(enable, c, self.cs[i])
            self.hs[i] = F.where(enable, h, self.hs[i])
        else:
            self.cs[i] = c
            self.hs[i] = h
        x = self.hs[i]
    return self.hy(x)
def __call__(self, x, c_pre, h_pre, train=True):
    e = F.tanh(self.xe(x))
    c_tmp, h_tmp = F.lstm(c_pre, self.eh(e) + self.hh(h_pre))
    enable = chainer.Variable(
        chainer.Variable(x.data != -1).data.reshape(len(x), 1))
    c_next = F.where(enable, c_tmp, c_pre)
    h_next = F.where(enable, h_tmp, h_pre)
    return c_next, h_next
def __call__(self, x, t):
    h = self.base(x, layers=['res5'])['res5']
    self.cam = h
    h = _global_average_pooling_2d(h)
    ############################################################
    # ArcFace head appended after ResNet50
    ############################################################
    # --------------------------- cos(theta) & phi(theta) ---------------------------
    cosine = F.linear(F.normalize(h), F.normalize(self.weight))  # fc8
    sine = F.sqrt(F.clip(1.0 - F.square(cosine), 0, 1))
    phi = cosine * cos_m - sine * sin_m
    if easy_margin:
        phi = F.where(cosine.data > 0, phi, cosine)
    else:
        phi = F.where(cosine.data > th, phi, cosine - mm)
    # --------------------------- convert label to one-hot ---------------------------
    one_hot = cp.eye(10)[t].astype(cp.float32)
    one_hot = Variable(one_hot)
    # equivalent of torch.where(out_i = x_i if condition_i else y_i)
    output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
    output *= s
    # h = self.fc(h)
    return output
def _length_aware_softmax(e, l0, l1, xp):
    # e: (B, T0, T1)
    bs, t0, t1 = e.shape
    l0 = l0.reshape((bs, 1, 1))
    l1 = l1.reshape((bs, 1, 1))
    mask0 = (xp.tile(xp.arange(t0).reshape(1, t0, 1), (bs, 1, 1)) < l0).astype(e.dtype)
    mask1 = (xp.tile(xp.arange(t1).reshape(1, t1, 1), (bs, 1, 1)) < l1).astype(e.dtype)
    mask = (xp.matmul(mask0, mask1.swapaxes(1, 2))).astype(bool)
    # mask: (B, T0, T1)
    mask = chainer.Variable(mask)
    padding = chainer.Variable(xp.zeros(e.shape, dtype=e.dtype))

    e_max = F.max(e, keepdims=True)
    e_masked = F.where(mask, e, padding)
    e_masked = e_masked - F.broadcast_to(e_max, e.shape)

    e_sum0 = F.reshape(F.logsumexp(e_masked, axis=1), (bs, 1, t1))
    e_sum1 = F.reshape(F.logsumexp(e_masked, axis=2), (bs, t0, 1))

    s1 = F.exp(e_masked - F.broadcast_to(e_sum0, e.shape))
    s2 = F.exp(e_masked - F.broadcast_to(e_sum1, e.shape))
    s1 = F.where(mask, s1, padding)
    s2 = F.where(mask, s2, padding)
    return s1, s2
def __call__(self, x, mask):
    self.m.W.data = self.xp.array(self.maskW)  # mask windows are set to 1
    h = self.c(x * mask)  # (B, C, H, W)
    B, C, H, W = h.shape
    b = F.transpose(F.broadcast_to(self.c.b, (B, H, W, C)), (0, 3, 1, 2))
    h = h - b
    mask_sums = self.m(mask)
    mask_new = (self.xp.sign(mask_sums.data - 0.5) + 1.0) * 0.5
    mask_new_b = mask_new.astype("bool")
    mask_sums = F.where(
        mask_new_b, mask_sums,
        0.01 * Variable(self.xp.ones(mask_sums.shape).astype("f")))
    h = h / mask_sums + b
    mask_new = Variable(mask_new)
    h = F.where(mask_new_b, h, Variable(self.xp.zeros(h.shape).astype("f")))
    if self.bn:
        h = self.batchnorm(h)
    if self.noise:
        h = add_noise(h)
    if self.dropout:
        h = F.dropout(h)
    if self.activation is not None:
        h = self.activation(h)
    return h, mask_new
def __call__(self, x, c_pre, h_pre, train=True):
    e = F.tanh(self.xe(x))
    c_tmp, h_tmp = F.lstm(c_pre, self.eh(e) + self.hh(h_pre))
    # flag per example: True where x is not the padding id -1
    enable = chainer.Variable(
        chainer.Variable(x.data != -1).data.reshape(len(x), 1))
    c_next = F.where(enable, c_tmp, c_pre)  # if x != -1 use c_tmp, else keep c_pre
    h_next = F.where(enable, h_tmp, h_pre)  # if x != -1 use h_tmp, else keep h_pre
    return c_next, h_next
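# Hedged illustration (not from the original code): the "enable" pattern above keeps
# the previous LSTM state wherever the input id is the padding value -1. A minimal,
# self-contained sketch with assumed sizes:
import numpy as np
import chainer
import chainer.functions as F

batch, hidden = 4, 8
x = np.array([3, 7, -1, 2], dtype=np.int32)        # -1 marks a padded position
c_pre = chainer.Variable(np.zeros((batch, hidden), dtype=np.float32))
c_tmp = chainer.Variable(np.random.randn(batch, hidden).astype(np.float32))

enable = np.broadcast_to((x != -1).reshape(batch, 1), (batch, hidden))
c_next = F.where(enable, c_tmp, c_pre)             # padded rows keep the old state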
def __call__(self, embeded_x, m_prev, h_prev, feed_previous):
    lstm_in = F.dropout(self.W_e(embeded_x) + self.W_h(h_prev),
                        ratio=self.dropout_ratio)
    m_tmp, h_tmp = F.lstm(m_prev, lstm_in)
    m = F.where(feed_previous, m_prev, m_tmp)
    h = F.where(feed_previous, h_prev, h_tmp)
    return m, h
def __call__(self, y, c_pre, h_pre, train=True):
    e = F.tanh(self.ye(y))
    c_tmp, h_tmp = F.lstm(
        c_pre,
        F.dropout(self.eh(e), ratio=0.1, train=train) + self.hh(h_pre))
    enable = chainer.Variable(
        chainer.Variable(y.data != -1).data.reshape(len(y), 1))
    c_next = F.where(enable, c_tmp, c_pre)
    h_next = F.where(enable, h_tmp, h_pre)
    f = F.tanh(self.hf(h_next))
    return self.fy(f), c_next, h_next
def __call__(self, x_block, y_in_block, y_out_block):
    batch = len(x_block)
    # embed
    ex_block = F.dropout(
        self.make_input_embedding(self.embed_x, x_block), self.dropout)
    ey_block = F.dropout(
        self.make_input_embedding(self.embed_y, y_in_block), self.dropout)
    eyy_block = F.dropout(
        self.make_input_embedding(self.embed_yy, y_in_block), self.dropout)
    eys = F.transpose(ey_block, (0, 2, 1))
    eyys = F.transpose(eyy_block, (0, 2, 1))
    # gcnn
    h = F.expand_dims(ex_block, axis=1)
    for i in range(self.stack):
        h = self.gcnn[i](h)
    h = F.dropout(F.squeeze(h, axis=1), self.dropout)
    # NStepLSTM decoders
    eys2 = [i for i in eys]
    eyys2 = [i for i in eyys]
    _, _, oss = self.decoder(None, None, eys2)
    _, _, oss2 = self.decoder2(None, None, eyys2)
    ss = F.stack(oss, axis=0)
    ss2 = F.stack(oss2, axis=0)
    # make mask
    mask = (y_in_block[:, :, None] >= 0) * self.xp.ones(
        (self.batch, 1, self.n_units), dtype=bool)
    ss = F.where(mask, ss, self.xp.full(ss.shape, 0, 'f'))
    # compute attention weights
    batch_A = F.batch_matmul(ss, h) * self.scale_score
    mask = ((x_block[:, 0:len(x_block[0]) - self.stack * (self.width - 1)][:, None, :] >= 0)
            * (y_in_block[:, :, None] >= 0))
    batch_A = F.where(mask, batch_A,
                      self.xp.full(batch_A.shape, -self.xp.inf, 'f'))
    batch_A = F.softmax(batch_A, axis=2)
    batch_A = F.where(self.xp.isnan(batch_A.data),
                      self.xp.zeros(batch_A.shape, 'f'), batch_A)
    batch_A, h = F.broadcast(batch_A[:, None], h[:, :, None])
    batch_C = F.sum(batch_A * h, axis=3)
    e = F.transpose(batch_C, (0, 2, 1))
    e = F.squeeze(F.concat(F.split_axis(e, self.batch, axis=0), axis=1))
    ss2 = F.squeeze(F.concat(F.split_axis(ss2, self.batch, axis=0), axis=1))
    t = self.We(e) + self.Ws(ss2)
    t = F.dropout(t, self.dropout)
    concat_ys_out = F.concat(y_out_block, axis=0)
    loss = F.sum(F.softmax_cross_entropy(t, concat_ys_out, reduce='no')) / batch
    chainer.report({'loss': loss.data}, self)
    n_words = concat_ys_out.shape[0]
    perp = self.xp.exp(loss.data * batch / n_words)
    chainer.report({'perp': perp}, self)
    return loss
def __call__(self, y, c_pre, h_pre, hs_enc):
    e = F.tanh(self.ye(y))
    c_tmp, h_tmp = F.lstm(c_pre, self.eh(e) + self.hh(h_pre))
    enable = chainer.Variable(
        chainer.Variable(y.data != -1).data.reshape(len(y), 1))
    c_next = F.where(enable, c_tmp, c_pre)
    h_next = F.where(enable, h_tmp, h_pre)
    ct = self.calculate_alpha(h_next, hs_enc)
    f = F.tanh(self.wc(ct) + self.wh(h_next))
    return self.fy(f), c_next, h_next
def __call__(self, embeded_x, m_prev, h_prev, x):
    batch_size = embeded_x.shape[0]
    lstm_in = self.W(embeded_x) + self.U(h_prev)
    m_tmp, h_tmp = F.lstm(m_prev, lstm_in)
    # True where x is a real token (not IGNORE_LABEL); those positions take the new state
    feed_prev = F.broadcast_to(F.expand_dims(x.data != IGNORE_LABEL, -1),
                               (batch_size, self.hidden_size))
    m = F.where(feed_prev, m_tmp, m_prev)
    h = F.where(feed_prev, h_tmp, h_prev)
    return m, h
def __call__(self, x, enc_out=None, mask=None):
    """
    Args:
        x: paralleled main features in the model;
            Variable of shape (batch, hidden_dim, length)
        enc_out: hidden features from the encoder;
            Variable of shape (batch, hidden_dim, length)
        mask: padding mask or future mask;
            xp-array of shape (batch, length, length),
            False at pad/future positions, otherwise True
    Returns:
        weighted sum of the attended values
    """
    # ksize-1 convolution results in parallel linear projections
    if self.self_attention:
        qkv = F.squeeze(self.W(F.expand_dims(x, axis=3)), axis=3)
        query, key, value = F.split_axis(qkv, 3, axis=1)
    else:
        query = F.squeeze(self.W_Q(F.expand_dims(x, axis=3)), axis=3)
        kv = F.squeeze(self.W_KV(F.expand_dims(enc_out, axis=3)), axis=3)
        key, value = F.split_axis(kv, 2, axis=1)

    # reshape q, k, v into (batch * parallel, dim / parallel, length)
    query = F.concat(F.split_axis(query, self.parallel_num, axis=1), axis=0)
    key = F.concat(F.split_axis(key, self.parallel_num, axis=1), axis=0)
    value = F.concat(F.split_axis(value, self.parallel_num, axis=1), axis=0)
    mask = self.xp.concatenate([mask] * self.parallel_num, axis=0)

    attention_weight = F.batch_matmul(query, key, transa=True) * self.scale
    attention_weight = F.where(
        mask, attention_weight,
        self.xp.full(attention_weight.shape, -np.inf, dtype=np.float32))
    attention_weight = F.softmax(attention_weight, axis=2)
    attention_weight = F.dropout(attention_weight, self.dropout_rate)
    attention_weight = F.where(
        self.xp.isnan(attention_weight.data),
        self.xp.full(attention_weight.shape, 0, dtype=np.float32),
        attention_weight)
    self.attention_weight = copy.deepcopy(attention_weight.data)

    # attention: (batch, q-length, k-length) -> (batch, 1, q-length, k-length)
    # value: (batch, dim/parallel, k-length) -> (batch, dim/parallel, 1, k-length)
    attention_weight, value = F.broadcast(attention_weight[:, None],
                                          value[:, :, None])
    weighted_sum = F.sum(attention_weight * value, axis=3)
    weighted_sum = F.concat(
        F.split_axis(weighted_sum, self.parallel_num, axis=0), axis=1)
    weighted_sum = F.squeeze(
        self.linear(F.expand_dims(weighted_sum, axis=3)), axis=3)
    return weighted_sum
def step(self, y, embeded_y, m_prev, s_prev, batch_size):
    # decode one step
    lstm_in = F.dropout(self.W_e(embeded_y) + self.W_s(s_prev),
                        ratio=self.dropout_ratio)
    m_tmp, s_tmp = F.lstm(m_prev, lstm_in)
    feed_previous = F.broadcast_to(
        F.expand_dims(y.data == self.ignore_label, -1),
        (batch_size, self.decoder_hidden_size))
    m = F.where(feed_previous, m_prev, m_tmp)
    s = F.where(feed_previous, s_prev, s_tmp)
    return m, s
def _log_ndtr(x):
    """Log CDF of the standard normal distribution.

    See https://github.com/scipy/scipy/blob/master/scipy/special/cephes/ndtr.c
    """
    if not isinstance(x, chainer.Variable):
        x = chainer.Variable(x)
    return F.where(
        x.data > 6,
        -_ndtr(-x),
        F.where(
            x.data > -14,
            _safe_log(_ndtr(x)),
            -0.5 * x * x - _safe_log(-x) - 0.5 * np.log(2 * np.pi)))
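# Hedged reference check (not part of the original code): the three branches above
# follow the cephes ndtr.c strategy referenced in the docstring: a symmetric bound for
# large x, the direct log-CDF in the middle range, and an asymptotic tail expansion for
# very negative x. Assuming SciPy is available, the tail branch can be sanity-checked
# against scipy.special.log_ndtr:
import numpy as np
from scipy.special import log_ndtr

x = np.array([-20.0, -30.0, -40.0])
tail = -0.5 * x * x - np.log(-x) - 0.5 * np.log(2 * np.pi)
print(np.allclose(log_ndtr(x), tail, rtol=1e-2))   # True: the expansion is accurate here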
def log_prob(self, x):
    unclipped_elementwise_log_prob = elementwise_gaussian_log_pdf(
        x, self.mean, self.var, self.ln_var)
    std = self.var ** 0.5
    low_log_prob = _gaussian_log_cdf(self.low, self.mean, std)
    high_log_prob = _gaussian_log_sf(self.high, self.mean, std)
    x_data = _unwrap_variable(x)
    elementwise_log_prob = F.where(
        (x_data <= self.low.data),
        low_log_prob,
        F.where(x_data >= self.high.data,
                high_log_prob,
                unclipped_elementwise_log_prob))
    return F.sum(elementwise_log_prob, axis=1)
def __call__(self, y, t, c_pre, h_pre, hs_enc, train=True):
    e = F.tanh(self.ye(y))
    c_tmp, h_tmp = F.lstm(
        c_pre,
        F.dropout(self.eh(e), ratio=0.2, train=train) + self.hh(h_pre))
    enable = chainer.Variable(
        chainer.Variable(y.data != -1).data.reshape(len(y), 1))
    c_next = F.where(enable, c_tmp, c_pre)
    h_next = F.where(enable, h_tmp, h_pre)
    ct = self.calculate_alpha(h_next, hs_enc)
    f = F.tanh(self.wc(ct) + self.wh(h_next))
    if train:
        return self.fy(f, t), c_next, h_next  # return a loss value
    else:
        return self.test_out(f), c_next, h_next  # return prediction vectors
def batched_triangle_intersect_(p0, p1, p2, eps, fn, id, ro, rd, t0, t1):
    xp = chainer.backend.get_array_module(ro)
    BB = p0.shape[0]
    EB = p0.shape[0]
    _, _, H, W = ro.shape[:4]
    p0 = F.broadcast_to(p0.reshape((BB, 3, 1, 1)), (BB, 3, H, W))
    p1 = F.broadcast_to(p1.reshape((BB, 3, 1, 1)), (BB, 3, H, W))
    p2 = F.broadcast_to(p2.reshape((BB, 3, 1, 1)), (BB, 3, H, W))
    fn = F.broadcast_to(fn.reshape((BB, 3, 1, 1)), (BB, 3, H, W))
    id = F.broadcast_to(id.reshape((BB, 1, 1, 1)), (BB, 1, H, W))
    eps = F.broadcast_to(eps.reshape((EB, 1, 1, 1)), (BB, 1, H, W))
    ro = F.broadcast_to(ro.reshape((1, 3, H, W)), (BB, 3, H, W))
    rd = F.broadcast_to(rd.reshape((1, 3, H, W)), (BB, 3, H, W))
    t0 = F.broadcast_to(t0.reshape((1, 1, H, W)), (BB, 1, H, W))
    t1 = F.broadcast_to(t1.reshape((1, 1, H, W)), (BB, 1, H, W))

    aa = p0 - ro
    A = vdot(aa, fn)
    B = vdot(rd, fn)
    B = F.where(xp.abs(B.data) < eps.data, eps, B)
    # tx = F.where((xp.abs(A.data) < 1e-6) & (xp.abs(B.data) < 1e-6), t1, A / B)
    tx = F.maximum(t0, F.minimum(A / B, t1))
    p = ro + tx * rd

    e0 = p0.data - p.data
    e1 = p1.data - p.data
    e2 = p2.data - p.data
    n01 = vcross_(e0, e1, xp)
    n12 = vcross_(e1, e2, xp)
    n20 = vcross_(e2, e0, xp)
    MASK_P = is_positive_(vdot_(n01, n12, xp))
    MASK_Q = is_positive_(vdot_(n12, n20, xp))
    MASK_R = is_positive_(vdot_(n20, n01, xp))
    MASK_B = is_positive_(xp.abs(B.data))
    # MASK_TN = is_positive(tx)
    MASK_T0 = is_positive_(tx.data - t0.data)
    MASK_T1 = is_positive_(t1.data - tx.data)
    b = MASK_P & MASK_Q & MASK_R & MASK_B & MASK_T0 & MASK_T1

    t = F.where(b, tx, t1)
    p = ro + t * rd
    n = -xp.sign(vdot_(rd.data, fn.data, xp)) * fn
    return b, t, p, n, id
def _ndtr(a):
    """CDF of the standard normal distribution.

    See https://github.com/scipy/scipy/blob/master/scipy/special/cephes/ndtr.c
    """
    if not isinstance(a, chainer.Variable):
        a = chainer.Variable(a)
    x = a * NPY_SQRT1_2
    z = abs(x)
    half_erfc_z = 0.5 * F.erfc(z)
    return F.where(
        z.data < NPY_SQRT1_2,
        0.5 + 0.5 * F.erf(x),
        F.where(x.data > 0, 1.0 - half_erfc_z, half_erfc_z))
def mixture_of_discretized_logistics_nll(x, y):
    """
    Args:
        x: (b, c, n, n)
        y: (b, 10 * n_mix, n, n)
    """
    xp = get_array_module(x)
    n_mix = y.shape[1] // 10
    logit_prob = y[:, :n_mix, :, :]
    y = F.reshape(y[:, n_mix:, :, :], x.shape + (n_mix * 3,))
    mean = y[:, :, :, :, 0:n_mix]
    log_scale = y[:, :, :, :, n_mix:2 * n_mix]
    log_scale = F.maximum(log_scale, -7 * xp.ones(log_scale.shape, dtype='f'))
    coeff = F.tanh(y[:, :, :, :, 2 * n_mix:3 * n_mix])

    x = xp.repeat(xp.expand_dims(x, 4), n_mix, 4)
    m1 = F.expand_dims(mean[:, 0, :, :, :], 1)
    m2 = F.expand_dims(
        mean[:, 1, :, :, :] + coeff[:, 0, :, :, :] * x[:, 0, :, :, :], 1)
    m3 = F.expand_dims(
        (mean[:, 2, :, :, :] + coeff[:, 1, :, :, :] * x[:, 0, :, :, :] +
         coeff[:, 2, :, :, :] * x[:, 1, :, :, :]), 1)
    mean = F.concat([m1, m2, m3])

    centered_x = x - mean
    inv_std = F.exp(-log_scale)
    max_in = inv_std * (centered_x + 1. / 255.)
    cdf_max = F.sigmoid(max_in)
    min_in = inv_std * (centered_x - 1. / 255.)
    cdf_min = F.sigmoid(min_in)
    log_cdf_max = max_in - F.softplus(max_in)    # pixel value 0
    log_one_minus_cdf_min = -F.softplus(min_in)  # pixel value 255
    cdf_delta = cdf_max - cdf_min                # pixel values 0 ~ 255
    mid_in = inv_std * centered_x
    log_pdf_mid = mid_in - log_scale - 2. * F.softplus(mid_in)  # middle of the bin

    log_prob = F.where(
        x < -0.999, log_cdf_max,
        F.where(
            x > 0.999, log_one_minus_cdf_min,
            F.where(
                cdf_delta.array > 1e-5,
                F.log(F.maximum(cdf_delta,
                                xp.ones(cdf_delta.shape, dtype='f') * 1e-12)),
                log_pdf_mid - xp.log(127.5))))
    log_prob = F.transpose(F.sum(log_prob, 1), (0, 3, 1, 2))
    log_prob = log_prob + log_prob_from_logit(logit_prob)
    loss = F.logsumexp(log_prob, 1)
    loss = F.sum(loss, axis=(1, 2))
    return -F.mean(loss)
def __call__(self, x, t, qt=None):
    # forward
    z = self.enc(x)
    e = self.vq(z)
    e_ = self.vq(chainer.Variable(z.data))
    scale = t.shape[2] // e.shape[2]
    if self.quantize == 'mulaw':
        y_hat = self.dec(qt, F.unpooling_2d(e, (scale, 1), cover_all=False))
    elif self.quantize == 'mixture':
        y_hat = self.dec(x, F.unpooling_2d(e, (scale, 1), cover_all=False))

    # calculate loss
    if self.quantize == 'mulaw':
        loss1 = F.softmax_cross_entropy(y_hat, t)
    elif self.quantize == 'mixture':
        y_hat = y_hat[:, :30]
        logit_probs, means, log_scales = F.split_axis(y_hat, 3, 1)
        log_scales = F.relu(log_scales + 7) - 7
        y = F.broadcast_to(t, means.shape)

        centered_y = y - means
        inv_stdv = F.exp(-log_scales)
        plus_in = inv_stdv * (centered_y + 1 / (2 ** 16))
        cdf_plus = F.sigmoid(plus_in)
        min_in = inv_stdv * (centered_y - 1 / (2 ** 16))
        cdf_min = F.sigmoid(min_in)

        log_cdf_plus = plus_in - F.softplus(plus_in)
        log_one_minus_cdf_min = -F.softplus(min_in)
        cdf_delta = cdf_plus - cdf_min
        cdf_delta = F.relu(cdf_delta - 1e-12) + 1e-12

        y = F.broadcast_to(t, log_cdf_plus.shape).array
        log_probs = F.where(
            y < -0.999, log_cdf_plus,
            F.where(y > 0.999, log_one_minus_cdf_min, F.log(cdf_delta)))
        log_probs = log_probs + F.log_softmax(logit_probs)
        loss1 = -F.mean(log_probs)
    loss2 = F.mean((chainer.Variable(z.data) - e_) ** 2)
    loss3 = self.beta * F.mean((z - chainer.Variable(e.data)) ** 2)
    loss = loss1 + loss2 + loss3
    chainer.reporter.report(
        {'loss1': loss1, 'loss2': loss2, 'loss3': loss3, 'loss': loss}, self)
    return loss1, loss2, loss3
def __call__(self, x):
    """Updates the internal state and returns the LSTM outputs.

    Args:
        x (~chainer.Variable): A new batch from the input sequence.

    Returns:
        ~chainer.Variable: Outputs of updated LSTM units.
    """
    if self.upward.has_uninitialized_params:
        in_size = x.size // x.shape[0]
        self.upward._initialize_params(in_size)
        self._initialize_params()

    batch = x.shape[0]
    lstm_in = self.upward(x)
    h_rest = None
    if self.h is not None:
        h_size = self.h.shape[0]
        if batch == 0:
            h_rest = self.h
        elif h_size < batch:
            msg = ('The batch size of x must be equal to or less than the '
                   'size of the previous state h.')
            raise TypeError(msg)
        elif h_size > batch:
            h_update, h_rest = split_axis.split_axis(self.h, [batch], axis=0)
            lstm_in += self.lateral(h_update)
        else:
            lstm_in += self.lateral(self.h)
    if self.c is None:
        xp = self.xp
        self.c = variable.Variable(
            xp.zeros((batch, self.state_size), dtype=x.dtype),
            volatile='auto')
    # self.c, y = lstm.lstm(self.c, lstm_in)
    c, y = lstm.lstm(self.c, lstm_in)
    enable = (x.data != -1)
    self.c = where(enable, c, self.c)
    if self.h is not None:
        y = where(enable, y, self.h)
    if h_rest is None:
        self.h = y
    elif len(y.data) == 0:
        self.h = h_rest
    else:
        self.h = concat.concat([y, h_rest], axis=0)
    return y
def __call__(self, x, z, mask):
    # split version (a little slow)
    # TODO: shape check
    """
    Input shapes:
        q=(b, units, n_querys), k=(b, units, n_keys), m=(b, n_querys, n_keys)
    """
    query = seq_func(self.W_Q, x)
    key = seq_func(self.W_K, z)
    value = seq_func(self.W_V, z)
    batch, n_units, n_querys = query.shape
    n_keys = key.shape[-1]

    # [(b, n_units // h, n_querys), ...]
    children_query = F.split_axis(query, self.h, axis=1)
    # [(b, n_units // h, n_keys), ...]
    children_key = F.split_axis(key, self.h, axis=1)
    # [(b, n_units // h, n_keys), ...]
    children_value = F.split_axis(value, self.h, axis=1)

    c_list = []
    for q, k, v in zip(children_query, children_key, children_value):
        pre_a = F.batch_matmul(q, k, transa=True)  # (b, n_querys, n_keys)
        pre_a /= (n_units // self.h) ** 0.5
        minfs = self.xp.full(pre_a.shape, -np.inf, pre_a.dtype)
        pre_a = F.where(mask, pre_a, minfs)
        a = F.softmax(pre_a, axis=2)
        # if all values along axis=2 are -inf, softmax yields nan; re-mask to 0
        a = F.where(self.xp.isnan(a.data),
                    self.xp.zeros(a.shape, dtype=a.dtype), a)
        a = F.dropout(a, ratio=self.dropout)  # (b, n_querys, n_keys)
        # (b, n_units // h, n_querys, n_keys)
        v = F.broadcast_to(v[:, :, None],
                           (batch, n_units // self.h, n_querys, n_keys))
        # (b, n_units // h, n_querys, n_keys)
        a = F.broadcast_to(a[:, None],
                           (batch, n_units // self.h, n_querys, n_keys))
        pre_c = a * v
        c = F.sum(pre_c, axis=3)  # (b, units // h, n_querys)
        c_list.append(c)
    c = F.concat(c_list, axis=1)
    return c
def __call__(self, y, m_prev, s_prev, h_forward, h_backword, enable,
             disable_value):
    # m is the memory cell of the LSTM, s is the previous hidden output
    # calculate attention
    c = self._attention(h_forward, h_backword, s_prev, enable, disable_value)
    # decode once
    embeded_y = self.E(y)
    batch_size = y.shape[0]
    lstm_in = self.W(embeded_y) + self.U(s_prev) + self.C(c)
    m_tmp, s_tmp = F.lstm(m_prev, lstm_in)
    feed_prev = F.broadcast_to(F.expand_dims(y.data != IGNORE_LABEL, -1),
                               (batch_size, self.hidden_size))
    m = F.where(feed_prev, m_tmp, m_prev)
    s = F.where(feed_prev, s_tmp, s_prev)
    t = self.U_o(s) + self.V_o(embeded_y) + self.C_o(c)
    return self.W_o(t), m, s
def wsd_with_tc(self, sent, trf_encoded_matrix, labels):
    ### WSD ###
    if self.model_type == "TRF-Multi" or self.model_type == "TRF-Delay-Multi":
        y_wsd = self.wsd_only(trf_encoded_matrix, labels)
    elif self.model_type == "TRF-Sequential":
        y_wsd, task_type = self.wsd_model(sent, None, None, True)  # load the sequential model

    y_wsd_soft = F.softmax(y_wsd)  # softmax over the WSD predictions
    argmax_wsd = F.argmax(y_wsd_soft, axis=1)  # index of the highest-scoring sense

    # condition for ignoring words that carry no sense label
    cond = chainer.Variable(
        self.xp.array([
            True if i != "<PAD>" else False for i in list(chain(*labels))
        ]))
    pad_array = chainer.Variable(
        -1 * self.xp.ones(argmax_wsd.shape, dtype=argmax_wsd.dtype))
    pad_array_argmax_wsd = F.where(cond, argmax_wsd, pad_array)

    # fixed (non-trainable) sense embeddings
    sense_label_embed = F.embed_id(
        x=pad_array_argmax_wsd,
        W=self.xp.array(self.lookup_table_sense_fixed),
        ignore_label=-1)

    sense_label_embed = sense_label_embed.reshape(
        trf_encoded_matrix.shape[0], trf_encoded_matrix.shape[-1], -1)
    origin_shape = sense_label_embed.shape
    sense_label_embed = F.moveaxis(sense_label_embed, 1, 2)

    ## replacement ##
    cond_reshape = cond.reshape(cond.shape[0], -1)
    cond_reshape = F.broadcast_to(
        cond_reshape, (cond_reshape.shape[0], trf_encoded_matrix.shape[1]))
    cond_reshape = cond_reshape.reshape(origin_shape)
    cond_reshape = F.swapaxes(cond_reshape, 1, 2)
    replaced_trf_matrix = F.where(cond_reshape, sense_label_embed,
                                  trf_encoded_matrix)

    ### feed the WSD predictions into TC ###
    tc = replaced_trf_matrix  # document matrix after replacement

    ### TC ###
    tc_features = F.sum(tc, axis=2)  # TC features
    y_tc = self.fc2(tc_features)  # TC predictions

    return (y_tc, y_wsd) if (self.model_type == "TRF-Multi") or (
        self.model_type == "TRF-Delay-Multi") else y_tc
def compute_context_vector(self, batches=True):
    xp = cuda.cupy if self.gpuid >= 0 else np
    batch_size, n_units = self[self.lstm_dec[-1]].h.shape
    # attention weights for the hidden states of each word in the input list
    if batches:
        # mask pad ids for the attention weights
        weights = F.batch_matmul(self.enc_states, self[self.lstm_dec[-1]].h)
        weights = F.where(self.mask, weights, self.minf)
        alphas = F.softmax(weights)
        # compute context vector
        cv = F.reshape(
            F.batch_matmul(F.swapaxes(self.enc_states, 2, 1), alphas),
            shape=(batch_size, n_units))
    else:
        # without batches
        alphas = F.softmax(
            F.matmul(self[self.lstm_dec[-1]].h, self.enc_states, transb=True))
        # compute context vector
        if self.attn == SOFT_ATTN:
            cv = F.batch_matmul(self.enc_states, F.transpose(alphas))
            cv = F.transpose(F.sum(cv, axis=0))
        else:
            print("nothing to see here ...")
    return cv, alphas
def __call__(self, h, adj, deg_conds):
    # h: (minibatch, atom, ch)
    # h encodes each atom's info in the ch axis of size hidden_dim
    # adj: (minibatch, atom, atom)

    # --- Message part ---
    # Take sum along adjacent atoms
    # fv: (minibatch, atom, ch)
    fv = chainer_chemistry.functions.matmul(adj, h)

    # --- Update part ---
    # s0, s1, s2 = fv.shape
    if self.xp is numpy:
        zero_array = numpy.zeros(fv.shape, dtype=numpy.float32)
    else:
        zero_array = self.xp.zeros_like(fv)
    fvds = [functions.where(cond, fv, zero_array) for cond in deg_conds]

    out_h = 0
    for graph_linear, fvd in zip(self.graph_linears, fvds):
        out_h = out_h + graph_linear(fvd)
    # out_h shape: (minibatch, max_num_atoms, hidden_dim)
    out_h = functions.sigmoid(out_h)
    return out_h
def kld(self, vec_true, vec_compare):
    ind = vec_true.data * vec_compare.data > 0
    ind_var = chainer.Variable(ind)
    include_nan = vec_true * F.log(vec_true / vec_compare)
    z = chainer.Variable(np.zeros((len(ind), 1), dtype=np.float32))
    # equivalent to np.nansum(vec_true * np.log(vec_true / vec_compare))
    return F.sum(F.where(ind_var, include_nan, z))
def calc_attention(self, xs, ys, genre_exs, gender_exs, attn_linear):
    # -> (total length of batched sentences, word embedding dim)
    concat_ys = F.concat(ys, axis=0)
    attn_ys = attn_linear(F.tanh(concat_ys))
    # -> (batchsize, proj_cond dim)
    cond_feature = self.proj_cond(F.concat((genre_exs, gender_exs)))
    cumsum_ys = self.xp.cumsum(
        self.xp.array([len(x) for x in xs], dtype=self.xp.int32))
    split_attn_ys = F.split_axis(attn_ys, cumsum_ys[:-1].tolist(), axis=0)
    split_attn_ys_pad = F.pad_sequence(split_attn_ys, padding=-1024)
    bool_cond = split_attn_ys_pad.array == -1024
    split_attn_ys_pad = split_attn_ys_pad * F.expand_dims(
        F.broadcast_to(cond_feature, split_attn_ys_pad.shape[:-1]), axis=-1)
    padding_array = self.xp.full(
        split_attn_ys_pad.shape, -1024, dtype=self.xp.float32)
    split_attn_ys_pad = F.where(bool_cond, padding_array, split_attn_ys_pad)
    attn_softmax = F.softmax(split_attn_ys_pad, axis=1)
    return attn_softmax
def forward(self, e_var, s_var=None, mask=None, batch=1):
    """Core function of the multi-head attention layer.

    Args:
        e_var (chainer.Variable): Variable of input array.
        s_var (chainer.Variable): Variable of source array from the encoder.
        mask (chainer.Variable): Attention mask.
        batch (int): Batch size.

    Returns:
        chainer.Variable: Output of the multi-head attention layer.
    """
    xp = self.xp
    if s_var is None:
        # self-attention: (batch, head, time1/2, d_k)
        Q = self.linear_q(e_var).reshape(batch, -1, self.h, self.d_k)
        K = self.linear_k(e_var).reshape(batch, -1, self.h, self.d_k)
        V = self.linear_v(e_var).reshape(batch, -1, self.h, self.d_k)
    else:
        Q = self.linear_q(e_var).reshape(batch, -1, self.h, self.d_k)
        K = self.linear_k(s_var).reshape(batch, -1, self.h, self.d_k)
        V = self.linear_v(s_var).reshape(batch, -1, self.h, self.d_k)
    scores = F.matmul(F.swapaxes(Q, 1, 2),
                      K.transpose(0, 2, 3, 1)) / np.sqrt(self.d_k)
    if mask is not None:
        mask = xp.stack([mask] * self.h, axis=1)
        scores = F.where(mask, scores,
                         xp.full(scores.shape, MIN_VALUE, "f"))
    self.attn = F.softmax(scores, axis=-1)
    p_attn = F.dropout(self.attn, self.dropout)
    x = F.matmul(p_attn, F.swapaxes(V, 1, 2))
    x = F.swapaxes(x, 1, 2).reshape(-1, self.h * self.d_k)
    return self.linear_out(x)
def __call__(self, hx, cx, xs, enc_hs):
    xs_embed = [self.embed(x) for x in xs]
    hy, cy, ys = self.Nlstm(hx, cx, xs_embed)

    ys_pad = F.pad_sequence(ys, length=None, padding=0.0)
    enc_hs = F.pad_sequence(enc_hs, length=None, padding=0.0)

    mask = self.xp.all(enc_hs.data == 0, axis=2, keepdims=True)
    mask_num = self.xp.full(mask.shape, -1024.0, dtype=self.xp.float32)

    alignment = []
    decode = []
    ys_pad = F.transpose(ys_pad, (1, 0, 2))
    for y in ys_pad:
        y = F.reshape(y, (*y.shape, 1))
        score = F.matmul(enc_hs, y)
        score = F.where(mask, mask_num, score)
        align = F.softmax(score, axis=1)
        context_vector = F.matmul(enc_hs, align, True, False)
        t = self.W_c(F.dropout(
            F.concat((y, context_vector), axis=1), self.dropout))
        ys_proj = self.proj(F.dropout(t, self.dropout))
        alignment.append(F.reshape(align, (len(xs), -1)))
        decode.append(ys_proj)

    decode = F.stack(decode, axis=1)
    alignment = F.stack(alignment, axis=1)
    return hy, cy, decode, alignment.data
def __call__(self, h, adj, deg_conds):
    # h: (minibatch, atom, ch)
    # h encodes each atom's info in the ch axis of size hidden_dim
    # adj: (minibatch, atom, atom)

    # --- Message part ---
    # Take sum along adjacent atoms
    # fv: (minibatch, atom, ch)
    fv = chainer_chemistry.functions.matmul(adj, h)

    # --- Update part ---
    if self.xp is numpy:
        zero_array = numpy.zeros(fv.shape, dtype=numpy.float32)
    else:
        zero_array = self.xp.zeros_like(fv)
    fvds = [functions.where(cond, fv, zero_array) for cond in deg_conds]

    out_h = 0
    for graph_linear, fvd in zip(self.graph_linears, fvds):
        out_h = out_h + graph_linear(fvd)
    # out_h shape: (minibatch, max_num_atoms, hidden_dim)
    out_h = functions.sigmoid(out_h)
    return out_h
def __call__(self, x, condition=None):
    lstm_in = self.upward(x)
    if self.h is not None:
        lstm_in += self.lateral(self.h)
    if self.c is None:
        xp = self.xp
        self.c = Variable(xp.zeros((len(x.data), self.state_size),
                                   dtype=x.data.dtype),
                          volatile="auto")
    if condition is None:
        self.c, self.h = F.lstm(self.c, lstm_in)
    else:
        c, h = F.lstm(self.c, lstm_in)
        if self.h is None:
            self.h = h
            self.c = c
        else:
            self.h = F.where(condition, h, self.h)
            self.c = F.where(condition, c, self.c)
    return self.h
def f(x, rois, roi_indices):
    y = functions.roi_max_align_2d(
        x, rois, roi_indices,
        outsize=self.outsize,
        spatial_scale=self.spatial_scale,
        sampling_ratio=self.sampling_ratio)
    xp = cuda.get_array_module(y)
    y = functions.where(
        xp.isinf(y.array), xp.zeros(y.shape, dtype=y.dtype), y)
    return y
def __accuracy(self, y, t):
    xp = self.xp
    b, c, n = y.data.shape
    v = np.arange(c, dtype=np.float32).reshape((1, -1, 1)).repeat(
        b, axis=0).repeat(n, axis=2)
    v = Variable(xp.asarray(v), volatile=True)
    r = F.sum(v * F.softmax(Variable(y.data, volatile=True)), axis=1)
    c = Variable(t.data >= 0, volatile=True)
    t = Variable(t.data.astype(np.float32), volatile=True)
    r = F.where(c, r, t)
    return F.sum(((r - t) * self.rating_unit) ** 2)
def check_forward(self, c_data, x_data, y_data):
    c = chainer.Variable(c_data)
    x = chainer.Variable(x_data)
    y = chainer.Variable(y_data)
    z = functions.where(c, x, y)

    xp = c.xp
    z_data_expected = xp.where(c_data, x_data, y_data)
    testing.assert_allclose(z.array, z_data_expected)
def check_forward(self, c_data, x_data, y_data):
    c = chainer.Variable(c_data)
    x = chainer.Variable(x_data)
    y = chainer.Variable(y_data)
    z = functions.where(c, x, y)
    self.assertEqual(x.data.shape, z.data.shape)
    for i in numpy.ndindex(c.data.shape):
        if c.data[i]:
            self.assertEqual(x.data[i], z.data[i])
        else:
            self.assertEqual(y.data[i], z.data[i])
def check_forward(self, c_data, x_data, y_data):
    c = chainer.Variable(c_data)
    x = chainer.Variable(x_data)
    y = chainer.Variable(y_data)
    z = F.where(c, x, y)
    self.assertEqual(x.data.shape, z.data.shape)
    for ci, xi, yi, zi in zip(c.data.flatten(), x.data.flatten(),
                              y.data.flatten(), z.data.flatten()):
        if ci:
            self.assertEqual(xi, zi)
        else:
            self.assertEqual(yi, zi)
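# These tests all verify the same contract of chainer.functions.where: take elements
# from x where the condition is True, and from y where it is False. A minimal
# standalone usage example (hypothetical values, plain NumPy inputs):
import numpy as np
import chainer.functions as F

cond = np.array([[True, False], [False, True]])
x = np.array([[1., 2.], [3., 4.]], dtype=np.float32)
y = np.zeros((2, 2), dtype=np.float32)

z = F.where(cond, x, y)   # elementwise select
print(z.array)            # [[1. 0.] [0. 4.]]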
def __call__(self, x, condition=None):
    if self.h is None:
        z_t = sgu.hard_sigmoid(self.W_xz(x))
        h_t = z_t * 0.5
    else:
        h_t = sgu.DSGU.__call__(self, self.h, x)

    if condition is None:
        self.h = h_t
    else:
        if self.h is None:
            self.h = h_t
        else:
            self.h = F.where(condition, h_t, self.h)
    return h_t
def _attention(self, h_forward, h_backword, s, enable, disable_value):
    batch_size = s.shape[0]
    sentence_size = len(h_forward)
    hidden_size = self.hidden_size
    xp = self.xp
    weighted_s = F.broadcast_to(
        F.expand_dims(self.W_a(s), axis=1),
        (batch_size, sentence_size, hidden_size))
    h = F.concat((F.concat(h_forward, axis=0), F.concat(h_backword, axis=0)))
    weighted_h = F.reshape(self.U_a(h),
                           (batch_size, sentence_size, hidden_size))
    e = self.v_a(F.reshape(F.tanh(weighted_s + weighted_h),
                           (batch_size * sentence_size, hidden_size)))
    e = F.where(enable, F.reshape(e, (batch_size, sentence_size)),
                disable_value)
    alpha = F.softmax(e)
    c = F.batch_matmul(
        F.reshape(h, (batch_size, 2 * hidden_size, sentence_size)), alpha)
    return F.reshape(c, (batch_size, 2 * hidden_size))
def check_backward(self, c_data, x_data, y_data, g_data):
    c = chainer.Variable(c_data)
    x = chainer.Variable(x_data)
    y = chainer.Variable(y_data)
    z = F.where(c, x, y)
    z.grad = g_data
    z.backward()

    func = z.creator
    f = lambda: func.forward((c.data, x.data, y.data))
    gx, gy = gradient_check.numerical_grad(f, (x.data, y.data), (g_data,))

    gradient_check.assert_allclose(gx, x.grad)
    gradient_check.assert_allclose(gy, y.grad)
    self.assertIs(c.grad, None)
def __call__(self, x, condition=None):
    z = self.W_z(x)
    h_bar = self.W(x)
    if self.h is not None:
        r = F.sigmoid(self.W_r(x) + self.U_r(self.h))
        z += self.U_z(self.h)
        h_bar += self.U(r * self.h)
    z = F.sigmoid(z)
    h_bar = F.tanh(h_bar)

    h_new = z * h_bar
    if self.h is not None:
        h_new += (1 - z) * self.h

    if condition is None:
        self.h = h_new
    else:
        if self.h is None:
            self.h = h_new
        else:
            self.h = F.where(condition, h_new, self.h)
    return self.h
def __call__(self, h, adj):
    xp = self.xp
    # (minibatch, atom, channel)
    mb, atom, ch = h.shape
    # (minibatch, atom, EDGE_TYPE * heads * out_dim)
    h = self.message_layer(h)
    # (minibatch, atom, EDGE_TYPE, heads, out_dim)
    h = functions.reshape(h, (mb, atom, self.n_edge_types, self.n_heads,
                              self.out_channels))
    # concat all pairs of atoms
    # (minibatch, 1, atom, EDGE_TYPE, heads, out_dim)
    h_i = functions.reshape(h, (mb, 1, atom, self.n_edge_types, self.n_heads,
                                self.out_channels))
    # (minibatch, atom, atom, EDGE_TYPE, heads, out_dim)
    h_i = functions.broadcast_to(h_i, (mb, atom, atom, self.n_edge_types,
                                       self.n_heads, self.out_channels))
    # (minibatch, atom, 1, EDGE_TYPE, heads, out_dim)
    h_j = functions.reshape(h, (mb, atom, 1, self.n_edge_types, self.n_heads,
                                self.out_channels))
    # (minibatch, atom, atom, EDGE_TYPE, heads, out_dim)
    h_j = functions.broadcast_to(h_j, (mb, atom, atom, self.n_edge_types,
                                       self.n_heads, self.out_channels))
    # (minibatch, atom, atom, EDGE_TYPE, heads, out_dim * 2)
    e = functions.concat([h_i, h_j], axis=5)
    # (minibatch, EDGE_TYPE, heads, atom, atom, out_dim * 2)
    e = functions.transpose(e, (0, 3, 4, 1, 2, 5))
    # (minibatch * EDGE_TYPE * heads, atom * atom, out_dim * 2)
    e = functions.reshape(e, (mb * self.n_edge_types * self.n_heads,
                              atom * atom, self.out_channels * 2))
    # (minibatch * EDGE_TYPE * heads, atom * atom, 1)
    e = self.attention_layer(e)
    # (minibatch, EDGE_TYPE, heads, atom, atom)
    e = functions.reshape(e, (mb, self.n_edge_types, self.n_heads, atom, atom))
    e = functions.leaky_relu(e, self.negative_slope)

    # (minibatch, EDGE_TYPE, atom, atom)
    if isinstance(adj, chainer.Variable):
        cond = adj.array.astype(bool)
    else:
        cond = adj.astype(bool)
    # (minibatch, EDGE_TYPE, 1, atom, atom)
    cond = xp.reshape(cond, (mb, self.n_edge_types, 1, atom, atom))
    # (minibatch, EDGE_TYPE, heads, atom, atom)
    cond = xp.broadcast_to(cond, e.array.shape)
    # TODO(mottodora): find better way to ignore non connected
    e = functions.where(cond, e,
                        xp.broadcast_to(xp.array(-10000),
                                        e.array.shape).astype(xp.float32))

    # In Relational Graph Attention Networks eq. (7)
    # ARGAT: take the softmax over the logits across node neighborhoods
    # irrespective of relation
    if self.softmax_mode == 'across':
        # (minibatch, heads, atom, EDGE_TYPE, atom)
        e = functions.transpose(e, (0, 2, 3, 1, 4))
        # (minibatch, heads, atom, EDGE_TYPE * atom)
        e = functions.reshape(e, (mb, self.n_heads, atom,
                                  self.n_edge_types * atom))
        # (minibatch, heads, atom, EDGE_TYPE * atom)
        alpha = functions.softmax(e, axis=3)
        if self.dropout_ratio >= 0:
            alpha = functions.dropout(alpha, ratio=self.dropout_ratio)
        # (minibatch, heads, atom, EDGE_TYPE, atom)
        alpha = functions.reshape(alpha, (mb, self.n_heads, atom,
                                          self.n_edge_types, atom))
        # (minibatch, EDGE_TYPE, heads, atom, atom)
        alpha = functions.transpose(alpha, (0, 3, 1, 2, 4))
    # In Relational Graph Attention Networks eq. (6)
    # WIRGAT: take the softmax over the logits independently for each
    # relation
    elif self.softmax_mode == 'within':
        alpha = functions.softmax(e, axis=4)
        if self.dropout_ratio >= 0:
            alpha = functions.dropout(alpha, ratio=self.dropout_ratio)
    else:
        raise ValueError("{} is invalid. Please use 'across' or 'within'"
                         .format(self.softmax_mode))

    # before: (minibatch, atom, EDGE_TYPE, heads, out_dim)
    # after: (minibatch, EDGE_TYPE, heads, atom, out_dim)
    h = functions.transpose(h, (0, 2, 3, 1, 4))
    # (minibatch, EDGE_TYPE, heads, atom, out_dim)
    h_new = functions.matmul(alpha, h)
    # (minibatch, heads, atom, out_dim)
    h_new = functions.sum(h_new, axis=1)
    if self.concat_heads:
        # (heads, minibatch, atom, out_dim)
        h_new = functions.transpose(h_new, (1, 0, 2, 3))
        # (minibatch, atom, heads * out_dim)
        h_new = functions.concat(h_new, axis=2)
    else:
        # (minibatch, atom, out_dim)
        h_new = functions.mean(h_new, axis=1)
    return h_new
def forward(self, inputs, devices):
    c, x, y = inputs
    z = functions.where(c, x, y)
    return z,
def forward(src_sentence, trg_sentence, model, training=True):
    end = out_dim

    # Convert words to IDs (implement this as appropriate for your data).
    # The reference translation should have an end-of-sentence token appended.
    # src_sentence = [convert_to_your_src_id(word) for word in src_sentence]
    # trg_sentence = [convert_to_your_trg_id(word) for word in trg_sentence]

    # initial LSTM internal states
    c_prev = Variable(np.zeros((10, HIDDEN_SIZE), dtype=np.float32))
    p_prev = Variable(np.zeros((10, HIDDEN_SIZE), dtype=np.float32))
    i = Variable(np.zeros((10, SRC_EMBED_SIZE), dtype=np.float32))

    # encoder
    for word in reversed(src_sentence):
        word = np.array(word, dtype=np.int32)
        word = word.reshape(10, 1)
        x = Variable(np.array(word, dtype=np.int32))
        i = model.w_xi(word)
        c, p = lstm(c_prev, model.w_ip(i) + model.w_pp(p_prev))
        enable = np.asarray([[(x_i != -1) for i in range(HIDDEN_SIZE)]
                             for x_i in x.data.reshape(10,)])
        enable = Variable(enable)
        _c = []
        _p = []
        for i in range(BATCH_SIZE):
            _ = where(enable[i], c[i], c_prev[i])
            _c.append(_.data)
        for i in range(BATCH_SIZE):
            _ = where(enable[i], p[i].data, p_prev[i].data)
            _p.append(_.data)
        c_prev = Variable(np.asarray(_c, dtype=np.float32))
        p_prev = Variable(np.asarray(_p, dtype=np.float32))

    # encoder -> decoder
    c, q = lstm(c, model.w_pq(p))

    # decoder
    if training:
        # At training time, use the reference translation as y and return the
        # accumulated loss as the result of forward.
        accum_loss = 0
        for word in trg_sentence:
            j = tanh(model.w_qj(q))
            y = model.w_jy(j)
            # y = functions.reshape(y, (1, 1, TRG_VOCAB_SIZE))
            # _t = np.zeros(TRG_VOCAB_SIZE, dtype=np.int32)
            # _t[word] = 1
            t = np.asarray(word, dtype=np.int32)
            # t = t.reshape(1, BATCH_SIZE)
            t = Variable(t)
            accum_loss += softmax_cross_entropy(y, t)
            c, q = lstm(c, model.w_yq(t) + model.w_qq(q))
        return accum_loss
    else:
        # At prediction time, feed the y generated by the translator back in as
        # the next input and return the generated word sequence.
        # Pick the word with the highest score in y; taking the softmax is not
        # necessary.
        hyp_sentence = []
        while len(hyp_sentence) < 100:  # do not generate more than 100 words
            j = tanh(model.w_qj(q))
            y = model.w_jy(j)
            word = y.data.argmax(1)[0]
            if word == END_OF_SENTENCE:
                break  # stop when the end-of-sentence token is generated
            hyp_sentence.append(convert_to_your_trg_str(word))
            c, q = lstm(c, model.w_yq(y) + model.w_qq(q))
        return hyp_sentence
def forward(model, batch, num_samples, word_keep_rate, UNK, train=True):
    batch_size = batch.shape[0]
    xp = model.xp
    use_gpu = (xp == cuda.cupy)
    if use_gpu:
        batch = cuda.to_gpu(batch)
    model.reset_state()
    model.zerograds()

    # encode
    batch_length = len(batch[0]) - 1
    for i in range(batch_length):
        w = Variable(batch[:, i])
        model.encode(w, train=train)

    # infer q(z|x)
    model.infer(train=train)

    # compute KL
    KL = 0
    for i in range(model.num_layers):
        # h
        mu, sigma = model.hmus[i], model.hsigmas[i]
        KL += -F.sum((1 + 2 * F.log(sigma) - sigma * sigma - mu * mu) / 2)
        # c
        mu, sigma = model.cmus[i], model.csigmas[i]
        KL += -F.sum((1 + 2 * F.log(sigma) - sigma * sigma - mu * mu) / 2)
    KL /= batch_size

    # draw and decode
    cross_entropies = []
    if not train:
        ys, ts = [], []
    UNKs = np.array([UNK for _ in range(batch_size)], dtype=np.int32)
    if use_gpu:
        UNKs = cuda.to_gpu(UNKs)
    for _ in range(num_samples):
        cross_entropies.append(0)
        if not train:
            ys.append([])
            ts.append([])
        if train:
            model.set_by_sample(train=train)
        else:
            model.set_by_MLE(train=train)
        last_w = None
        for i in range(batch_length):
            w, next_w = Variable(batch[:, i]), Variable(batch[:, i + 1])
            # word dropout
            masked_w = batch[:, i]
            if np.random.uniform() > word_keep_rate:
                enable = (masked_w != -1)
                masked_w = F.where(enable, masked_w, UNKs)
            y = model.decode(masked_w, train=train)
            cross_entropies[-1] += F.softmax_cross_entropy(y, next_w)
            if not train:
                ys[-1].append(xp.argmax(y.data, axis=1))
                ts[-1].append(next_w.data)
            last_w = next_w
        if not train:
            ys[-1] = xp.vstack(ys[-1]).T
            ts[-1] = xp.vstack(ts[-1]).T
            if use_gpu:
                ys[-1] = cuda.to_cpu(ys[-1])
                ts[-1] = cuda.to_cpu(ts[-1])

    if train:
        return (KL, cross_entropies)
    else:
        assert len(cross_entropies) == 1 and len(ys) == 1 and len(ts) == 1
        return (KL, (cross_entropies, ys, ts))
def softmax(x, mask, zero_pad, axis):
    x_explogsoftmax = F.exp(logsoftmax_no_mask(x, mask, zero_pad, axis))
    return F.where(mask, x_explogsoftmax, zero_pad)
def logsoftmax(x, mask, zero_pad, axis):
    return F.where(mask, logsoftmax_no_mask(x, mask, zero_pad, axis), zero_pad)
def logsumexp(x, mask, zero_pad, axis):
    x_exp = F.where(mask, F.exp(x), zero_pad)
    return F.log(F.sum(x_exp, axis=axis))
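# Minimal usage sketch for the masked logsumexp helper above (hypothetical inputs;
# assumes the logsumexp definition above is in scope). zero_pad supplies the value
# used at masked positions, so masked entries contribute nothing to the sum:
import numpy as np

x = np.log(np.array([[1., 2., 3.], [4., 5., 6.]], dtype=np.float32))
mask = np.array([[True, True, False], [True, True, True]])
zero_pad = np.zeros_like(x)

# row 0 -> log(1 + 2), row 1 -> log(4 + 5 + 6)
print(logsumexp(x, mask, zero_pad, axis=1).array)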