예제 #1
0
    def _feature_repl(hs_flatten, pairs, ckeys, lengths):
        """Compute two "replacement" features from a table of hidden states.

        Rows of ``hs_flatten`` are gathered with ``F.embed_id`` at the span
        boundaries, at their outer neighbours (``begin - 1`` / ``end + 1``)
        and at the neighbours of ``ckeys``.

        Args:
            hs_flatten: Hidden-state table indexed along axis 0 (presumably
                all sequences concatenated -- TODO confirm with the caller).
            pairs: Array of ``(begin, end)`` index rows; it is passed to
                NumPy functions here.
            ckeys: Integer indices into ``hs_flatten``.
            lengths: NumPy array whose cumulative sums define the positions
                treated as boundaries.

        Returns:
            Tuple ``(repl1, repl2)`` with
            ``repl1 = |h[begin - 1] * (h[begin] - h[ckey + 1])|`` and
            ``repl2 = |h[end + 1] * (h[end] - h[ckey - 1])|``, where the
            outer-neighbour factor is zeroed when its index hits a boundary.
        """
        xp = chainer.cuda.get_array_module(hs_flatten)
        begins, ends = pairs.T
        begin_ids = xp.asarray(begins)
        end_ids = xp.asarray(ends)
        key_ids = xp.asarray(ckeys)

        # Left boundary: the row at `begin` and the row just before it.
        h_begin = F.embed_id(begin_ids, hs_flatten)
        h_before = F.embed_id(begin_ids - 1, hs_flatten, ignore_label=-1)
        left_edges = np.insert(lengths[:-1].cumsum(), 0, 0) - 1
        hits_left_edge = np.isin(begins - 1, left_edges)
        h_before = F.where(
            xp.asarray(hits_left_edge)[:, None],
            xp.zeros_like(h_before.data),
            h_before)

        # Right boundary: the row at `end` and the row just after it; the
        # table size is used as the ignore label for the one-past-end index.
        h_end = F.embed_id(end_ids, hs_flatten)
        h_after = F.embed_id(end_ids + 1, hs_flatten, hs_flatten.shape[0])
        right_edges = lengths.cumsum()
        hits_right_edge = np.isin(ends + 1, right_edges)
        h_after = F.where(
            xp.asarray(hits_right_edge)[:, None],
            xp.zeros_like(h_after.data),
            h_after)

        # Neighbours of the key positions.
        h_key_before = F.embed_id(key_ids - 1, hs_flatten)
        h_key_after = F.embed_id(key_ids + 1, hs_flatten)

        repl1 = F.absolute(h_before * (h_begin - h_key_after))
        repl2 = F.absolute(h_after * (h_end - h_key_before))
        return repl1, repl2
예제 #2
0
    def __call__(self, y, y_label, c_pre, h_pre, train=True):
        """Run one LSTM step and emit two gated output blocks.

        Args:
            y: Word ids; entries equal to -1 leave the state unchanged.
            y_label: Label ids embedded alongside ``y``.
            c_pre: Previous LSTM cell state.
            h_pre: Previous LSTM hidden state.
            train: Forwarded to ``F.dropout``.

        Returns:
            Tuple ``(F.concat((pg, pe)), at, c_next, h_next)`` where ``pg``
            is scaled by ``1 - at`` and ``pe`` by ``at``.
        """
        # Input word / label embeddings.
        word_emb = F.tanh(self.ye(y))
        label_emb = F.tanh(self.le(y_label))

        # LSTM step: dropout is applied to the input projection only.
        joint = F.concat((word_emb, label_emb))
        lstm_in = F.dropout(self.eh(joint), ratio=0.2, train=train) \
            + self.hh(h_pre)
        c_tmp, h_tmp = F.lstm(c_pre, lstm_in)

        # Column mask: True where the input id is a real token (not -1).
        is_real = (y.data != -1).reshape(len(y), 1)
        enable = chainer.Variable(is_real)
        c_next = F.where(enable, c_tmp, c_pre)
        h_next = F.where(enable, h_tmp, h_pre)

        # Gate `at` splits the output between the two projections.
        at = F.sigmoid(self.vt(h_next))

        pg_pre = self.wg(h_next)
        pg_shape = (pg_pre.data.shape[0], pg_pre.data.shape[1])
        pg = pg_pre * F.broadcast_to((1 - at), shape=pg_shape)

        pe_pre = self.we(h_next)
        pe_shape = (pe_pre.data.shape[0], pe_pre.data.shape[1])
        pe = pe_pre * F.broadcast_to(at, shape=pe_shape)

        return F.concat((pg, pe)), at, c_next, h_next
예제 #3
0
    def __call__(self, w, train=True, dpratio=0.5):
        """Feed one batch of ids through the stacked LSTM layers.

        Args:
            w: Input ids passed to ``self.embed``.
            train: Forwarded to every ``F.dropout`` call.
            dpratio: Dropout ratio used for states and inputs.

        Returns:
            ``self.hy`` applied to the (dropped-out) top-layer hidden state.
        """
        x = self.embed(w)
        self.maybe_init_state(len(x.data), x.data.dtype)

        for i in range(self.num_layers):
            # One identity test per layer drives both the mask creation and
            # the masked state update (the original mixed `is not None`
            # with `!= None`).
            use_mask = self.ignore_label is not None
            if use_mask:
                # Mask is taken from the layer input *before* dropout.
                enable = (x.data != 0)

            c = F.dropout(self.get_c(i), train=train, ratio=dpratio)
            h = F.dropout(self.get_h(i), train=train, ratio=dpratio)
            x = F.dropout(x, train=train, ratio=dpratio)
            c, h = self.get_l(i)(c, h, x)

            if use_mask:
                # Keep the previous state wherever the mask is False.
                self.set_c(i, F.where(enable, c, self.get_c(i)))
                self.set_h(i, F.where(enable, h, self.get_h(i)))
            else:
                self.set_c(i, c)
                self.set_h(i, h)

            # The stored hidden state is the next layer's input.
            x = self.get_h(i)

        x = F.dropout(x, train=train, ratio=dpratio)
        return self.hy(x)
    def __call__(self, fp, y):
        """Compute the loss (reconstruction + KL sparsity term) and accuracy.

        Args:
            fp: Input activations; their mean over axis 0 feeds the KL term.
            y: Targets for ``F.sigmoid_cross_entropy`` / ``F.binary_accuracy``.

        Returns:
            Tuple ``(self.loss, self.accuracy)``. ``self.kl_div`` and
            ``self.rec_loss`` are stored as side effects.
        """
        rho = 0.01
        mean_activation = F.mean(fp, axis=0)
        zero_array = chainer.Variable(
            numpy.zeros(mean_activation.shape, dtype=numpy.float32))
        small_array = zero_array + 0.001

        # Replace exact zeros by 0.001 so the logarithm below is evaluated on
        # a non-zero value; those entries are masked out of the sum anyway.
        cond = chainer.Variable(mean_activation.data != 0)
        mean_activation = F.where(cond, mean_activation, small_array)

        kl_terms = (
            self.p * F.log(self.p / mean_activation) +
            (1 - self.p) * F.log((1 - self.p) / (1 - mean_activation)))
        self.kl_div = rho * F.sum(F.where(cond, kl_terms, zero_array))

        # Uniform noise for sampling z (drawn regardless of the mode).
        noise = numpy.random.uniform(0.0, 1.0, fp.data.shape)
        eps = chainer.Variable(noise.astype(numpy.float32))

        # The explicit comparison with True is kept as in the original.
        z = self.logistic_func(fp - eps) if self.train == True else fp

        hidden = F.relu(self.l1(z))
        hidden = F.relu(self.l2(hidden))
        logits = self.l3(hidden)

        self.rec_loss = F.sigmoid_cross_entropy(logits, y)
        self.accuracy = F.binary_accuracy(logits, y)
        self.loss = self.rec_loss + self.kl_div
        return self.loss, self.accuracy
    def __call__(self, x, mask):
        """Apply a mask-normalised convolution and return the updated mask.

        Args:
            x: Input feature map.
            mask: Mask multiplied element-wise with ``x`` and also fed to
                the mask-summing link ``self.m``.

        Returns:
            Tuple ``(h, mask_new)``: the processed feature map and the new
            mask wrapped in a ``Variable``.
        """
        # Reload the mask-summing weights from self.maskW on every call
        # (original note: mask windows are set to 1).
        self.m.W.data = self.xp.array(self.maskW)

        conv_out = self.c(x * mask)  # (B, C, H, W)
        B, C, H, W = conv_out.shape

        # Remove the bias so that only the weighted sum gets re-normalised.
        bias = F.transpose(
            F.broadcast_to(self.c.b, (B, H, W, C)), (0, 3, 1, 2))
        conv_out = conv_out - bias

        # New mask: 1 where the window sum exceeds 0.5, 0 where it is below.
        mask_sums = self.m(mask)
        mask_new = (self.xp.sign(mask_sums.data - 0.5) + 1.0) * 0.5
        is_valid = mask_new.astype("bool")

        # Use 0.01 as the divisor at invalid positions.
        fallback = 0.01 * Variable(self.xp.ones(mask_sums.shape).astype("f"))
        mask_sums = F.where(is_valid, mask_sums, fallback)
        h = conv_out / mask_sums + bias

        mask_new = Variable(mask_new)
        zeros = Variable(self.xp.zeros(h.shape).astype("f"))
        h = F.where(is_valid, h, zeros)

        if self.bn:
            h = self.batchnorm(h)
        if self.noise:
            h = add_noise(h)
        if self.dropout:
            h = F.dropout(h)
        if self.activation is not None:
            h = self.activation(h)
        return h, mask_new
예제 #6
0
    def __call__(self, w, train=True, dpratio=0.2):
        """Feed one batch of ids through the stacked LSTM layers.

        Args:
            w: Input ids passed to ``self.embed``.
            train: Forwarded to every ``F.dropout`` call.
            dpratio: Dropout ratio used for states and inputs.

        Returns:
            ``self.hy`` applied to the top-layer hidden state.
        """
        x = self.embed(w)

        self.maybe_init_state(len(x.data), x.data.dtype)

        for i in range(self.num_layers):

            c = F.dropout(self.cs[i], train=train, ratio=dpratio)
            # The sum must be parenthesised: without it the second line is a
            # separate (discarded) expression statement and the recurrent
            # hidden-to-hidden term never reaches the LSTM.
            h = (self.xhs[i](F.dropout(x, train=train, ratio=dpratio))
                 + self.hhs[i](
                     F.dropout(self.hs[i], train=train, ratio=dpratio)))

            # Sanity checks on the pre-activation shapes.
            assert c.data.shape == (len(x.data), self.hidden_size)
            assert h.data.shape == (len(x.data), 4 * self.hidden_size)

            c, h = F.lstm(c, h)

            assert c.data.shape == (len(x.data), self.hidden_size)
            assert h.data.shape == (len(x.data), self.hidden_size)

            if self.ignore_label is not None:
                # Keep the previous state wherever the layer input is zero.
                enable = (x.data != 0)
                self.cs[i] = F.where(enable, c, self.cs[i])
                self.hs[i] = F.where(enable, h, self.hs[i])
            else:
                self.cs[i] = c
                self.hs[i] = h

            # The stored hidden state is the next layer's input.
            x = self.hs[i]
        return self.hy(x)
예제 #7
0
 def __call__(self, x, c_pre, h_pre, train=True):
     """Advance the LSTM one step, skipping rows whose input id is -1.

     Args:
         x: Input ids.
         c_pre: Previous cell state.
         h_pre: Previous hidden state.
         train: Accepted for interface compatibility; not used here.

     Returns:
         Tuple ``(c_next, h_next)``.
     """
     embedded = F.tanh(self.xe(x))
     lstm_in = self.eh(embedded) + self.hh(h_pre)
     c_tmp, h_tmp = F.lstm(c_pre, lstm_in)

     # Column mask: True for real tokens, False for the -1 padding id.
     is_real = (x.data != -1).reshape(len(x), 1)
     enable = chainer.Variable(is_real)

     c_next = F.where(enable, c_tmp, c_pre)
     h_next = F.where(enable, h_tmp, h_pre)
     return c_next, h_next
예제 #8
0
    def __call__(self, x, t):
        """Compute ArcFace-style margin logits on top of the base network.

        Args:
            x: Input batch for ``self.base``.
            t: Integer class labels used to index the one-hot matrix.

        Returns:
            Scaled logits: the margin-adjusted value ``phi`` at the target
            class and the plain cosine elsewhere. The ``res5`` feature map is
            stored in ``self.cam`` as a side effect.
        """
        h = self.base(x, layers=['res5'])['res5']
        self.cam = h
        h = _global_average_pooling_2d(h)

        # ArcFace head implemented on top of the ResNet50 features.
        # --- cos(theta) and phi(theta) = cos(theta + m) ---
        cosine = F.linear(F.normalize(h), F.normalize(self.weight))  # fc8
        sine = F.sqrt(F.clip((1.0 - F.square(cosine)), 0, 1))
        phi = cosine * cos_m - sine * sin_m
        if easy_margin:
            phi = F.where(cosine.data > 0, phi, cosine)
        else:
            phi = F.where(cosine.data > th, phi, cosine - mm)

        # --- convert labels to one-hot ---
        # The width follows the class axis of the logits instead of a
        # hard-coded 10, so the head works for any number of classes.
        n_classes = cosine.shape[1]
        one_hot = cp.eye(n_classes)[t].astype(cp.float32)
        one_hot = Variable(one_hot)

        # Select phi for the target class and cosine for all other classes.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= s

        return output
예제 #9
0
    def _length_aware_softmax(e, l0, l1, xp):
        """Softmax over both sequence axes of ``e``, masked by lengths.

        Args:
            e: Score tensor of shape ``(B, T0, T1)``.
            l0: Per-sample valid lengths along axis 1 (``B`` values).
            l1: Per-sample valid lengths along axis 2 (``B`` values).
            xp: Array module (``numpy`` or ``cupy``) used to build the masks.

        Returns:
            Tuple ``(s1, s2)``: values normalised along axis 1 and along
            axis 2 respectively, with positions outside the valid lengths
            set to zero.
        """
        # e: (B, T0, T1)
        bs, t0, t1 = e.shape
        l0 = l0.reshape((bs, 1, 1))
        l1 = l1.reshape((bs, 1, 1))
        mask0 = (xp.tile(xp.arange(t0).reshape(1, t0, 1),
                         (bs, 1, 1)) < l0).astype(e.dtype)
        mask1 = (xp.tile(xp.arange(t1).reshape(1, t1, 1),
                         (bs, 1, 1)) < l1).astype(e.dtype)
        # Outer product of the two length masks. The builtin `bool` is used
        # because the `np.bool` alias no longer exists in current NumPy.
        mask = (xp.matmul(mask0, mask1.swapaxes(1, 2))).astype(bool)
        # mask: (B, T0, T1)
        mask = chainer.Variable(mask)
        padding = chainer.Variable(xp.zeros(e.shape, dtype=e.dtype))

        # Shift by the global maximum before exponentiating.
        e_max = F.max(e, keepdims=True)
        e_masked = F.where(mask, e, padding)
        e_masked = e_masked - F.broadcast_to(e_max, e.shape)
        # NOTE(review): padded positions hold 0 - e_max here, so they still
        # contribute to the logsumexp normalisers below -- confirm intended.

        e_sum0 = F.reshape(F.logsumexp(e_masked, axis=1), (bs, 1, t1))
        e_sum1 = F.reshape(F.logsumexp(e_masked, axis=2), (bs, t0, 1))

        s1 = F.exp(e_masked - F.broadcast_to(e_sum0, e.shape))
        s2 = F.exp(e_masked - F.broadcast_to(e_sum1, e.shape))
        # Zero out everything outside the valid region.
        s1 = F.where(mask, s1, padding)
        s2 = F.where(mask, s2, padding)
        return s1, s2
    def __call__(self, x, mask):
        """Apply a mask-normalised convolution and return the updated mask.

        Args:
            x: Input feature map.
            mask: Mask multiplied element-wise with ``x`` and also fed to
                the mask-summing link ``self.m``.

        Returns:
            Tuple ``(h, mask_new)``: the processed feature map and the new
            mask wrapped in a ``Variable``.
        """
        # Reload the mask-summing weights from self.maskW on every call
        # (original note: mask windows are set to 1).
        self.m.W.data = self.xp.array(self.maskW)

        conv_out = self.c(x * mask)  # (B, C, H, W)
        B, C, H, W = conv_out.shape

        # Remove the bias so that only the weighted sum gets re-normalised.
        bias = F.transpose(
            F.broadcast_to(self.c.b, (B, H, W, C)), (0, 3, 1, 2))
        conv_out = conv_out - bias

        # New mask: 1 where the window sum exceeds 0.5, 0 where it is below.
        mask_sums = self.m(mask)
        mask_new = (self.xp.sign(mask_sums.data - 0.5) + 1.0) * 0.5
        is_valid = mask_new.astype("bool")

        # Use 0.01 as the divisor at invalid positions.
        fallback = 0.01 * Variable(self.xp.ones(mask_sums.shape).astype("f"))
        mask_sums = F.where(is_valid, mask_sums, fallback)
        h = conv_out / mask_sums + bias

        mask_new = Variable(mask_new)
        zeros = Variable(self.xp.zeros(h.shape).astype("f"))
        h = F.where(is_valid, h, zeros)

        if self.bn:
            h = self.batchnorm(h)
        if self.noise:
            h = add_noise(h)
        if self.dropout:
            h = F.dropout(h)
        if self.activation is not None:
            h = self.activation(h)
        return h, mask_new
예제 #11
0
 def __call__(self, x, c_pre, h_pre, train=True):
     """Advance the LSTM one step, skipping rows whose input id is -1.

     Args:
         x: Input ids.
         c_pre: Previous cell state.
         h_pre: Previous hidden state.
         train: Accepted for interface compatibility; not used here.

     Returns:
         Tuple ``(c_next, h_next)``.
     """
     embedded = F.tanh(self.xe(x))
     gates_in = self.eh(embedded) + self.hh(h_pre)
     c_tmp, h_tmp = F.lstm(c_pre, gates_in)

     # Flag, as a column, whether each input id differs from -1.
     not_padding = (x.data != -1).reshape(len(x), 1)
     enable = chainer.Variable(not_padding)

     # x != -1 -> take the freshly computed state; x == -1 -> keep the old.
     return (F.where(enable, c_tmp, c_pre),
             F.where(enable, h_tmp, h_pre))
예제 #12
0
 def __call__(self, embeded_x, m_prev, h_prev, feed_previous):
     """Run one LSTM step, keeping the old state where ``feed_previous``.

     Args:
         embeded_x: Embedded input for this step.
         m_prev: Previous memory cell.
         h_prev: Previous hidden state.
         feed_previous: Boolean condition for ``F.where``; True selects the
             previous state, False the newly computed one.

     Returns:
         Tuple ``(m, h)``.
     """
     projected = self.W_e(embeded_x) + self.W_h(h_prev)
     lstm_in = F.dropout(projected, ratio=self.dropout_ratio)
     m_tmp, h_tmp = F.lstm(m_prev, lstm_in)
     return (F.where(feed_previous, m_prev, m_tmp),
             F.where(feed_previous, h_prev, h_tmp))
예제 #13
0
 def __call__(self, y, c_pre, h_pre, train=True):
     """Run one decoder LSTM step and project the hidden state.

     Args:
         y: Input ids; rows equal to -1 leave the state unchanged.
         c_pre: Previous cell state.
         h_pre: Previous hidden state.
         train: Forwarded to ``F.dropout``.

     Returns:
         Tuple ``(self.fy(f), c_next, h_next)``.
     """
     embedded = F.tanh(self.ye(y))
     lstm_in = F.dropout(self.eh(embedded), ratio=0.1, train=train) \
         + self.hh(h_pre)
     c_tmp, h_tmp = F.lstm(c_pre, lstm_in)

     # Column mask: True for real tokens, False for the -1 padding id.
     is_real = (y.data != -1).reshape(len(y), 1)
     enable = chainer.Variable(is_real)
     c_next = F.where(enable, c_tmp, c_pre)
     h_next = F.where(enable, h_tmp, h_pre)

     features = F.tanh(self.hf(h_next))
     return self.fy(features), c_next, h_next
예제 #14
0
    def __call__(self, x_block, y_in_block, y_out_block):
        """Compute the training loss for one minibatch and report it.

        Args:
            x_block: Source id block; entries ``>= 0`` are treated as valid
                when building the attention mask.
            y_in_block: Decoder input id block; entries ``>= 0`` are valid.
            y_out_block: Target sequences, concatenated along axis 0 for the
                cross-entropy.

        Returns:
            Summed softmax cross-entropy divided by ``len(x_block)``.
            ``loss`` and ``perp`` are reported through ``chainer.report``.
        """

        batch = len(x_block)
        # Embed source and (two separate) target inputs, with dropout.
        ex_block = F.dropout(self.make_input_embedding(self.embed_x, x_block),
                             self.dropout)
        ey_block = F.dropout(
            self.make_input_embedding(self.embed_y, y_in_block), self.dropout)
        eyy_block = F.dropout(
            self.make_input_embedding(self.embed_yy, y_in_block), self.dropout)
        eys = F.transpose(ey_block, (0, 2, 1))
        eyys = F.transpose(eyy_block, (0, 2, 1))
        # Encoder: stack of self.gcnn layers applied on an added axis 1.
        h = F.expand_dims(ex_block, axis=1)
        for i in range(self.stack):
            h = self.gcnn[i](h)
        h = F.dropout(F.squeeze(h, axis=1), self.dropout)
        # Decoders (presumably NStepLSTM links -- TODO confirm): they take a
        # list with one sequence per sample.
        eys2 = [i for i in eys]
        eyys2 = [i for i in eyys]
        _, _, oss = self.decoder(None, None, eys2)
        _, _, oss2 = self.decoder2(None, None, eyys2)
        ss = F.stack(oss, axis=0)
        ss2 = F.stack(oss2, axis=0)
        # Zero the decoder states at target positions with a negative id.
        # NOTE(review): this uses self.batch while `batch` above comes from
        # len(x_block); the two presumably must match -- confirm.
        mask = (y_in_block[:, :, None] >= 0) * self.xp.ones(
            (self.batch, 1, self.n_units), dtype=bool)
        ss = F.where(mask, ss, self.xp.full(ss.shape, 0, 'f'))
        # Attention scores between decoder states and encoder output.
        batch_A = F.batch_matmul(ss, h) * self.scale_score
        # Valid (target, source) pairs; the source slice is shortened by
        # stack * (width - 1) columns.
        mask = (x_block[:, 0:len(x_block[0]) - self.stack *
                        (self.width - 1)][:, None, :] >=
                0) * (y_in_block[:, :, None] >= 0)
        batch_A = F.where(mask, batch_A,
                          self.xp.full(batch_A.shape, -self.xp.inf, 'f'))
        batch_A = F.softmax(batch_A, axis=2)
        # Replace NaN entries of the softmax output (presumably from rows
        # that are entirely -inf) with zeros.
        batch_A = F.where(self.xp.isnan(batch_A.data),
                          self.xp.zeros(batch_A.shape, 'f'), batch_A)
        # Weighted sum of encoder features using the attention weights.
        batch_A, h = F.broadcast(batch_A[:, None], h[:, :, None])
        batch_C = F.sum(batch_A * h, axis=3)

        # Flatten the batch axis into the sequence axis for both inputs of
        # the output layer.
        e = F.transpose(batch_C, (0, 2, 1))
        e = F.squeeze(F.concat(F.split_axis(e, self.batch, axis=0), axis=1))
        ss2 = F.squeeze(F.concat(F.split_axis(ss2, self.batch, axis=0),
                                 axis=1))
        t = (self.We(e) + self.Ws(ss2))
        t = F.dropout(t, self.dropout)

        concat_ys_out = F.concat(y_out_block, axis=0)
        loss = F.sum(F.softmax_cross_entropy(t, concat_ys_out,
                                             reduce='no')) / batch

        chainer.report({'loss': loss.data}, self)
        # Perplexity is computed from the per-word loss.
        n_words = concat_ys_out.shape[0]
        perp = self.xp.exp(loss.data * batch / n_words)
        chainer.report({'perp': perp}, self)
        return loss
예제 #15
0
 def __call__(self, y, c_pre, h_pre, hs_enc):
     """Run one attention-decoder step.

     Args:
         y: Input ids; rows equal to -1 leave the state unchanged.
         c_pre: Previous cell state.
         h_pre: Previous hidden state.
         hs_enc: Encoder states passed to ``self.calculate_alpha``.

     Returns:
         Tuple ``(self.fy(f), c_next, h_next)``.
     """
     embedded = F.tanh(self.ye(y))
     lstm_in = self.eh(embedded) + self.hh(h_pre)
     c_tmp, h_tmp = F.lstm(c_pre, lstm_in)

     # Column mask: True for real tokens, False for the -1 padding id.
     is_real = (y.data != -1).reshape(len(y), 1)
     enable = chainer.Variable(is_real)
     c_next = F.where(enable, c_tmp, c_pre)
     h_next = F.where(enable, h_tmp, h_pre)

     # Combine the attention context with the new hidden state.
     context = self.calculate_alpha(h_next, hs_enc)
     features = F.tanh(self.wc(context) + self.wh(h_next))
     return self.fy(features), c_next, h_next
예제 #16
0
 def __call__(self, embeded_x, m_prev, h_prev, x):
     """Run one LSTM step, updating only rows with a real input id.

     Args:
         embeded_x: Embedded input for this step.
         m_prev: Previous memory cell.
         h_prev: Previous hidden state.
         x: Input ids; rows equal to ``IGNORE_LABEL`` keep the old state.

     Returns:
         Tuple ``(m, h)``.
     """
     batch_size = embeded_x.shape[0]
     m_tmp, h_tmp = F.lstm(m_prev, self.W(embeded_x) + self.U(h_prev))

     # Per-row flag, broadcast over the hidden dimension.
     is_real = F.expand_dims(x.data != IGNORE_LABEL, -1)
     feed_prev = F.broadcast_to(is_real, (batch_size, self.hidden_size))

     return (F.where(feed_prev, m_tmp, m_prev),
             F.where(feed_prev, h_tmp, h_prev))
예제 #17
0
    def __call__(self, x, enc_out=None, mask=None):
        """Multi-head attention implemented with kernel-size-1 convolutions.

        Args:
            x: Main features of the model; per the original docstring a
                Variable in (batch, hidden_dim, length).
            enc_out: Hidden features from the encoder, same layout as ``x``.
                Read only when ``self.self_attention`` is false.
            mask: Padding or future mask as an xp array in
                (batch, length, length); an element is False for pad/future
                positions and True otherwise. ``None`` (the default) skips
                masking instead of failing inside ``concatenate``.

        Returns:
            The attended features after the output projection
            ``self.linear``. A deep copy of the attention weights is stored
            in ``self.attention_weight``.
        """
        # ksize-1 convolution results in parallel linear projections
        if self.self_attention:
            qkv = F.squeeze(self.W(F.expand_dims(x, axis=3)), axis=3)
            query, key, value = F.split_axis(qkv, 3, axis=1)
        else:
            query = F.squeeze(self.W_Q(F.expand_dims(x, axis=3)), axis=3)
            kv = F.squeeze(self.W_KV(F.expand_dims(enc_out, axis=3)), axis=3)
            key, value = F.split_axis(kv, 2, axis=1)

        # make q, k, v into (batch*parallel, dim/parallel, length) shape
        query = F.concat(F.split_axis(query, self.parallel_num, axis=1),
                         axis=0)
        key = F.concat(F.split_axis(key, self.parallel_num, axis=1), axis=0)
        value = F.concat(F.split_axis(value, self.parallel_num, axis=1),
                         axis=0)

        attention_weight = F.batch_matmul(query, key, transa=True) * self.scale
        if mask is not None:
            # Repeat the mask once per head so it lines up with the stacked
            # batch axis, then push masked scores to -inf.
            mask = self.xp.concatenate([mask] * self.parallel_num, axis=0)
            attention_weight = F.where(
                mask, attention_weight,
                self.xp.full(attention_weight.shape, -np.inf,
                             dtype=np.float32))
        attention_weight = F.softmax(attention_weight, axis=2)
        attention_weight = F.dropout(attention_weight, self.dropout_rate)
        # Replace NaN weights (rows whose scores were all -inf) with zeros.
        attention_weight = F.where(
            self.xp.isnan(attention_weight.data),
            self.xp.full(attention_weight.shape, 0, dtype=np.float32),
            attention_weight)
        self.attention_weight = copy.deepcopy(attention_weight.data)

        # attention: (batch, q-length, k-length) -> (batch, 1, q-length, k-length)
        # value: (batch, dim/parallel, k-length) -> (batch, dim/parallel, 1, k-length)
        attention_weight, value = F.broadcast(attention_weight[:, None],
                                              value[:, :, None])
        weighted_sum = F.sum(attention_weight * value, axis=3)
        # Undo the head stacking: move heads back onto the feature axis.
        weighted_sum = F.concat(F.split_axis(weighted_sum,
                                             self.parallel_num,
                                             axis=0),
                                axis=1)

        weighted_sum = F.squeeze(self.linear(
            F.expand_dims(weighted_sum, axis=3)),
                                 axis=3)
        return weighted_sum
 def step(self, y, embeded_y, m_prev, s_prev, batch_size):
     """Decode one step, keeping the old state for ignored inputs.

     Args:
         y: Input ids; rows equal to ``self.ignore_label`` keep the
             previous state.
         embeded_y: Embedded form of ``y``.
         m_prev: Previous memory cell.
         s_prev: Previous hidden state.
         batch_size: Number of rows used to broadcast the keep-flag.

     Returns:
         Tuple ``(m, s)``.
     """
     projected = self.W_e(embeded_y) + self.W_s(s_prev)
     lstm_in = F.dropout(projected, ratio=self.dropout_ratio)
     m_tmp, s_tmp = F.lstm(m_prev, lstm_in)

     # Per-row flag, broadcast over the hidden dimension.
     is_ignored = F.expand_dims(y.data == self.ignore_label, -1)
     feed_previous = F.broadcast_to(
         is_ignored, (batch_size, self.decoder_hidden_size))

     return (F.where(feed_previous, m_prev, m_tmp),
             F.where(feed_previous, s_prev, s_tmp))
예제 #19
0
def _log_ndtr(x):
    """Log CDF of the standard normal distribution.

    Three element-wise regimes are combined with ``F.where``:
    ``x > 6`` uses ``-ndtr(-x)``, ``-14 < x <= 6`` uses ``log(ndtr(x))`` and
    the remaining elements use
    ``-x^2 / 2 - log(-x) - log(2 * pi) / 2``.

    See https://github.com/scipy/scipy/blob/master/scipy/special/cephes/ndtr.c
    """
    if not isinstance(x, chainer.Variable):
        x = chainer.Variable(x)

    upper_tail = -_ndtr(-x)
    central = _safe_log(_ndtr(x))
    lower_tail = -0.5 * x * x - _safe_log(-x) - 0.5 * np.log(2 * np.pi)

    below_upper = F.where(x.data > -14, central, lower_tail)
    return F.where(x.data > 6, upper_tail, below_upper)
예제 #20
0
 def log_prob(self, x):
     """Return the log-probability of ``x``, summed over axis 1.

     Elements at or below ``self.low`` use the Gaussian log-CDF evaluated
     at ``self.low``; elements at or above ``self.high`` use the Gaussian
     log survival function at ``self.high``; all others use the
     element-wise Gaussian log-density.
     """
     std = self.var**0.5
     interior = elementwise_gaussian_log_pdf(
         x, self.mean, self.var, self.ln_var)
     at_low = _gaussian_log_cdf(self.low, self.mean, std)
     at_high = _gaussian_log_sf(self.high, self.mean, std)

     x_data = _unwrap_variable(x)
     above_low = F.where(x_data >= self.high.data, at_high, interior)
     per_element = F.where((x_data <= self.low.data), at_low, above_low)
     return F.sum(per_element, axis=1)
예제 #21
0
 def __call__(self, y, t, c_pre, h_pre, hs_enc, train=True):
     """Run one attention-decoder step.

     Args:
         y: Input ids; rows equal to -1 leave the state unchanged.
         t: Targets passed to ``self.fy`` in training mode.
         c_pre: Previous cell state.
         h_pre: Previous hidden state.
         hs_enc: Encoder states passed to ``self.calculate_alpha``.
         train: Selects dropout behaviour and which output is returned.

     Returns:
         ``(self.fy(f, t), c_next, h_next)`` when ``train`` is truthy (a
         loss value), otherwise ``(self.test_out(f), c_next, h_next)``
         (prediction vectors).
     """
     embedded = F.tanh(self.ye(y))
     lstm_in = F.dropout(self.eh(embedded), ratio=0.2, train=train) \
         + self.hh(h_pre)
     c_tmp, h_tmp = F.lstm(c_pre, lstm_in)

     # Column mask: True for real tokens, False for the -1 padding id.
     is_real = (y.data != -1).reshape(len(y), 1)
     enable = chainer.Variable(is_real)
     c_next = F.where(enable, c_tmp, c_pre)
     h_next = F.where(enable, h_tmp, h_pre)

     context = self.calculate_alpha(h_next, hs_enc)
     features = F.tanh(self.wc(context) + self.wh(h_next))

     output = self.fy(features, t) if train else self.test_out(features)
     return output, c_next, h_next
예제 #22
0
def batched_triangle_intersect_(p0, p1, p2, eps, fn, id, ro, rd, t0, t1):
    """Intersect a batch of triangles with a grid of rays.

    Args:
        p0, p1, p2: Triangle vertices; each is reshaped to (BB, 3, 1, 1).
        eps: Per-triangle threshold below which |rd . fn| is replaced.
        fn: Per-triangle normals, reshaped to (BB, 3, 1, 1).
        id: Per-triangle ids, reshaped to (BB, 1, 1, 1).
        ro: Ray origins, reshaped to (1, 3, H, W).
        rd: Ray directions, reshaped to (1, 3, H, W).
        t0, t1: Lower / upper ray-parameter bounds, reshaped to (1, 1, H, W).

    Returns:
        Tuple ``(b, t, p, n, id)``: the hit mask, the ray parameter (``t1``
        where there is no hit), the point ``ro + t * rd``, the normal
        flipped against the ray direction, and the broadcast ids.
    """
    xp = chainer.backend.get_array_module(ro)
    BB = p0.shape[0]
    # NOTE(review): EB is read from p0, not from eps, so it always equals BB.
    EB = p0.shape[0]
    _, _, H, W = ro.shape[:4]

    # Broadcast per-triangle and per-ray inputs to a common (BB, *, H, W).
    p0 = F.broadcast_to(p0.reshape((BB, 3, 1, 1)), (BB, 3, H, W))
    p1 = F.broadcast_to(p1.reshape((BB, 3, 1, 1)), (BB, 3, H, W))
    p2 = F.broadcast_to(p2.reshape((BB, 3, 1, 1)), (BB, 3, H, W))
    fn = F.broadcast_to(fn.reshape((BB, 3, 1, 1)), (BB, 3, H, W))
    id = F.broadcast_to(id.reshape((BB, 1, 1, 1)), (BB, 1, H, W))
    eps = F.broadcast_to(eps.reshape((EB, 1, 1, 1)), (BB, 1, H, W))
    ro = F.broadcast_to(ro.reshape((1, 3, H, W)), (BB, 3, H, W))
    rd = F.broadcast_to(rd.reshape((1, 3, H, W)), (BB, 3, H, W))
    t0 = F.broadcast_to(t0.reshape((1, 1, H, W)), (BB, 1, H, W))
    t1 = F.broadcast_to(t1.reshape((1, 1, H, W)), (BB, 1, H, W))

    aa = p0 - ro

    # Ray/plane parameter is A / B with A = (p0 - ro) . fn and B = rd . fn
    # (assuming vdot is a channel-wise dot product -- TODO confirm).
    A = vdot(aa, fn)
    B = vdot(rd, fn)
    # Replace near-zero denominators by eps before dividing.
    B = F.where(xp.abs(B.data) < eps.data, eps, B)

    #tx = F.where((xp.abs(A.data) < 1e-6)&(xp.abs(B.data) < 1e-6), t1, A / B)

    # Clamp the ray parameter into [t0, t1] and evaluate the point.
    tx = F.maximum(t0, F.minimum(A / B, t1))
    p = ro + tx * rd

    # Inside test on raw arrays (.data), so it is outside the graph:
    # vectors from the point to each vertex and their pairwise cross products.
    e0 = p0.data - p.data
    e1 = p1.data - p.data
    e2 = p2.data - p.data
    n01 = vcross_(e0, e1, xp)
    n12 = vcross_(e1, e2, xp)
    n20 = vcross_(e2, e0, xp)

    # Positive dot products between the cross products are required for
    # every pair.
    MASK_P = is_positive_(vdot_(n01, n12, xp))
    MASK_Q = is_positive_(vdot_(n12, n20, xp))
    MASK_R = is_positive_(vdot_(n20, n01, xp))

    MASK_B = is_positive_(xp.abs(B.data))

    #MASK_TN = is_positive(tx)
    # The parameter must lie strictly between t0 and t1.
    MASK_T0 = is_positive_(tx.data - t0.data)
    MASK_T1 = is_positive_(t1.data - tx.data)

    b = MASK_P & MASK_Q & MASK_R & MASK_B & MASK_T0 & MASK_T1

    # Misses fall back to t1.
    t = F.where(b, tx, t1)
    p = ro + t * rd

    # Orient the normal against the ray direction.
    n = -xp.sign(vdot_(rd.data, fn.data, xp)) * fn

    return b, t, p, n, id
예제 #23
0
def _ndtr(a):
    """CDF of the standard normal distribution.

    With ``x = a / sqrt(2)``: elements with ``|x| < 1 / sqrt(2)`` use
    ``0.5 + 0.5 * erf(x)``; the others use ``0.5 * erfc(|x|)``, mirrored
    to ``1 - 0.5 * erfc(|x|)`` for positive ``x``.

    See https://github.com/scipy/scipy/blob/master/scipy/special/cephes/ndtr.c
    """
    if not isinstance(a, chainer.Variable):
        a = chainer.Variable(a)

    x = a * NPY_SQRT1_2
    z = abs(x)
    half_erfc_z = 0.5 * F.erfc(z)

    near_zero = 0.5 + 0.5 * F.erf(x)
    in_tails = F.where(x.data > 0, 1.0 - half_erfc_z, half_erfc_z)
    return F.where(z.data < NPY_SQRT1_2, near_zero, in_tails)
예제 #24
0
def mixture_of_discretized_logistics_nll(x, y):
    """Negative log-likelihood under a mixture of discretized logistics.

    Args:
        x: (b, c, n, n) target array; channels 0, 1 and 2 are indexed, and
            the reshape below only matches when c == 3.
        y: (b, 10*n_mix, n, n) network output holding, per mixture
            component, one logit plus means, log-scales and coefficients.

    Returns:
        The negated batch mean of the per-sample log-likelihood summed over
        both spatial axes.
    """
    xp = get_array_module(x)
    n_mix = y.shape[1] // 10
    # The first n_mix channels are the mixture logits; the remaining
    # 9 * n_mix are regrouped as (b, c, n, n, 3 * n_mix).
    logit_prob = y[:, :n_mix, :, :]
    y = F.reshape(y[:, n_mix:, :, :], x.shape + (n_mix * 3, ))
    mean = y[:, :, :, :, 0:n_mix]
    log_scale = y[:, :, :, :, n_mix:2 * n_mix]
    # Clamp the log-scale from below at -7.
    log_scale = F.maximum(log_scale, -7 * xp.ones(log_scale.shape, dtype='f'))
    coeff = F.tanh(y[:, :, :, :, 2 * n_mix:3 * n_mix])

    # Repeat the target along a new trailing mixture axis.
    x = xp.repeat(xp.expand_dims(x, 4), n_mix, 4)
    # Channel 1's mean depends linearly on channel 0; channel 2's mean on
    # channels 0 and 1.
    m1 = F.expand_dims(mean[:, 0, :, :, :], 1)
    m2 = F.expand_dims(
        mean[:, 1, :, :, :] + coeff[:, 0, :, :, :] * x[:, 0, :, :, :], 1)
    m3 = F.expand_dims(
        (mean[:, 2, :, :, :] + coeff[:, 1, :, :, :] * x[:, 0, :, :, :] +
         coeff[:, 2, :, :, :] * x[:, 1, :, :, :]), 1)
    mean = F.concat([m1, m2, m3])
    centered_x = x - mean
    inv_std = F.exp(-log_scale)
    # Logistic CDF at the upper / lower edge of the bin (half-width 1/255).
    max_in = inv_std * (centered_x + 1. / 255.)
    cdf_max = F.sigmoid(max_in)
    min_in = inv_std * (centered_x - 1. / 255.)
    cdf_min = F.sigmoid(min_in)
    log_cdf_max = max_in - F.softplus(max_in)  # used where x < -0.999
    log_one_minus_cdf_min = -F.softplus(min_in)  # used where x > 0.999
    cdf_delta = cdf_max - cdf_min  # probability mass of an interior bin
    mid_in = inv_std * centered_x
    # Log-density at the bin centre; fallback when cdf_delta is tiny.
    log_pdf_mid = mid_in - log_scale - 2. * F.softplus(mid_in)  # mid

    # Select per element: lowest bin, highest bin, interior bin mass, or
    # the density-based approximation when the mass is <= 1e-5.
    log_prob = F.where(
        x < -0.999, log_cdf_max,
        F.where(
            x > 0.999, log_one_minus_cdf_min,
            F.where(
                cdf_delta.array > 1e-5,
                F.log(
                    F.maximum(cdf_delta,
                              xp.ones(cdf_delta.shape, dtype='f') * 1e-12)),
                log_pdf_mid - xp.log(127.5))))

    # Sum over channels, move the mixture axis to position 1 and add the
    # mixture log-weights (log_prob_from_logit is defined elsewhere).
    log_prob = F.transpose(F.sum(log_prob, 1), (0, 3, 1, 2))
    log_prob = log_prob + log_prob_from_logit(logit_prob)

    # Marginalise over mixture components, then sum over spatial positions.
    loss = F.logsumexp(log_prob, 1)
    loss = F.sum(loss, axis=(1, 2))
    return -F.mean(loss)
예제 #25
0
    def __call__(self, x, t, qt=None):
        """Encode, quantize, decode and return the three loss terms.

        Args:
            x: Encoder input (also the decoder input in 'mixture' mode).
            t: Target; its axis 2 length determines the upsampling factor.
            qt: Decoder input used in 'mulaw' mode.

        Returns:
            Tuple ``(loss1, loss2, loss3)``. All three and their sum are
            reported through ``chainer.reporter.report``.
        """
        # forward
        z = self.enc(x)
        e = self.vq(z)
        # Second quantization pass on a copy of z cut off from the graph.
        e_ = self.vq(chainer.Variable(z.data))
        # Upsample the code along axis 2 to the target length.
        scale = t.shape[2] // e.shape[2]
        # NOTE(review): any other self.quantize value leaves y_hat / loss1
        # unbound and fails further down.
        if self.quantize == 'mulaw':
            y_hat = self.dec(qt, F.unpooling_2d(e, (scale, 1),
                                                cover_all=False))
        elif self.quantize == 'mixture':
            y_hat = self.dec(x, F.unpooling_2d(e, (scale, 1), cover_all=False))

        # calculate loss
        if self.quantize == 'mulaw':
            loss1 = F.softmax_cross_entropy(y_hat, t)
        elif self.quantize == 'mixture':
            # First 30 channels: logits, means and log-scales (10 each).
            y_hat = y_hat[:, :30]
            logit_probs, means, log_scales = F.split_axis(y_hat, 3, 1)
            # Clamp the log-scales from below at -7.
            log_scales = F.relu(log_scales + 7) - 7
            y = F.broadcast_to(t, means.shape)

            centered_y = y - means
            inv_stdv = F.exp(-log_scales)
            # Logistic CDF at the bin edges (half-width 1 / 2**16).
            plus_in = inv_stdv * (centered_y + 1 / (2**16))
            cdf_plus = F.sigmoid(plus_in)
            min_in = inv_stdv * (centered_y - 1 / (2**16))
            cdf_min = F.sigmoid(min_in)

            log_cdf_plus = plus_in - F.softplus(plus_in)
            log_one_minus_cdf_min = -F.softplus(min_in)

            cdf_delta = cdf_plus - cdf_min
            # Clamp the bin mass from below at 1e-12 before the logarithm.
            cdf_delta = F.relu(cdf_delta - 1e-12) + 1e-12

            # Raw array of the target, used only to build the conditions.
            y = F.broadcast_to(t, log_cdf_plus.shape).array
            log_probs = F.where(
                y < -0.999, log_cdf_plus,
                F.where(y > 0.999, log_one_minus_cdf_min, F.log(cdf_delta)))
            log_probs = log_probs + F.log_softmax(logit_probs)
            loss1 = -F.mean(log_probs)
        # loss2: gradient reaches only e_ (z is detached);
        # loss3: gradient reaches only z (e is detached), weighted by beta.
        loss2 = F.mean((chainer.Variable(z.data) - e_)**2)
        loss3 = self.beta * F.mean((z - chainer.Variable(e.data))**2)
        loss = loss1 + loss2 + loss3
        chainer.reporter.report(
            {
                'loss1': loss1,
                'loss2': loss2,
                'loss3': loss3,
                'loss': loss
            }, self)
        return loss1, loss2, loss3
예제 #26
0
    def __call__(self, x):
        """Updates the internal state and returns the LSTM outputs.

        Compared with the plain update kept as a comment below, the new
        state is merged with the old one through ``where`` using the
        condition ``x.data != -1``.

        Args:
            x (~chainer.Variable): A new batch from the input sequence.
        Returns:
            ~chainer.Variable: Outputs of updated LSTM units.
        Raises:
            TypeError: If the batch size of ``x`` exceeds the batch size of
                the stored hidden state.
        """
        # Lazily initialise parameters from the flattened input size.
        if self.upward.has_uninitialized_params:
            in_size = x.size // x.shape[0]
            self.upward._initialize_params(in_size)
            self._initialize_params()

        batch = x.shape[0]
        lstm_in = self.upward(x)
        h_rest = None
        if self.h is not None:
            h_size = self.h.shape[0]
            if batch == 0:
                h_rest = self.h
            elif h_size < batch:
                msg = ('The batch size of x must be equal to or less than the '
                       'size of the previous state h.')
                raise TypeError(msg)
            elif h_size > batch:
                # Only the first `batch` rows take part in this step; the
                # remaining rows are carried over unchanged.
                h_update, h_rest = split_axis.split_axis(self.h, [batch],
                                                         axis=0)
                lstm_in += self.lateral(h_update)
            else:
                lstm_in += self.lateral(self.h)
        if self.c is None:
            xp = self.xp
            self.c = variable.Variable(xp.zeros((batch, self.state_size),
                                                dtype=x.dtype),
                                       volatile='auto')
        # self.c, y = lstm.lstm(self.c, lstm_in)

        c, y = lstm.lstm(self.c, lstm_in)
        # Keep the previous state wherever the input equals -1.
        # NOTE(review): `enable` has the shape of x while c / y have the
        # state shape, and self.h may have more rows than y when
        # h_size > batch -- confirm these are compatible for `where`.
        enable = (x.data != -1)
        self.c = where(enable, c, self.c)
        if self.h is not None:
            y = where(enable, y, self.h)

        # Re-attach the rows that did not take part in this step.
        if h_rest is None:
            self.h = y
        elif len(y.data) == 0:
            self.h = h_rest
        else:
            self.h = concat.concat([y, h_rest], axis=0)

        return y
예제 #27
0
    def __call__(self, x, z, mask):
        """Multi-head attention computed head by head (split version).

        Input shapes (as documented by the original author):
            q=(b, units, n_querys), k=(b, units, n_keys),
            m=(b, n_querys, n_keys)

        Args:
            x: Source of the queries.
            z: Source of the keys and values.
            mask: Boolean condition for ``F.where``; False entries are
                excluded from the softmax.

        Returns:
            The per-head results concatenated along axis 1.
        """
        query = seq_func(self.W_Q, x)
        key = seq_func(self.W_K, z)
        value = seq_func(self.W_V, z)
        batch, n_units, n_querys = query.shape
        n_keys = key.shape[-1]

        head_units = n_units // self.h
        scale = head_units**0.5
        expanded_shape = (batch, head_units, n_querys, n_keys)

        # Each split yields self.h chunks of (b, n_units // h, length).
        heads = zip(F.split_axis(query, self.h, axis=1),
                    F.split_axis(key, self.h, axis=1),
                    F.split_axis(value, self.h, axis=1))

        c_list = []
        for q, k, v in heads:
            # Scaled dot-product scores: (b, n_querys, n_keys)
            pre_a = F.batch_matmul(q, k, transa=True)
            pre_a /= scale
            neg_inf = self.xp.full(pre_a.shape, -np.inf, pre_a.dtype)
            a = F.softmax(F.where(mask, pre_a, neg_inf), axis=2)

            # A row that is entirely -inf turns into NaN after the softmax,
            # so mask those entries again with zeros.
            a = F.where(self.xp.isnan(a.data),
                        self.xp.zeros(a.shape, dtype=a.dtype), a)
            a = F.dropout(a, ratio=self.dropout)

            # Broadcast both operands to (b, n_units // h, n_querys, n_keys).
            v = F.broadcast_to(v[:, :, None], expanded_shape)
            a = F.broadcast_to(a[:, None], expanded_shape)

            # Weighted sum over the keys: (b, n_units // h, n_querys)
            c_list.append(F.sum(a * v, axis=3))
        return F.concat(c_list, axis=1)
예제 #28
0
 def __call__(self, y, m_prev, s_prev, h_forward, h_backword, enable, disable_value):
     """Run one attention-decoder step.

     Args:
         y: Input ids; rows equal to ``IGNORE_LABEL`` keep the old state.
         m_prev: Previous LSTM memory cell.
         s_prev: Previous hidden output.
         h_forward, h_backword, enable, disable_value: Passed unchanged to
             ``self._attention`` together with ``s_prev``.

     Returns:
         Tuple ``(self.W_o(t), m, s)``.
     """
     # Attention context from the encoder states and the previous output.
     context = self._attention(
         h_forward, h_backword, s_prev, enable, disable_value)

     # Decode once.
     embeded_y = self.E(y)
     batch_size = y.shape[0]
     m_tmp, s_tmp = F.lstm(
         m_prev, self.W(embeded_y) + self.U(s_prev) + self.C(context))

     # Per-row flag, broadcast over the hidden dimension.
     is_real = F.expand_dims(y.data != IGNORE_LABEL, -1)
     feed_prev = F.broadcast_to(is_real, (batch_size, self.hidden_size))
     m = F.where(feed_prev, m_tmp, m_prev)
     s = F.where(feed_prev, s_tmp, s_prev)

     pre_output = self.U_o(s) + self.V_o(embeded_y) + self.C_o(context)
     return self.W_o(pre_output), m, s
예제 #29
0
    def wsd_with_tc(self, sent, trf_encoded_matrix, labels):
        """Run WSD, inject the predicted sense embeddings and classify text.

        Args:
            sent: Input passed to ``self.wsd_model`` in the
                "TRF-Sequential" configuration.
            trf_encoded_matrix: Encoded document matrix; entries at labelled
                positions are replaced by sense embeddings.
            labels: Nested sense labels; "<PAD>" marks words without a sense
                label.

        Returns:
            ``(y_tc, y_wsd)`` for the "TRF-Multi" / "TRF-Delay-Multi" model
            types, otherwise only ``y_tc``.
        """

        ### WSD ###

        # NOTE(review): any other model_type leaves y_wsd unbound.
        if self.model_type == "TRF-Multi" or self.model_type == "TRF-Delay-Multi":
            y_wsd = self.wsd_only(trf_encoded_matrix, labels)
        elif self.model_type == "TRF-Sequential":
            y_wsd, task_type = self.wsd_model(sent, None, None,
                                              True)  ## sequential loading

        y_wsd_soft = F.softmax(y_wsd)  ## apply softmax to the predictions
        argmax_wsd = F.argmax(y_wsd_soft, axis=1)  ## index of the maximum
        cond = chainer.Variable(
            self.xp.array([
                True if i != "<PAD>" else False for i in list(chain(*labels))
            ]))  ## condition for ignoring words that carry no sense label
        # Unlabelled words get id -1, which embed_id ignores below.
        pad_array = chainer.Variable(
            -1 * self.xp.ones(argmax_wsd.shape, dtype=argmax_wsd.dtype))
        pad_array_argmax_wsd = F.where(cond, argmax_wsd, pad_array)

        sense_label_embed = F.embed_id(x=pad_array_argmax_wsd,
                                       W=self.xp.array(
                                           self.lookup_table_sense_fixed),
                                       ignore_label=-1)  ## fixed table

        # Bring the embeddings into the layout of trf_encoded_matrix.
        sense_label_embed = sense_label_embed.reshape(
            trf_encoded_matrix.shape[0], trf_encoded_matrix.shape[-1], -1)
        origin_shape = sense_label_embed.shape
        sense_label_embed = F.moveaxis(sense_label_embed, 1, 2)

        ## Replacement ##
        # Expand the per-word condition to the same layout as the matrix.
        cond_reshape = cond.reshape(cond.shape[0], -1)
        cond_reshape = F.broadcast_to(
            cond_reshape, (cond_reshape.shape[0], trf_encoded_matrix.shape[1]))
        cond_reshape = cond_reshape.reshape(origin_shape)
        cond_reshape = F.swapaxes(cond_reshape, 1, 2)
        replaced_trf_matrix = F.where(cond_reshape, sense_label_embed,
                                      trf_encoded_matrix)

        ### Feed the WSD predictions into TC ###
        tc = replaced_trf_matrix  ## document matrix after the replacement

        ### TC ###
        tc_features = F.sum(tc, axis=2)  ## TC features
        y_tc = self.fc2(tc_features)  ### TC prediction

        return (y_tc, y_wsd) if (self.model_type == "TRF-Multi") or (
            self.model_type == "TRF-Delay-Multi") else y_tc
예제 #30
0
    def compute_context_vector(self, batches=True):
        """Compute the attention context vector for the current decoder state.

        Args:
            batches: When true, use the batched path, which masks padded
                positions via ``self.mask`` / ``self.minf`` before the
                softmax. Otherwise use the unbatched path, which only
                supports ``SOFT_ATTN``.

        Returns:
            A tuple ``(cv, alphas)`` of the context vector and the attention
            weights over the encoder states.

        Raises:
            NotImplementedError: In the unbatched path when ``self.attn`` is
                not ``SOFT_ATTN`` (previously this printed a message and then
                failed on an unbound ``cv``).
        """
        # Hidden state of the last decoder LSTM layer.
        dec_h = self[self.lstm_dec[-1]].h
        batch_size, n_units = dec_h.shape

        if batches:
            # Attention scores, with pad ids masked out before the softmax.
            weights = F.batch_matmul(self.enc_states, dec_h)
            weights = F.where(self.mask, weights, self.minf)
            alphas = F.softmax(weights)

            # Weighted sum of the encoder states.
            cv = F.reshape(F.batch_matmul(F.swapaxes(self.enc_states, 2, 1),
                                          alphas),
                           shape=(batch_size, n_units))
        else:
            alphas = F.softmax(
                F.matmul(dec_h, self.enc_states, transb=True))
            if self.attn != SOFT_ATTN:
                raise NotImplementedError(
                    "unbatched context vector is only implemented for "
                    "soft attention")
            cv = F.batch_matmul(self.enc_states, F.transpose(alphas))
            cv = F.transpose(F.sum(cv, axis=0))

        return cv, alphas
예제 #31
0
    def __call__(self, h, adj, deg_conds):
        """Apply one degree-conditioned update to the atom features.

        Args:
            h: Atom features, documented by the original author as
                (minibatch, atom, ch) with ``ch`` of size hidden_dim.
            adj: Adjacency matrices, documented as (minibatch, atom, atom).
            deg_conds: Iterable of boolean masks, one per entry of
                ``self.graph_linears``, selecting which elements each linear
                layer sees.

        Returns:
            The sigmoid of the summed per-degree linear outputs.
        """
        # --- Message part ---
        # Sum over adjacent atoms; fv is (minibatch, atom, ch).
        fv = chainer_chemistry.functions.matmul(adj, h)

        # --- Update part ---
        if self.xp is numpy:
            zero_array = numpy.zeros(fv.shape, dtype=numpy.float32)
        else:
            # zeros_like expects an array, so pass the Variable's raw data
            # rather than the Variable itself.
            zero_array = self.xp.zeros_like(fv.data)

        # Keep fv where the degree condition holds, zero elsewhere.
        fvds = [functions.where(cond, fv, zero_array) for cond in deg_conds]

        out_h = 0
        for graph_linear, fvd in zip(self.graph_linears, fvds):
            out_h = out_h + graph_linear(fvd)

        # out_h shape (minibatch, max_num_atoms, hidden_dim)
        out_h = functions.sigmoid(out_h)
        return out_h
예제 #32
0
 def kld(self, vec_true, vec_compare):
     """Return the sum of ``vec_true * log(vec_true / vec_compare)``.

     Only elements where ``vec_true * vec_compare > 0`` contribute; every
     other element (where the term may be NaN or infinite) is replaced by
     zero before summing.

     The zero fill is created with the same shape, dtype and array module as
     the masked term, so the function is not tied to CPU float32 inputs of
     shape ``(n, 1)``.
     """
     include_nan = vec_true * F.log(vec_true / vec_compare)
     valid = chainer.Variable(vec_true.data * vec_compare.data > 0)

     xp = chainer.cuda.get_array_module(include_nan.data)
     zeros = chainer.Variable(xp.zeros_like(include_nan.data))
     return F.sum(F.where(valid, include_nan, zeros))
예제 #33
0
    def calc_attention(self, xs, ys, genre_exs, gender_exs, attn_linear):
        """Return per-sentence attention weights conditioned on genre and gender.

        The attention scores of all sentences are computed in one pass, split
        back per sentence using the lengths of ``xs``, padded with -1024,
        scaled by the projected condition feature, and normalised with a
        softmax over axis 1. Padded positions are reset to -1024 before the
        softmax.
        """
        pad_value = -1024

        # (total length of the batched sentences, word embedding dim)
        flat_ys = F.concat(ys, axis=0)
        scores = attn_linear(F.tanh(flat_ys))
        # (batchsize, proj_cond dim)
        cond_feature = self.proj_cond(F.concat((genre_exs, gender_exs)))

        # Split the flat scores back into one chunk per sentence.
        lengths = self.xp.array([len(x) for x in xs], dtype=self.xp.int32)
        boundaries = self.xp.cumsum(lengths)[:-1].tolist()
        per_sentence = F.split_axis(scores, boundaries, axis=0)
        padded = F.pad_sequence(per_sentence, padding=pad_value)

        # Remember the padded positions before the values get rescaled.
        is_padding = padded.array == pad_value
        gate = F.broadcast_to(cond_feature, padded.shape[:-1])
        padded = padded * F.expand_dims(gate, axis=-1)

        pad_fill = self.xp.full(padded.shape, pad_value,
                                dtype=self.xp.float32)
        padded = F.where(is_padding, pad_fill, padded)

        return F.softmax(padded, axis=1)
예제 #34
0
    def forward(self, e_var, s_var=None, mask=None, batch=1):
        """Core function of the multi-head attention layer.

        Args:
            e_var (chainer.Variable): Variable of input array.
            s_var (chainer.Variable): Variable of source array from encoder.
                When omitted, keys and values are computed from ``e_var``.
            mask (chainer.Variable): Attention mask.
            batch (int): Batch size.

        Returns:
            chainer.Variable: Output of multi-head attention layer.

        """
        xp = self.xp
        # Keys/values come from the encoder source when one is supplied,
        # otherwise from the input itself.
        kv_input = e_var if s_var is None else s_var

        # Projections reshaped to (batch, time1/2, head, d_k).
        Q = self.linear_q(e_var).reshape(batch, -1, self.h, self.d_k)
        K = self.linear_k(kv_input).reshape(batch, -1, self.h, self.d_k)
        V = self.linear_v(kv_input).reshape(batch, -1, self.h, self.d_k)

        queries = F.swapaxes(Q, 1, 2)
        keys_t = K.transpose(0, 2, 3, 1)
        scores = F.matmul(queries, keys_t) / np.sqrt(self.d_k)

        if mask is not None:
            # Repeat the mask for every head; masked scores get MIN_VALUE.
            mask = xp.stack([mask] * self.h, axis=1)
            fill = xp.full(scores.shape, MIN_VALUE, "f")
            scores = F.where(mask, scores, fill)

        self.attn = F.softmax(scores, axis=-1)
        p_attn = F.dropout(self.attn, self.dropout)
        attended = F.matmul(p_attn, F.swapaxes(V, 1, 2))
        merged = F.swapaxes(attended, 1, 2).reshape(-1, self.h * self.d_k)
        return self.linear_out(merged)
예제 #35
0
    def __call__(self, hx, cx, xs, enc_hs):
        """Decode ``xs`` with attention over the encoder states.

        Returns:
            A tuple ``(hy, cy, decode, alignment)``: the final LSTM states,
            the projected decoder outputs stacked along axis 1, and the
            attention weights (raw array) stacked along axis 1.
        """
        embedded = [self.embed(x) for x in xs]
        hy, cy, ys = self.Nlstm(hx, cx, embedded)

        dec_states = F.pad_sequence(ys, length=None, padding=0.0)
        enc_hs = F.pad_sequence(enc_hs, length=None, padding=0.0)

        # Encoder positions that are all zero are treated as padding and
        # get a large negative score.
        is_pad = self.xp.all(enc_hs.data == 0, axis=2, keepdims=True)
        pad_score = self.xp.full(is_pad.shape, -1024.0, dtype=self.xp.float32)

        n_batch = len(xs)
        alignment = []
        decode = []

        # Iterate over the decoder states one time step at a time.
        for y in F.transpose(dec_states, (1, 0, 2)):
            y = F.reshape(y, y.shape + (1,))
            score = F.where(is_pad, pad_score, F.matmul(enc_hs, y))
            align = F.softmax(score, axis=1)
            context_vector = F.matmul(enc_hs, align, True, False)
            joined = F.concat((y, context_vector), axis=1)
            t = self.W_c(F.dropout(joined, self.dropout))
            decode.append(self.proj(F.dropout(t, self.dropout)))
            alignment.append(F.reshape(align, (n_batch, -1)))

        decode = F.stack(decode, axis=1)
        alignment = F.stack(alignment, axis=1)
        return hy, cy, decode, alignment.data
예제 #36
0
    def __call__(self, h, adj, deg_conds):
        """Apply one degree-conditioned update to the atom features.

        Args:
            h: Atom features, documented by the original author as
                (minibatch, atom, ch) with ``ch`` of size hidden_dim.
            adj: Adjacency matrices, documented as (minibatch, atom, atom).
            deg_conds: Iterable of boolean masks, one per entry of
                ``self.graph_linears``, selecting which elements each linear
                layer sees.

        Returns:
            The sigmoid of the summed per-degree linear outputs.
        """
        # --- Message part ---
        # Sum over adjacent atoms; fv is (minibatch, atom, ch).
        fv = chainer_chemistry.functions.matmul(adj, h)

        # --- Update part ---
        if self.xp is numpy:
            zero_array = numpy.zeros(fv.shape, dtype=numpy.float32)
        else:
            # zeros_like expects an array, so pass the Variable's raw data
            # rather than the Variable itself.
            zero_array = self.xp.zeros_like(fv.data)

        # Keep fv where the degree condition holds, zero elsewhere.
        fvds = [functions.where(cond, fv, zero_array) for cond in deg_conds]

        out_h = 0
        for graph_linear, fvd in zip(self.graph_linears, fvds):
            out_h = out_h + graph_linear(fvd)

        # out_h shape (minibatch, max_num_atoms, hidden_dim)
        out_h = functions.sigmoid(out_h)
        return out_h
예제 #37
0
파일: model.py 프로젝트: musyoku/NLP
	def __call__(self, x, condition=None):
		"""Advance the LSTM by one step and return the hidden state.

		When ``condition`` is given and a hidden state already exists, only
		the entries where ``condition`` is true take the new cell/hidden
		values; the remaining entries keep their previous values.
		"""
		gate_input = self.upward(x)
		if self.h is not None:
			gate_input += self.lateral(self.h)
		if self.c is None:
			# Lazily create a zero cell state with one row per input row.
			zeros = self.xp.zeros((len(x.data), self.state_size), dtype=x.data.dtype)
			self.c = Variable(zeros, volatile="auto")

		new_c, new_h = F.lstm(self.c, gate_input)
		if condition is None or self.h is None:
			# No mask, or nothing to fall back to: take the new state as is.
			self.c, self.h = new_c, new_h
		else:
			self.h = F.where(condition, new_h, self.h)
			self.c = F.where(condition, new_c, self.c)
		return self.h
예제 #38
0
 def f(x, rois, roi_indices):
     """Run ``roi_max_align_2d`` and replace infinite outputs with zero."""
     pooled = functions.roi_max_align_2d(
         x, rois, roi_indices,
         outsize=self.outsize,
         spatial_scale=self.spatial_scale,
         sampling_ratio=self.sampling_ratio)
     xp = cuda.get_array_module(pooled)
     is_inf = xp.isinf(pooled.array)
     zeros = xp.zeros(pooled.shape, dtype=pooled.dtype)
     return functions.where(is_inf, zeros, pooled)
예제 #39
0
 def __accuracy(self, y, t):
     """Return the summed squared error between expected and target ratings.

     The expected rating is the softmax-weighted sum of the class indices
     along axis 1. Positions whose target is negative are excluded by
     substituting the target itself, so they contribute zero error.
     """
     xp = self.xp
     n_batch, n_class, n_items = y.data.shape

     # Class indices 0..n_class-1 laid out along axis 1 and repeated over
     # the other two axes.
     class_ids = np.arange(n_class, dtype=np.float32).reshape((1, -1, 1))
     class_ids = class_ids.repeat(n_batch, axis=0).repeat(n_items, axis=2)
     class_ids = Variable(xp.asarray(class_ids), volatile=True)

     probs = F.softmax(Variable(y.data, volatile=True))
     expected = F.sum(class_ids * probs, axis=1)

     has_target = Variable(t.data >= 0, volatile=True)
     target = Variable(t.data.astype(np.float32), volatile=True)
     expected = F.where(has_target, expected, target)
     return F.sum(((expected - target) * self.rating_unit) ** 2)
예제 #40
0
파일: test_where.py 프로젝트: hvy/chainer
    def check_forward(self, c_data, x_data, y_data):
        """Check ``functions.where`` against the array module's ``where``."""
        cond = chainer.Variable(c_data)
        if_true = chainer.Variable(x_data)
        if_false = chainer.Variable(y_data)

        actual = functions.where(cond, if_true, if_false)

        # Reference result computed directly on the raw arrays.
        expected = cond.xp.where(c_data, x_data, y_data)
        testing.assert_allclose(actual.array, expected)
예제 #41
0
    def check_forward(self, c_data, x_data, y_data):
        """Check ``functions.where`` element by element."""
        cond = chainer.Variable(c_data)
        if_true = chainer.Variable(x_data)
        if_false = chainer.Variable(y_data)

        out = functions.where(cond, if_true, if_false)

        self.assertEqual(if_true.data.shape, out.data.shape)

        # Each output element must come from x where the condition holds
        # and from y otherwise.
        for idx in numpy.ndindex(cond.data.shape):
            source = if_true if cond.data[idx] else if_false
            self.assertEqual(source.data[idx], out.data[idx])
예제 #42
0
    def check_forward(self, c_data, x_data, y_data):
        """Check ``F.where`` element by element on the flattened arrays."""
        cond = chainer.Variable(c_data)
        if_true = chainer.Variable(x_data)
        if_false = chainer.Variable(y_data)

        out = F.where(cond, if_true, if_false)

        self.assertEqual(if_true.data.shape, out.data.shape)

        flat = zip(cond.data.flatten(), if_true.data.flatten(),
                   if_false.data.flatten(), out.data.flatten())
        for flag, x_val, y_val, z_val in flat:
            expected = x_val if flag else y_val
            self.assertEqual(expected, z_val)
예제 #43
0
파일: model.py 프로젝트: musyoku/NLP
	def __call__(self, x, condition=None):
		"""Advance the unit by one step and return the newly computed state.

		The stored state ``self.h`` is updated only where ``condition`` is
		true (when a condition and a previous state exist). The return value
		is always the freshly computed ``h_t``, not the masked ``self.h``.
		"""
		if self.h is not None:
			h_t = sgu.DSGU.__call__(self, self.h, x)
		else:
			# First step: no previous state to combine with.
			z_t = sgu.hard_sigmoid(self.W_xz(x))
			h_t = z_t * 0.5

		if condition is None or self.h is None:
			self.h = h_t
		else:
			self.h = F.where(condition, h_t, self.h)
		return h_t
예제 #44
0
    def _attention(self, h_forward, h_backword, s, enable, disable_value):
        """Compute the attention context for the hidden state ``s``.

        Scores are computed from ``s`` and the concatenated forward/backward
        states; positions where ``enable`` is false take ``disable_value``
        before the softmax.

        Returns:
            The context, reshaped to ``(batch_size, 2 * hidden_size)``.
        """
        batch_size = s.shape[0]
        sentence_size = len(h_forward)
        hidden_size = self.hidden_size
        score_shape = (batch_size, sentence_size, hidden_size)

        # Project the decoder state and repeat it for every position.
        query = F.expand_dims(self.W_a(s), axis=1)
        weighted_s = F.broadcast_to(query, score_shape)

        # Forward and backward states joined along the default concat axis.
        forward_states = F.concat(h_forward, axis=0)
        backward_states = F.concat(h_backword, axis=0)
        h = F.concat((forward_states, backward_states))
        weighted_h = F.reshape(self.U_a(h), score_shape)

        activated = F.tanh(weighted_s + weighted_h)
        e = self.v_a(F.reshape(activated,
                               (batch_size * sentence_size, hidden_size)))
        e = F.reshape(e, (batch_size, sentence_size))
        e = F.where(enable, e, disable_value)
        alpha = F.softmax(e)

        states = F.reshape(h, (batch_size, 2 * hidden_size, sentence_size))
        c = F.batch_matmul(states, alpha)
        return F.reshape(c, (batch_size, 2 * hidden_size))
예제 #45
0
    def check_backward(self, c_data, x_data, y_data, g_data):
        """Compare the analytical gradients of ``F.where`` with numerical ones.

        Also checks that the condition receives no gradient.
        """
        c = chainer.Variable(c_data)
        x = chainer.Variable(x_data)
        y = chainer.Variable(y_data)

        z = F.where(c, x, y)
        z.grad = g_data
        z.backward()

        func = z.creator

        def run_forward():
            # Re-run the forward pass on the current raw arrays.
            return func.forward((c.data, x.data, y.data))

        gx, gy = gradient_check.numerical_grad(
            run_forward, (x_data, y.data), (g_data,))
        gradient_check.assert_allclose(gx, x.grad)
        gradient_check.assert_allclose(gy, y.grad)
        self.assertIs(c.grad, None)
예제 #46
0
파일: model.py 프로젝트: musyoku/NLP
	def __call__(self, x, condition=None):
		"""Advance the GRU by one step and return the stored hidden state.

		When ``condition`` is given and a previous state exists, only the
		entries where ``condition`` is true take the new state.
		"""
		update = self.W_z(x)
		candidate = self.W(x)
		if self.h is not None:
			# Recurrent contributions exist only after the first step.
			reset = F.sigmoid(self.W_r(x) + self.U_r(self.h))
			update += self.U_z(self.h)
			candidate += self.U(reset * self.h)
		update = F.sigmoid(update)
		candidate = F.tanh(candidate)

		h_new = update * candidate
		if self.h is not None:
			h_new += (1 - update) * self.h

		if condition is None or self.h is None:
			self.h = h_new
		else:
			self.h = F.where(condition, h_new, self.h)
		return self.h
예제 #47
0
    def __call__(self, h, adj):
        """Apply one relational graph-attention update to the atom features.

        Attention logits are computed for every pair of atoms per edge type
        and head, non-connected pairs (where ``adj`` is zero) are pushed to
        -10000, and the logits are normalised with a softmax whose scope is
        chosen by ``self.softmax_mode`` ('across' or 'within').

        Args:
            h: Atom features; unpacked as (minibatch, atom, channel).
            adj: Adjacency array or ``chainer.Variable``; reshaped to
                (minibatch, EDGE_TYPE, 1, atom, atom) after a bool cast.

        Returns:
            The updated features; heads are concatenated when
            ``self.concat_heads`` is true and averaged otherwise.

        Raises:
            ValueError: If ``self.softmax_mode`` is neither 'across' nor
                'within'.
        """
        xp = self.xp
        # (minibatch, atom, channel)
        mb, atom, ch = h.shape
        # (minibatch, atom, EDGE_TYPE * heads * out_dim)
        h = self.message_layer(h)
        # (minibatch, atom, EDGE_TYPE, heads, out_dim)
        h = functions.reshape(h, (mb, atom, self.n_edge_types, self.n_heads,
                                  self.out_channels))
        # concat all pairs of atom
        # (minibatch, 1, atom, EDGE_TYPE, heads, out_dim)
        h_i = functions.reshape(h, (mb, 1, atom, self.n_edge_types,
                                    self.n_heads, self.out_channels))
        # (minibatch, atom, atom, EDGE_TYPE, heads, out_dim)
        h_i = functions.broadcast_to(h_i, (mb, atom, atom, self.n_edge_types,
                                           self.n_heads, self.out_channels))

        # (minibatch, atom, 1, EDGE_TYPE, heads, out_dim)
        h_j = functions.reshape(h, (mb, atom, 1, self.n_edge_types,
                                    self.n_heads, self.out_channels))
        # (minibatch, atom, atom, EDGE_TYPE, heads, out_dim)
        h_j = functions.broadcast_to(h_j, (mb, atom, atom, self.n_edge_types,
                                           self.n_heads, self.out_channels))

        # (minibatch, atom, atom, EDGE_TYPE, heads, out_dim * 2)
        e = functions.concat([h_i, h_j], axis=5)

        # (minibatch, EDGE_TYPE, heads, atom, atom, out_dim * 2)
        e = functions.transpose(e, (0, 3, 4, 1, 2, 5))
        # (minibatch * EDGE_TYPE * heads, atom * atom, out_dim * 2)
        e = functions.reshape(e, (mb * self.n_edge_types * self.n_heads,
                                  atom * atom, self.out_channels * 2))
        # (minibatch * EDGE_TYPE * heads, atom * atom, 1)
        e = self.attention_layer(e)

        # (minibatch, EDGE_TYPE, heads, atom, atom)
        e = functions.reshape(e, (mb, self.n_edge_types, self.n_heads, atom,
                                  atom))
        e = functions.leaky_relu(e, self.negative_slope)

        # adj: (minibatch, EDGE_TYPE, atom, atom); cast to a boolean mask
        if isinstance(adj, chainer.Variable):
            cond = adj.array.astype(xp.bool)
        else:
            cond = adj.astype(xp.bool)
        # (minibatch, EDGE_TYPE, 1, atom, atom)
        cond = xp.reshape(cond, (mb, self.n_edge_types, 1, atom, atom))
        # (minibatch, EDGE_TYPE, heads, atom, atom)
        cond = xp.broadcast_to(cond, e.array.shape)
        # TODO(mottodora): find better way to ignore non connected
        # Non-connected pairs get a large negative logit so the softmax
        # gives them (almost) no weight.
        e = functions.where(cond, e,
                            xp.broadcast_to(xp.array(-10000), e.array.shape)
                            .astype(xp.float32))
        # In Relational Graph Attention Networks eq.(7)
        # ARGAT: take the softmax over the logits across node neighborhoods
        # irrespective of relation
        if self.softmax_mode == 'across':
            # (minibatch, heads, atom, EDGE_TYPE, atom)
            e = functions.transpose(e, (0, 2, 3, 1, 4))
            # (minibatch, heads, atom, EDGE_TYPE * atom)
            e = functions.reshape(e, (mb, self.n_heads, atom,
                                      self.n_edge_types * atom))
            # (minibatch, heads, atom, EDGE_TYPE * atom)
            alpha = functions.softmax(e, axis=3)
            # NOTE(review): this is also true for a ratio of exactly 0, so
            # dropout is always invoked for non-negative ratios.
            if self.dropout_ratio >= 0:
                alpha = functions.dropout(alpha, ratio=self.dropout_ratio)
            # (minibatch, heads, atom, EDGE_TYPE, atom)
            alpha = functions.reshape(alpha, (mb, self.n_heads, atom,
                                              self.n_edge_types, atom))
            # (minibatch, EDGE_TYPE, heads, atom, atom)
            alpha = functions.transpose(alpha, (0, 3, 1, 2, 4))

        # In Relational Graph Attention Networks eq.(6)
        # WIRGAT: take the softmax over the logits independently for each
        # relation
        elif self.softmax_mode == 'within':
            alpha = functions.softmax(e, axis=4)
            # NOTE(review): same always-true-for-zero check as above.
            if self.dropout_ratio >= 0:
                alpha = functions.dropout(alpha, ratio=self.dropout_ratio)
        else:
            raise ValueError("{} is invalid. Please use 'across' or 'within'"
                             .format(self.softmax_mode))

        # before: (minibatch, atom, EDGE_TYPE, heads, out_dim)
        # after: (minibatch, EDGE_TYPE, heads, atom, out_dim)
        h = functions.transpose(h, (0, 2, 3, 1, 4))
        # (minibatch, EDGE_TYPE, heads, atom, out_dim)
        h_new = functions.matmul(alpha, h)
        # Sum over the edge-type axis: (minibatch, heads, atom, out_dim)
        h_new = functions.sum(h_new, axis=1)
        if self.concat_heads:
            # (heads, minibatch, atom, out_dim)
            h_new = functions.transpose(h_new, (1, 0, 2, 3))
            # (minibatch, atom, heads * out_dim)
            h_new = functions.concat(h_new, axis=2)
        else:
            # Average over the heads axis: (minibatch, atom, out_dim)
            h_new = functions.mean(h_new, axis=1)
        return h_new
예제 #48
0
 def forward(self, inputs, devices):
     """Apply ``functions.where`` to a (condition, x, y) triple.

     Returns a one-element tuple holding the result; ``devices`` is unused.
     """
     condition, if_true, if_false = inputs
     selected = functions.where(condition, if_true, if_false)
     return (selected,)
예제 #49
0
def forward(src_sentence, trg_sentence, model, training=True):
    """Run the LSTM encoder-decoder on one batch.

    The inputs are expected to be word IDs already (the conversion is left
    to the caller), with an end-of-sentence symbol appended to the reference
    translation.

    Args:
        src_sentence: Sequence of source steps; each step is reshaped to
            (10, 1) int32, and -1 entries leave the encoder state unchanged.
        trg_sentence: Sequence of target steps, used only when training.
        model: Object providing the ``w_xi``, ``w_ip``, ``w_pp``, ``w_pq``,
            ``w_qj``, ``w_jy``, ``w_yq`` and ``w_qq`` layers.
        training: If true, return the accumulated loss; otherwise return the
            generated word sequence.

    Returns:
        The accumulated softmax cross-entropy when ``training`` is true,
        else a list of generated target words (at most 100).
    """
    # NOTE(review): the batch size is hard-coded as 10 in several places
    # while the masking loops use BATCH_SIZE; confirm they always agree.

    # Initial LSTM states.
    c_prev = Variable(np.zeros((10, HIDDEN_SIZE), dtype=np.float32))
    p_prev = Variable(np.zeros((10, HIDDEN_SIZE), dtype=np.float32))

    # Encoder: consume the source in reverse order.
    for word in reversed(src_sentence):
        word = np.array(word, dtype=np.int32)
        word = word.reshape(10, 1)
        x = Variable(np.array(word, dtype=np.int32))
        embedded = model.w_xi(word)
        c, p = lstm(c_prev, model.w_ip(embedded) + model.w_pp(p_prev))

        # Row-wise mask (true where the input is not the -1 padding id),
        # repeated across the hidden dimension.
        is_word = x.data.reshape(10, 1) != -1
        enable = Variable(np.repeat(is_word, HIDDEN_SIZE, axis=1))

        # Padded rows keep their previous state. Rebuilding the states from
        # raw arrays mirrors the original code (it detaches them from the
        # computational graph).
        kept_c = [where(enable[row], c[row], c_prev[row]).data
                  for row in range(BATCH_SIZE)]
        kept_p = [where(enable[row], p[row].data, p_prev[row].data).data
                  for row in range(BATCH_SIZE)]
        c_prev = Variable(np.asarray(kept_c, dtype=np.float32))
        p_prev = Variable(np.asarray(kept_p, dtype=np.float32))

    # Encoder -> decoder.
    c, q = lstm(c, model.w_pq(p))

    # Decoder.
    if training:
        # Training feeds the reference translation as y and returns the
        # accumulated loss.
        accum_loss = 0
        for word in trg_sentence:
            j = tanh(model.w_qj(q))
            y = model.w_jy(j)
            t = Variable(np.asarray(word, dtype=np.int32))
            accum_loss += softmax_cross_entropy(y, t)
            c, q = lstm(c, model.w_yq(t) + model.w_qq(q))
        return accum_loss

    # Prediction feeds the generated y back in and returns the generated
    # words. The most probable word is chosen directly from the scores, so
    # no softmax is needed.
    hyp_sentence = []
    while len(hyp_sentence) < 100:  # never generate more than 100 words
        j = tanh(model.w_qj(q))
        y = model.w_jy(j)
        word = y.data.argmax(1)[0]
        if word == END_OF_SENTENCE:
            break  # the end-of-sentence symbol was generated
        hyp_sentence.append(convert_to_your_trg_str(word))
        # The LSTM takes a single summed input, as in the training branch
        # (the original passed the two projections as separate arguments).
        c, q = lstm(c, model.w_yq(y) + model.w_qq(q))
    return hyp_sentence
예제 #50
0
def forward(model, batch, num_samples, word_keep_rate, UNK, train=True):
    """Encode a batch, compute the KL term and decode ``num_samples`` times.

    Args:
        model: Model exposing ``encode``, ``infer``, ``decode``,
            ``set_by_sample`` / ``set_by_MLE`` and the per-layer
            ``hmus`` / ``hsigmas`` / ``cmus`` / ``csigmas``.
        batch: 2-D array of word ids; column ``i`` is the input and column
            ``i + 1`` the target of step ``i``. -1 marks padding.
        num_samples: Number of decoding passes.
        word_keep_rate: A step's input words are replaced by ``UNK`` when a
            uniform random draw exceeds this value.
        UNK: Id of the unknown-word token.
        train: Selects sampling (true) or MLE (false) for the latent state
            and controls what is returned.

    Returns:
        ``(KL, cross_entropies)`` when training, otherwise
        ``(KL, (cross_entropies, ys, ts))`` with the predictions and targets
        moved to the CPU.
    """
    batch_size = batch.shape[0]

    xp = model.xp
    use_gpu = xp is cuda.cupy
    if use_gpu:
        batch = cuda.to_gpu(batch)

    model.reset_state()
    model.zerograds()

    # Encode every column except the last one.
    batch_length = len(batch[0]) - 1
    for i in range(batch_length):
        model.encode(Variable(batch[:, i]), train=train)

    # Infer q(z|x).
    model.infer(train=train)

    # KL term, accumulated over the hidden (h) and cell (c) parameters of
    # every layer.
    KL = 0
    for i in range(model.num_layers):
        for mu, sigma in ((model.hmus[i], model.hsigmas[i]),
                          (model.cmus[i], model.csigmas[i])):
            KL += -F.sum((1 + 2 * F.log(sigma) - sigma * sigma - mu * mu) / 2)
    KL /= batch_size

    # Draw and decode.
    cross_entropies = []
    if not train:
        ys, ts = [], []

    UNKs = np.full(batch_size, UNK, dtype=np.int32)
    if use_gpu:
        UNKs = cuda.to_gpu(UNKs)

    for _ in range(num_samples):
        if train:
            model.set_by_sample(train=train)
        else:
            model.set_by_MLE(train=train)

        cross_entropy = 0
        sample_ys, sample_ts = [], []
        for i in range(batch_length):
            next_w = Variable(batch[:, i + 1])

            # Word dropout: replace the real words of this step with UNK
            # while leaving padding (-1) untouched. The original had the two
            # branches swapped, which kept the words and turned padding
            # into UNK.
            masked_w = batch[:, i]
            if np.random.uniform() > word_keep_rate:
                enable = (masked_w != -1)
                masked_w = F.where(enable, UNKs, masked_w)

            y = model.decode(masked_w, train=train)
            cross_entropy += F.softmax_cross_entropy(y, next_w)
            if not train:
                sample_ys.append(xp.argmax(y.data, axis=1))
                sample_ts.append(next_w.data)

        cross_entropies.append(cross_entropy)

        if not train:
            # One row per batch element, one column per step.
            stacked_ys = xp.vstack(sample_ys).T
            stacked_ts = xp.vstack(sample_ts).T
            if use_gpu:
                stacked_ys = cuda.to_cpu(stacked_ys)
                stacked_ts = cuda.to_cpu(stacked_ts)
            ys.append(stacked_ys)
            ts.append(stacked_ts)

    if train:
        return (KL, cross_entropies)

    assert(len(cross_entropies) == 1 and len(ys) == 1 and len(ts) == 1)
    return (KL, (cross_entropies, ys, ts))
예제 #51
0
def softmax(x, mask, zero_pad, axis):
    """Masked softmax: exponentiate the log-softmax and fill masked-out
    positions with ``zero_pad``."""
    log_probs = logsoftmax_no_mask(x, mask, zero_pad, axis)
    probs = F.exp(log_probs)
    return F.where(mask, probs, zero_pad)
예제 #52
0
def logsoftmax(x, mask, zero_pad, axis):
    """Masked log-softmax: positions where ``mask`` is false take
    ``zero_pad``."""
    log_probs = logsoftmax_no_mask(x, mask, zero_pad, axis)
    return F.where(mask, log_probs, zero_pad)
예제 #53
0
def logsumexp(x, mask, zero_pad, axis):
    """Masked log-sum-exp along ``axis``: positions where ``mask`` is false
    contribute ``zero_pad`` instead of ``exp(x)``."""
    exp_x = F.exp(x)
    masked_exp = F.where(mask, exp_x, zero_pad)
    total = F.sum(masked_exp, axis=axis)
    return F.log(total)