def argmax_crf1d(cost, xs):
    """Decode the best-scoring label sequence of a linear-chain CRF (Viterbi).

    Args:
        cost: Transition scores. It is broadcast against ``alpha[..., None]``,
            so for ``(B, K)`` inputs it acts as a ``(K, K)`` matrix where
            ``cost[i, j]`` is added when moving from label ``i`` to label
            ``j``.
        xs: Non-empty sequence of per-step score matrices; ``xs[t].shape[0]``
            is the number of sequences still running at step ``t``. Only a
            shrinking batch is handled, so sequences are expected to be
            ordered longest first.

    Returns:
        A pair ``(score, path)``. ``score`` holds the best path score of every
        sequence in ``xs[0]``; ``path`` is a list with one index array per
        step, ``path[t]`` having as many entries as ``xs[t]`` has rows.
    """
    alpha = xs[0]
    finished = []      # per step: alpha rows of sequences that just ended, or None
    backpointers = []  # per step: best previous label for every current label

    # Forward pass.
    for x in xs[1:]:
        n_active = x.shape[0]
        if alpha.shape[0] > n_active:
            # Some sequences ended at the previous step: park their scores.
            alpha, ended = split_axis.split_axis(alpha, [n_active], axis=0)
        else:
            ended = None
        finished.append(ended)

        wide_alpha, wide_cost = broadcast.broadcast(alpha[..., None], cost)
        candidates = wide_alpha + wide_cost
        backpointers.append(minmax.argmax(candidates, axis=1))
        alpha = minmax.max(candidates, axis=1) + x

    # Backward pass: follow the back-pointers from the last step, re-attaching
    # the sequences that ended early at the step where they ended.
    current = minmax.argmax(alpha, axis=1)
    path = [current.data]
    for pointers, ended in zip(reversed(backpointers), reversed(finished)):
        current = select_item.select_item(pointers, current)
        if ended is not None:
            current = concat.concat(
                [current, minmax.argmax(ended, axis=1)], axis=0)
        path.append(current.data)
    path.reverse()

    # Scores are concatenated in the same order, restoring the xs[0] ordering.
    score = minmax.max(alpha, axis=1)
    for ended in reversed(finished):
        if ended is not None:
            score = concat.concat(
                [score, minmax.max(ended, axis=1)], axis=0)
    return score, path
def argmax_crf1d(cost, xs):
    """Computes the highest-scoring label sequence of a linear-chain CRF.

    The Viterbi recursion is run over ``xs``; mini-batches may shrink along
    the sequence, which allows sequences of different lengths in one call.

    Args:
        cost (Variable): A :math:`K \\times K` matrix of transition scores
            between labels, where :math:`K` is the number of labels.
            ``cost[i, j]`` is added when moving from label ``i`` to label
            ``j``.
        xs (list of Variable): Per-step label scores. ``len(xs)`` is the
            length of the longest sequence and each
            :class:`~chainer.Variable` holds a :math:`B \\times K` matrix,
            where :math:`B` is the mini-batch size at that step. The
            :math:`B`\\ s are not necessarily the same for all steps; only a
            shrinking batch is handled, so sequences are expected to be
            ordered longest first.

    Returns:
        tuple: A tuple of a :class:`~chainer.Variable` ``s`` and a
        :class:`list` ``ps``.
        ``s`` has shape ``(B,)``, where ``B`` is the mini-batch size of
        ``xs[0]``, and ``s[i]`` is the score of the best label sequence for
        the i-th input.
        ``ps`` is a list of index arrays (the ``.data`` of the argmax
        results, i.e. :class:`numpy.ndarray` or :class:`cupy.ndarray`)
        holding the best label at every step. ``len(ps)`` equals
        ``len(xs)`` and ``ps[i].shape == xs[i].shape[0:1]``.

    """
    alpha = xs[0]
    parked = []    # alpha rows of sequences that ended at each step (or None)
    pointers = []  # back-pointers produced at each step

    for x in xs[1:]:
        batch = x.shape[0]
        rest = None
        if alpha.shape[0] > batch:
            # Keep only the still-running sequences in alpha.
            alpha, rest = split_axis.split_axis(alpha, [batch], axis=0)
        parked.append(rest)

        b_alpha, b_cost = broadcast.broadcast(alpha[..., None], cost)
        step_scores = b_alpha + b_cost
        pointers.append(minmax.argmax(step_scores, axis=1))
        alpha = minmax.max(step_scores, axis=1) + x

    # Trace the best path backwards from the final step.
    labels = minmax.argmax(alpha, axis=1)
    path = [labels.data]
    for back, rest in zip(pointers[::-1], parked[::-1]):
        labels = select_item.select_item(back, labels)
        if rest is not None:
            # Sequences that ended here start their path at their own argmax.
            ended_labels = minmax.argmax(rest, axis=1)
            labels = concat.concat([labels, ended_labels], axis=0)
        path.append(labels.data)
    path.reverse()

    # Collect the final score of every sequence, including the parked ones.
    score = minmax.max(alpha, axis=1)
    for rest in parked[::-1]:
        if rest is None:
            continue
        ended_score = minmax.max(rest, axis=1)
        score = concat.concat([score, ended_score], axis=0)

    return score, path
def argmax_crf1d(cost, xs):
    """Decode the best-scoring label sequence of a linear-chain CRF (Viterbi).

    This variant does no batch splitting, so every element of ``xs`` must be
    compatible with the running score matrix (same batch size at each step).

    Args:
        cost: Transition scores, broadcast against ``alpha[..., None]``; for
            ``(B, K)`` inputs it acts as a ``(K, K)`` matrix where
            ``cost[i, j]`` is added when moving from label ``i`` to ``j``.
        xs: Non-empty sequence of per-step score matrices.

    Returns:
        A pair ``(score, path)``: the best path score per batch row, and a
        list with one array of best label indices per step.
    """
    alpha = xs[0]
    backpointers = []

    # Forward pass: keep the best predecessor of every label at every step.
    for x in xs[1:]:
        wide_alpha, wide_cost = broadcast.broadcast(alpha[..., None], cost)
        candidates = wide_alpha + wide_cost
        backpointers.append(minmax.argmax(candidates, axis=1))
        alpha = minmax.max(candidates, axis=1) + x

    # Backward pass: start at the best final label and follow the pointers.
    current = minmax.argmax(alpha, axis=1)
    path = [current.data]
    for pointers in backpointers[::-1]:
        current = select_item.select_item(pointers, current)
        path.append(current.data)
    path.reverse()

    best_score = minmax.max(alpha, axis=1)
    return best_score, path
def predict_labels(self, m, h, xp=np):
    """Score labels from the concatenated ``m``/``h`` features and pick the best.

    Args:
        m: First feature matrix; concatenated with ``h`` along axis 1.
        h: Second feature matrix.
        xp: Array module in use; when it is ``cuda.cupy`` the predictions are
            copied to the CPU.

    Returns:
        A pair ``(scores, yl)``: the output of ``self.mlp_label`` and the
        per-row argmax with a ``-1`` entry inserted at index 0.
    """
    features = F.concat((m, h), 1)
    scores = self.mlp_label(features, per_element=False)

    best = minmax.argmax(scores, axis=1).data
    if xp is cuda.cupy:
        best = cuda.to_cpu(best)

    # Prepend -1 so the result is one entry longer than the scored rows
    # (presumably a placeholder for a leading dummy/root token - confirm).
    yl = np.insert(best, 0, np.int32(-1))
    return scores, yl
def predict_pos(self, w, xp=np):
    """Embed ``w``, score it with ``self.mlp_pos`` and pick the best class.

    Args:
        w: Input passed to ``self.unigram_embed``.
        xp: Array module in use; when it is ``cuda.cupy`` the predictions are
            copied to the CPU.

    Returns:
        A pair ``(scores, yp)``: the output of ``self.mlp_pos`` and the
        per-row argmax with a ``-1`` entry inserted at index 0.
    """
    embedded = self.unigram_embed(w)
    scores = self.mlp_pos(embedded, per_element=False)

    best = minmax.argmax(scores, axis=1).data
    if xp is cuda.cupy:
        best = cuda.to_cpu(best)

    # Prepend -1 so the result is one entry longer than the scored rows
    # (presumably a placeholder for a leading dummy/root token - confirm).
    yp = np.insert(best, 0, np.int32(-1))
    return scores, yp
def predict_arcs(self, m, h, train=True, xp=np):
    """Score arcs with the biaffine layer and greedily pick a head per row.

    Args:
        m: First input of ``self.biaffine_arc`` (dropout is applied first).
        h: Second input of ``self.biaffine_arc`` (dropout is applied first).
        train: Currently unused; it was only read by a cycle-detection / MST
            decoding step that is disabled.
        xp: Array module in use; it is forwarded to ``masking_matrix`` and,
            when it is ``cuda.cupy``, the predictions are copied to the CPU.

    Returns:
        A pair ``(scores, yh)``: the masked arc scores and the per-row argmax
        with ``self.n_dummy`` copies of ``constants.NO_PARENTS_ID`` prepended.
    """
    dropped_m = F.dropout(m, self.pred_layers_dropout)
    dropped_h = F.dropout(h, self.pred_layers_dropout)
    arc_scores = self.biaffine_arc(dropped_m, dropped_h)
    scores = arc_scores + masking_matrix(len(m), self.n_dummy, xp=xp)

    heads = minmax.argmax(scores, axis=1).data
    if xp is cuda.cupy:
        heads = cuda.to_cpu(heads)

    # NOTE: heads are chosen greedily. A fallback that ran detect_cycle() on
    # the prediction and re-decoded with mst(scores) when not training used to
    # live here but is disabled, so the result is not guaranteed to be a tree.

    # The dummy positions have no parent: prepend one marker per dummy.
    for _ in range(self.n_dummy):
        heads = np.insert(heads, 0, np.int32(constants.NO_PARENTS_ID))
    return scores, heads
def act_and_merge_features(self, xs, ws, vs, ms, gcs=None, get_att_score=False):
    """Build a chunk summary vector per row of each ``x`` and append it to ``x``.

    For every tuple ``(x, w, v, mask)`` taken in lockstep from ``xs``, ``ws``,
    ``vs`` and ``ms`` a summary matrix ``a`` is computed and concatenated to
    ``x`` along axis 1. How ``a`` is computed depends on ``self.use_attention``,
    ``self.use_average``, ``self.use_concat`` and ``self.chunk_pooling_type``:

    * ``w`` and ``v`` both ``None``: ``a`` is an all-zero matrix of width
      ``self.chunk_embed_out_dim``.
    * ``self.use_average``: ``a = w_weight @ w`` where ``w_weight`` is either
      a masked softmax over biaffine scores (attention) or
      ``self.normalize(mask[0])`` (``constants.AVG``).
    * otherwise: ``a`` is the element-wise product of the concatenated ``v``
      with a weight built from ``mask[2]`` (and, for ``constants.WCON``, from
      the attention weights looked up through ``mask[3]``).

    Args:
        xs: Sequence of feature matrices; ``xs[0]`` decides the array module.
        ws: Sequence of chunk embedding matrices (entries may be ``None``).
        vs: Sequence of chunk embeddings that are concatenated along axis 1
            in the concat branch (entries may be ``None``).
        ms: Sequence of mask tuples. ``mask[0]`` is always read; ``mask[1]``,
            ``mask[2]`` and ``mask[3]`` are read only by the attention,
            concat and ``WCON`` branches respectively.
        gcs: Optional per-item sequence; its items are unpacked as ``gc`` but
            never used in this method.
        get_att_score: If true, ``ass`` is returned as a fourth element.

    Returns:
        ``(closs, pcs, hs)`` or, with ``get_att_score``,
        ``(closs, pcs, hs, ass)``.
        ``closs`` is initialised to 0 and never updated here, so it is always
        returned as ``None``. ``pcs`` holds one array of predicted chunk
        indices for each item without chunks (all zeros) and, when
        ``self.use_attention`` is set, for each item with chunks. ``hs`` holds
        one concatenated feature matrix per item. ``ass`` is always an empty
        list because the code that would fill it is commented out.
    """
    hs = []
    pcs = []
    ass = []  # attention scores (never filled; see the disabled block below)

    xp = cuda.get_array_module(xs[0])
    closs = chainer.Variable(xp.array(0, dtype='f'))

    if gcs is None:
        gcs = [None] * len(xs)
    for x, w, v, gc, mask in zip(xs, ws, vs, gcs, ms):
        if w is None and v is None:  # no words were found for devel/test data
            # Fall back to a zero summary vector and chunk index 0 for every row.
            a = xp.zeros((len(x), self.chunk_embed_out_dim), dtype='f')
            pc = np.zeros(len(x), 'i')
            pcs.append(pc)
            h = F.concat((x, a), axis=1)  # (n, dt) @ (n, dc) => (n, dt+dc)
            hs.append(h)
            continue

        if w is not None:
            w = F.dropout(w, self.embed_dropout)

        ## calculate weight for w

        mask_ij = mask[0]
        if self.use_attention:  # wavg or wcon
            mask_i = mask[1]
            w_scores = self.biaffine(
                F.dropout(x, self.biaffine_dropout),
                F.dropout(w, self.biaffine_dropout))  # (n, m)
            w_scores = w_scores + mask_ij  # a masked element becomes 0 after softmax operation
            w_weight = F.softmax(w_scores)
            w_weight = w_weight * mask_i  # a row for a char without candidate words becomes a 0 vector

        elif self.chunk_pooling_type == constants.AVG:
            w_weight = self.normalize(mask_ij, xp=xp)

        # Row-wise dropout of the chunk weights: each row is zeroed as a whole
        # with probability chunk_vector_dropout (one np.random.rand() per row).
        if not self.use_concat and self.chunk_vector_dropout > 0:
            mask_drop = xp.ones(w_weight.shape, dtype='f')
            for i in range(w_weight.shape[0]):
                if self.chunk_vector_dropout > np.random.rand():
                    mask_drop[i] = xp.zeros(w_weight.shape[1], dtype='f')
            w_weight = w_weight * mask_drop

        ## calculate weight for v

        if self.use_concat:
            mask_ik = mask[2]
            n = x.shape[0]
            wd = self.chunk_embed_dim_merged  # alternative noted by the author: w.shape[1]

            if self.chunk_pooling_type == constants.WCON:
                # Gather, for every row i, the attention weights selected by
                # the index table ikj_table[i].
                ikj_table = mask[3]
                v_weight0 = F.concat(
                    [
                        F.expand_dims(  # (n, m) -> (n, k)
                            F.get_item(w_weight[i], ikj_table[i]),
                            axis=0) for i in range(n)
                    ],
                    axis=0)
                v_weight0 *= mask_ik

            else:
                v_weight0 = mask_ik

            # Expand every weight to the width of one chunk embedding (wd).
            v_weight = F.transpose(v_weight0)  # (n, k) -> (k, n)
            v_weight = F.expand_dims(v_weight, 2)  # -> (k, n, 1)
            v_weight = F.broadcast_to(
                v_weight, (self.chunk_concat_num, n, wd))  # (k,n,wd)
            # NOTE(review): the original comment here said (k, n*wd), but
            # concatenating the k slices of shape (n, wd) along axis 1 looks
            # like it yields (n, k*wd), which also matches the per-row dropout
            # below - confirm.
            v_weight = F.concat(v_weight, axis=1)

            # Same row-wise dropout as above, applied to the concat weights.
            if self.chunk_vector_dropout > 0:
                mask_drop = xp.ones(v_weight.shape, dtype='f')
                for i in range(v_weight.shape[0]):
                    if self.chunk_vector_dropout > np.random.rand():
                        mask_drop[i] = xp.zeros(v_weight.shape[1], dtype='f')
                v_weight *= mask_drop

        ## calculate summary vector a
        if self.use_average:  # avg or wavg
            a = F.matmul(w_weight, w)  # (n, m) * (m, dc) => (n, dc)

        else:  # con or wcon
            v = F.concat(v, axis=1)
            a = v * v_weight

        ## get predicted (attended) chunks
        if self.use_attention:  # wavg or wcon
            if self.chunk_pooling_type == constants.WAVG:
                weight = w_weight
            else:
                weight = v_weight0
            pc = minmax.argmax(weight, axis=1).data
            if xp is cuda.cupy:
                pc = cuda.to_cpu(pc)
            pcs.append(pc)

            # Disabled: collecting the attention scores. While this stays
            # commented out, `ass` is returned empty.
            # if get_att_score:
            #     ascore = minmax.max(weight, axis=1).data
            #     ass.append(ascore)

        h = F.concat((x, a), axis=1)  # (n, dt) @ (n, dc) => (n, dt+dc)
        hs.append(h)

    # closs is never accumulated above, so this always takes the first branch.
    if closs.data == 0:
        closs = None
    else:
        closs /= len(xs)

    if get_att_score:
        return closs, pcs, hs, ass
    else:
        return closs, pcs, hs