def biaffine(self, dep_arc, dep_rel, head_arc, head_rel, mask, arc_targets, rel_targets, blend):
    """Score arcs and relations with bilinear classifiers.

    Arc logits come from ``bilinear`` with a single output, and an arc is
    predicted wherever its logit is positive. Relation logits come from a
    second ``bilinear`` call with ``self._vocab.rel_size`` outputs.

    Args:
        dep_arc, head_arc: inputs of the arc scorer (passed to ``bilinear``).
        dep_rel, head_rel: inputs of the relation scorer (passed to ``bilinear``).
        mask: array read as ``(seq_len, batch_size)`` via ``mask.shape``; it is
            combined with NumPy operations, so it is presumably a NumPy array
            -- TODO confirm against the caller.
        arc_targets: gold arc indicators, or ``None`` when only predictions
            are wanted outside training.
        rel_targets: gold relation ids, used only when targets are available.
        blend: optional term added to the arc logits; skipped when ``None``.

    Returns:
        In training mode, ``(arc_accuracy, rel_accuracy, loss)``.
        Otherwise a list with one ``(arc_pred, arc_pred * rel_pred)`` tuple per
        sentence, each trimmed to that sentence's length.
    """
    is_train = autograd.is_training()
    # Accuracy and loss are computed in training, or whenever gold arcs exist.
    has_targets = is_train or arc_targets is not None
    batch_size = mask.shape[1]
    seq_len = mask.shape[0]
    num_cells = seq_len * seq_len * batch_size

    W_arc = self.arc_W.data()
    arc_logits: nd.NDArray = bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size, seq_len, batch_size,
                                      num_outputs=1, bias_x=True, bias_y=False)
    if blend is not None:
        arc_logits = arc_logits + blend
    # (#head x #dep) x batch_size
    flat_arc_logits = reshape_fortran(arc_logits, (seq_len, seq_len * batch_size))
    # (#head ) x (#dep x batch_size)
    arc_preds = nd.greater(arc_logits, 0)  # sigmoid y > 0.5 when x > 0

    if has_targets:
        arc_correct = arc_preds.asnumpy() * arc_targets
        arc_accuracy = np.sum(arc_correct) / np.sum(arc_targets * mask)
        flat_arc_targets = reshape_fortran(arc_targets, (seq_len, seq_len * batch_size))
        losses = self.binary_ce_loss(flat_arc_logits, nd.array(flat_arc_targets))
        mask_1D_tensor = nd.array(flatten_numpy(mask))
        arc_loss = nd.sum(losses * mask_1D_tensor) / mask_1D_tensor.sum()

    W_rel = self.rel_W.data()
    rel_logits: nd.NDArray = bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size, seq_len, batch_size,
                                      num_outputs=self._vocab.rel_size, bias_x=True, bias_y=True)
    # #head x rel_size x #dep x batch_size
    flat_rel_logits = reshape_fortran(rel_logits.transpose([1, 0, 2, 3]),
                                      (self._vocab.rel_size, num_cells))
    # rel_size x (#head x #dep x batch_size)

    if has_targets:
        # Relations are only scored on cells that are unmasked gold arcs.
        gold_arc_mask = arc_targets * mask
        mask_rel: nd.NDArray = reshape_fortran(nd.array(mask * arc_targets), (1, num_cells))
        flat_rel_preds = flat_rel_logits.argmax(0)
        flat_rel_target = nd.array(reshape_fortran(rel_targets, (1, num_cells))).squeeze(axis=0)
        rel_correct = nd.equal(flat_rel_preds, flat_rel_target).asnumpy()
        rel_correct = rel_correct * flatten_numpy(gold_arc_mask)
        rel_accuracy = np.sum(rel_correct) / np.sum(gold_arc_mask)
        losses = self.softmax_loss(flat_rel_logits, flat_rel_target)
        rel_loss = nd.sum(losses * mask_rel) / mask_rel.sum()
        loss = arc_loss + rel_loss

    if is_train:
        return arc_accuracy, rel_accuracy, loss

    outputs = []
    rel_preds = rel_logits.transpose([1, 0, 2, 3]).argmax(0)
    # Move the batch axis first so the loop below walks sentence by sentence.
    arc_preds = arc_preds.transpose([2, 0, 1])
    rel_preds = rel_preds.transpose([2, 0, 1])
    for msk, arc_pred, rel_pred in zip(np.transpose(mask), arc_preds, rel_preds):
        # parse sentences one by one
        # Position 0 always counts towards the length. The previous code wrote
        # `msk[0] = 1.` through the transposed view, which silently modified
        # the caller's mask; counting it arithmetically gives the same length.
        sent_len = int(np.sum(msk[1:])) + 1
        arc_pred = arc_pred[:sent_len, :sent_len]
        outputs.append((arc_pred, arc_pred * rel_pred[:sent_len, :sent_len]))
    return outputs
def balance_sampler(samples):
    """Return a keep-mask that randomly discards surplus negative samples.

    Entries of ``samples`` equal to 1 are always kept. Every other entry is
    kept when a uniform draw exceeds the drop probability computed along
    axis 0 as ``(num_neg - num_pos) / num_neg``, floored at zero.
    """
    positives = nd.sum(samples == 1, axis=0)
    negatives = nd.sum(samples == 0, axis=0)

    # Share of negatives that exceeds the positive count; negative ratios
    # (more positives than negatives) mean nothing has to be dropped.
    surplus_ratio = (negatives - positives) / negatives
    drop_prob = nd.where(nd.lesser(surplus_ratio, 0),
                         nd.zeros_like(surplus_ratio),
                         surplus_ratio)

    noise = nd.random.uniform(0, 1, shape=samples.shape, ctx=samples.context)
    survives = nd.greater(noise, drop_prob)
    keep = nd.where(survives, nd.ones_like(samples), nd.zeros_like(samples))

    # Positives bypass the random draw entirely.
    return nd.where(nd.equal(samples, 1), samples, keep)
def batch_process(seq, ctx):
    """Cut a batch of sequences into sliding regions and build a padding mask.

    Relies on the module-level ``max_sequence_length``, ``region_radius``,
    ``region_size`` and ``batch_size``.

    Returns:
        ``(aligned_seq, trimed_seq, mask)`` where ``aligned_seq`` holds one
        ``region_size``-wide window per retained position (position-major),
        ``trimed_seq`` is the batch with ``region_radius`` columns removed on
        each side, and ``mask`` flags entries of ``trimed_seq`` greater than 0,
        broadcast to a last axis of size 128.
    """
    tokens = np.array(seq)
    num_regions = max_sequence_length - 2 * region_radius

    # One window per retained position: window k starts at column k.
    windows = np.zeros((num_regions, batch_size, region_size))
    for start in range(num_regions):
        windows[start] = tokens[:, start:start + region_size]
    aligned_seq = nd.array(windows, ctx)

    full_batch = nd.array(tokens, ctx)
    trimed_seq = full_batch[:, region_radius:max_sequence_length - region_radius]

    non_zero = nd.greater(trimed_seq, 0).reshape((batch_size, -1, 1))
    mask = nd.broadcast_axes(non_zero, axis=2, size=128)
    return aligned_seq, nd.array(trimed_seq, ctx), mask
def forward(self, src_idx, tgt_idx):
    """Run the encoder-decoder on a source/target index batch.

    Args:
        src_idx: source token indices; axis 0 is read as the batch size and
            axis 1 as the sequence length.
        tgt_idx: target token indices, read the same way.

    Returns:
        The decoder output after the final ``self.linear`` projection.
    """
    # compute encoder mask
    key_mask = self._get_key_mask(src_idx, src_idx, pad_idx=self.src_pad_idx)
    src_non_pad_mask = self._get_non_pad_mask(src_idx, pad_idx=self.src_pad_idx)

    # compute decoder mask
    self_tril_mask = self._get_self_tril_mask(tgt_idx)
    self_key_mask = self._get_key_mask(tgt_idx, tgt_idx, pad_idx=self.tgt_pad_idx)
    # Set only where the key mask and the triangular mask sum to more than 1.
    self_att_mask = nd.greater((self_key_mask + self_tril_mask), 1)
    context_att_mask = self._get_key_mask(src_idx, tgt_idx, pad_idx=self.src_pad_idx)
    tgt_non_pad_mask = self._get_non_pad_mask(tgt_idx, pad_idx=self.tgt_pad_idx)

    # Encoder
    position = nd.array(self._position_encoding_init(src_idx.shape[1], self._model_dim),
                        ctx=src_idx.context)
    position = nd.expand_dims(position, axis=0)
    # Broadcast over the *source* batch: this tensor is multiplied with the
    # source mask, so sizing it from tgt_idx only worked while both batches
    # happened to have the same size.
    position = nd.broadcast_axes(position, axis=0, size=src_idx.shape[0])
    position = position * src_non_pad_mask
    src_emb = self.embedding(src_idx)
    enc_output = self.encoder(src_emb, position, key_mask, src_non_pad_mask)

    # Decoder
    # The decoder table is combined with target-side tensors, so it is
    # created on the target's context.
    position = nd.array(self._position_encoding_init(tgt_idx.shape[1], self._model_dim),
                        ctx=tgt_idx.context)
    position = nd.expand_dims(position, axis=0)
    position = nd.broadcast_axes(position, axis=0, size=tgt_idx.shape[0])
    position = position * tgt_non_pad_mask
    tgt_emb = self.embedding(tgt_idx)
    outputs = self.decoder(enc_output, tgt_emb, position, self_att_mask,
                           context_att_mask, tgt_non_pad_mask)
    outputs = self.linear(outputs)
    return outputs
def forward(self, pred, target):
    """Compute the sampled rank-weighted loss for a batch.

    For every positive label (``target[i, j] == 1``) negative labels of the
    same row are sampled until one scores at least as high as the positive
    or ``label_size - 1`` attempts have been made. The number of attempts
    selects an entry of ``self.rank_weights`` which weights that label.

    Args:
        pred: 2-D score array, indexed ``[sample, label]``.
        target: 2-D label array of the same layout; 1 marks a positive and
            0 a negative label.

    Returns:
        The summed weighted loss. ``L`` and the positive/negative masks are
        stored through ``self.save_for_backward``.
    """
    batch_size = target.shape[0]
    label_size = target.shape[1]
    rank_weights = self.rank_weights
    max_num_trials = label_size - 1

    pos_mask = nd.greater(target, 0).asnumpy()
    neg_mask = nd.equal(target, 0).asnumpy()

    # Sampling only needs plain numbers; copying once avoids per-element
    # NDArray reads inside the loops.
    target_np = target.asnumpy()
    pred_np = pred.asnumpy()

    L = nd.zeros_like(pred)
    for i in range(batch_size):
        # The negatives of a row do not change between attempts, so the
        # candidate list is built once per row.
        neg_labels_idx = np.flatnonzero(target_np[i] == 0)
        for j in range(label_size):
            if target_np[i, j] != 1:
                continue
            sample_score_margin = -1
            num_trials = 0
            while sample_score_margin < 0 and num_trials < max_num_trials:
                if len(neg_labels_idx) == 0:
                    # Nothing to sample from. The previous code only reset
                    # the counter here and therefore looped forever whenever
                    # label_size > 2.
                    num_trials = 1
                    break
                neg_idx = np.random.choice(neg_labels_idx, replace=False)
                sample_score_margin = pred_np[i, neg_idx] - pred_np[i, j]
                num_trials += 1
            # With a single label no attempt is made at all; avoid dividing
            # by zero below.
            num_trials = max(num_trials, 1)
            # how many trials determine the weight
            r_j = int(np.floor(max_num_trials / num_trials))
            L[i, j] = rank_weights[r_j]

    pos = nd.array(pos_mask).as_in_context(pred.context)
    neg = nd.array(neg_mask).as_in_context(pred.context)
    loss = nd.sum(L * nd.sum(1 - pos * pred + neg * pred, axis=1, keepdims=True))
    self.save_for_backward(L, pos_mask, neg_mask)
    return loss
def batch_process(seq, isContextWord, ctx):
    """Cut a batch of sequences into sliding regions and build a padding mask.

    Relies on the module-level ``max_sequence_length``, ``region_radius``,
    ``region_size``, ``batch_size`` and ``vocab_size``.

    When ``isContextWord`` is truthy the regions are transposed to
    batch-major order and slot ``k`` of every region is shifted by
    ``k * vocab_size``.

    Returns:
        ``(aligned_seq, trimed_seq, mask)``; ``mask`` flags entries of the
        trimmed batch greater than 0, broadcast to a last axis of size 128.
    """
    tokens = np.array(seq)
    num_regions = max_sequence_length - 2 * region_radius

    # One window per retained position: window k starts at column k.
    windows = np.zeros((num_regions, batch_size, region_size))
    for start in range(num_regions):
        windows[start] = tokens[:, start:start + region_size]

    if isContextWord:
        # Give every slot inside a region its own id range.
        slot_offsets = np.array([slot * vocab_size for slot in range(region_size)])
        windows = windows.transpose((1, 0, 2)) + slot_offsets
    aligned_seq = nd.array(windows, ctx)

    full_batch = nd.array(tokens, ctx)
    trimed_seq = full_batch[:, region_radius:max_sequence_length - region_radius]

    non_zero = nd.greater(trimed_seq, 0).reshape((batch_size, -1, 1))
    mask = nd.broadcast_axes(non_zero, axis=2, size=128)
    return aligned_seq, nd.array(trimed_seq, ctx), mask
def get_max_pred(batch_heatmaps):
    """Locate the peak of every heatmap in a batch.

    ``batch_heatmaps`` is indexed as (batch, joint, row, column).

    Returns:
        ``(preds, maxvals)``: ``preds`` has a last axis of size 2 holding the
        column and row of each peak (zeroed where the peak value is not
        positive); ``maxvals`` holds the peak values with a trailing axis of
        size 1.
    """
    n_batch, n_joints = batch_heatmaps.shape[0], batch_heatmaps.shape[1]
    width = batch_heatmaps.shape[3]
    column_shape = (n_batch, n_joints, 1)

    flat = batch_heatmaps.reshape((n_batch, n_joints, -1))
    peak_index = nd.argmax(flat, 2).reshape(column_shape)
    maxvals = nd.max(flat, 2).reshape(column_shape)

    # Split the flat index into (column, row) coordinates.
    peak_index = peak_index.astype(np.float32)
    col = peak_index % width
    row = nd.floor(peak_index / width)
    preds = nd.concat(col, row, dim=2)

    # Coordinates of non-positive peaks are zeroed out.
    positive = nd.tile(nd.greater(maxvals, 0.0), (1, 1, 2)).astype(np.float32)
    preds = preds * positive
    return preds, maxvals
def forward(self, en_bert_output, en_idx, ch_idx):
    """Decode ``ch_idx`` against ``en_bert_output`` and project the result.

    Args:
        en_bert_output: encoder-side representation handed to the decoder.
        en_idx: source indices, used only to build the context mask.
        ch_idx: target indices; axis 0 is read as batch size, axis 1 as length.

    Returns:
        The decoder output after ``self.linear``.
    """
    # Self-attention mask: set only where the key mask and the triangular
    # mask sum to more than 1.
    tril_mask = self._get_self_tril_mask(ch_idx)
    pad_key_mask = self._get_key_mask(ch_idx, ch_idx, pad_idx=self.ch_pad_idx)
    self_att_mask = nd.greater(pad_key_mask + tril_mask, 1)

    context_att_mask = self._get_key_mask(en_idx, ch_idx, pad_idx=self.en_pad_idx)
    non_pad_mask = self._get_non_pad_mask(ch_idx, pad_idx=self.ch_pad_idx)

    # Positional table, repeated for every batch entry and multiplied by
    # the non-pad mask.
    table = self._position_encoding_init(ch_idx.shape[1], self._model_dim)
    position = nd.expand_dims(nd.array(table, ctx=self._ctx), axis=0)
    position = nd.broadcast_axes(position, axis=0, size=ch_idx.shape[0]) * non_pad_mask

    ch_emb = self.ch_embedding(ch_idx)
    decoded = self.decoder(en_bert_output, ch_emb, position,
                           self_att_mask, context_att_mask, non_pad_mask)
    return self.linear(decoded)
# test for Lsep loss function implementaion Lsep_func = LSEP_funcLoss() if use_identity: pred = nd.array([[0.9, 0.4, 0.5, 0.2], [0.1, 0.6, 0.2, 0.8]]) target = nd.array([[1, 1, 0, 0], [0, 1, 0, 1]]) pred.attach_grad() with autograd.record(): loss = Lsep_func(pred, target) loss.backward(mxnet.nd.ones_like(loss)) print("lsep loss with function ", loss) print(pred.grad) warp_loss = WarpLoss(label_size=63) pred = nd.random.normal(shape=(10, 63)) target = nd.random.normal(shape=(10, 63)) target = nd.greater(target, 0.2) use_identity = True if use_identity: pred = nd.array([[0.9, 0.4, 0.5, 0.2], [0.1, 0.6, 0.2, 0.8]]) target = nd.array([[1, 1, 0, 0], [0, 1, 0, 1]]) pred.attach_grad() with autograd.record(): loss = warp_loss(pred, target) loss.backward() print("warp loss with nn block ", loss) print("pred.grad", pred.grad) # test for autograd function edition of WARPLoss warp_funcloss = WARP_funcLoss(label_size=4) if use_identity:
def forward(self, is_train, req, in_data, out_data, aux):
    """Crop each image around its salient region and resize the crop.

    Inputs (``in_data``):
        0. ``fea``     -- feature maps, indexed (batch, channel, row, column).
        1. ``data``    -- images, indexed (batch, channel, row, column).
        2. ``weights`` -- matrix multiplied with the normalised scores.
        3. ``prob``    -- scores, normalised along axis 1 after dividing by 3.

    The channel-weighted feature maps are averaged into one map per sample,
    divided by its maximum and thresholded. The bounding box of the selected
    cells is scaled to image coordinates, grown to a square of its longer
    side (clipped to the image) and the resulting crop is resized to
    ``self.outsize`` x ``self.outsize``.

    The result is written to ``out_data[0]`` via ``self.assign``; nothing is
    returned. ``is_train`` and ``aux`` are not used.
    """
    fea = in_data[0]
    data = in_data[1]
    weights = in_data[2]
    prob = in_data[3]

    # Softened softmax over axis 1, then one weight per feature channel.
    prob = nd.exp(prob / 3)
    prob = prob / nd.sum(prob, axis=1, keepdims=1)
    channel_w = nd.dot(prob, weights)
    channel_w = nd.expand_dims(channel_w, 2)
    channel_w = nd.expand_dims(channel_w, 3)
    fea_w = fea * channel_w

    d_w = data.shape[3]
    d_h = data.shape[2]
    # Only the row count of the feature map is read; the same value scales
    # both axes below -- assumes square feature maps, TODO confirm.
    w = fea.shape[2]
    n = fea.shape[0]

    fea = nd.mean(fea_w, axis=1, keepdims=1)
    max_val = nd.max(fea, axis=(2, 3), keepdims=1)
    fea = fea / max_val

    # The training and inference branches used identical thresholds, so the
    # masks are computed once.
    fea_mask = nd.greater_equal(fea, 0.1)
    fea_mask2 = nd.greater_equal(fea, 0.25)
    # Negated max-pooling is a min-pooling; a larger max-pooling follows.
    fea_mask1 = -nd.Pooling(-fea_mask, kernel=(5, 5), pool_type='max', pad=(2, 2))
    fea_mask1 = nd.Pooling(fea_mask1, kernel=(11, 11), pool_type='max', pad=(5, 5))
    # NOTE(review): the previous code first stored nd.sum(fea_mask1) in
    # `cmask` and immediately overwrote it with the comparison below. `fea`
    # was just divided by its maximum, so `fea > 4` looks unlikely to ever
    # hold; possibly `nd.greater(cmask, 4)` was intended. Behaviour is kept
    # as it was -- confirm before changing.
    cmask = nd.greater(fea, 4)
    fea_mask = cmask * fea_mask2 * fea_mask1 + (1 - cmask) * fea_mask2
    fea_mask = fea_mask[:, 0, :, :].asnumpy()

    shape = self.outsize
    img_res = nd.zeros((n, 3, shape, shape))
    for i in range(n):
        # Row 0 holds the row indices, row 1 the column indices.
        arg = np.float32(np.where(fea_mask[i] == 1))

        # Default to the whole image. The previous bare `except:` printed
        # and then carried on with a stale or undefined crop when no cell
        # was selected.
        img_crop = data[i]
        if arg.shape[1] > 0:
            ymin = np.int32(np.floor(np.min(arg[0]) * (d_h / w)))
            ymax = np.int32(np.ceil(np.max(arg[0]) * (d_h / w)))
            xmin = np.int32(np.floor(np.min(arg[1]) * (d_w / w)))
            xmax = np.int32(np.ceil(np.max(arg[1]) * (d_w / w)))
            x_center = (xmin + xmax) / 2
            y_center = (ymin + ymax) / 2
            longside = max(ymax - ymin, xmax - xmin)

            # Square box of side `longside`, clipped to the image.
            # (`np.int` no longer exists in current NumPy; builtin int is
            # what it aliased.)
            x = int(max(x_center - longside / 2, 0))
            l_x = int(min(x_center + longside / 2, d_w)) - x
            y = int(max(y_center - longside / 2, 0))
            l_y = int(min(y_center + longside / 2, d_h)) - y

            # An empty box cannot be resized; keep the whole image then.
            if l_x > 0 and l_y > 0:
                img_crop = data[i, :, y:y + l_y, x:x + l_x]

        img_crop = nd.expand_dims(img_crop, 0)
        img_crop = nd.contrib.BilinearResize2D(img_crop, height=shape, width=shape)
        img_res[i, :, :, :] = nd.squeeze(img_crop)

    self.assign(out_data[0], req[0], img_res)
def _sample_bernoulli(probability):
    """Draw one Bernoulli sample per entry of ``probability``.

    An entry of the result is 1 where ``probability`` exceeds a fresh
    uniform draw and 0 elsewhere.
    """
    # Create the noise on the same device as the input: comparing arrays
    # that live on different contexts fails, which the default-context draw
    # caused whenever `probability` was not on the default device.
    noise = nd.uniform(shape=probability.shape, ctx=probability.context)
    return nd.greater(probability, noise)