def forward(self, cls_pred, box_pred, cls_target, box_target):
    """Compute loss in entire batch across devices."""
    # require results across different devices at this time
    cls_pred, box_pred, cls_target, box_target = [
        _as_list(x) for x in (cls_pred, box_pred, cls_target, box_target)]
    # cross device reduction to obtain positive samples in entire batch
    num_pos = []
    for cp, bp, ct, bt in zip(*[cls_pred, box_pred, cls_target, box_target]):
        pos_samples = (ct > 0)
        num_pos.append(pos_samples.sum())
    num_pos_all = sum([p.asscalar() for p in num_pos])
    if num_pos_all < 1 and self._min_hard_negatives < 1:
        # no positive samples and no hard negatives, return dummy losses
        cls_losses = [nd.sum(cp * 0) for cp in cls_pred]
        box_losses = [nd.sum(bp * 0) for bp in box_pred]
        sum_losses = [nd.sum(cp * 0) + nd.sum(bp * 0)
                      for cp, bp in zip(cls_pred, box_pred)]
        return sum_losses, cls_losses, box_losses

    # compute element-wise cross entropy loss and sort, then perform negative mining
    cls_losses = []
    box_losses = []
    sum_losses = []
    for cp, bp, ct, bt in zip(*[cls_pred, box_pred, cls_target, box_target]):
        pred = nd.log_softmax(cp, axis=-1)
        pos = ct > 0
        cls_loss = -nd.pick(pred, ct, axis=-1, keepdims=False)
        rank = (cls_loss * (pos - 1)).argsort(axis=1).argsort(axis=1)
        hard_negative = rank < nd.maximum(
            self._min_hard_negatives,
            pos.sum(axis=1) * self._negative_mining_ratio).expand_dims(-1)
        # mask out if not positive or negative
        cls_loss = nd.where((pos + hard_negative) > 0, cls_loss,
                            nd.zeros_like(cls_loss))
        cls_losses.append(
            nd.sum(cls_loss, axis=0, exclude=True) / max(1., num_pos_all))
        bp = _reshape_like(nd, bp, bt)
        box_loss = nd.abs(bp - bt)
        box_loss = nd.where(box_loss > self._rho, box_loss - 0.5 * self._rho,
                            (0.5 / self._rho) * nd.square(box_loss))
        # box loss only applies to positive samples
        box_loss = box_loss * pos.expand_dims(axis=-1)
        box_losses.append(
            nd.sum(box_loss, axis=0, exclude=True) / max(1., num_pos_all))
        sum_losses.append(cls_losses[-1] + self._lambd * box_losses[-1])
    return sum_losses, cls_losses, box_losses
def mse(o, y):
    return nd.square(o - y).sum() * 0.5 / self.X.shape[0], nd.argmax(o, axis=1)
def grad_grad_op(x):
    return x / nd.sqrt((nd.square(x) + 1) ** 3)
def forward(self, fg, bg, pred, mask, merg):
    c = fg * pred + (1 - pred) * bg
    dis = mask * (c - merg)
    l = nd.sqrt(self.eps + nd.square(dis).sum(0))
    return l
def score(gradient, v, f):
    if 2 * f + 2 > v.shape[1]:
        f = int(math.floor((v.shape[1] - 2) / 2.0))
    num_neighbours = v.shape[1] - 2 - f
    sorted_distance = nd.square(v - gradient).sum(axis=0).sort()
    return nd.sum(sorted_distance[1:(1 + num_neighbours)]).asscalar()
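# Usage sketch for the distance-based score above, assuming (as the code implies)
# that `v` stores one candidate gradient per column and `gradient` is the column
# being scored, Krum-style; names and shapes here are illustrative only.
import math
import mxnet.ndarray as nd

v = nd.random.normal(shape=(10, 6))     # 6 candidate gradients of dimension 10
candidate = v[:, 0].reshape((10, 1))    # score the first candidate against the rest
print(score(candidate, v, f=1))         # lower score = closer to its nearest neighbours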
def forward(self, x):
    x = nd.sigmoid(nd.sum(nd.square(x), 1))
    return x
def garchLLH(y, par):
    h = garchSim(nd.square(y), par)
    T = y.shape[0]
    llh = -0.5 * (T - 1) * math.log(2 * math.pi) \
          - 0.5 * nd.sum(nd.log(h) + (y / nd.sqrt(h)) ** 2)
    return llh.asscalar()
def unlabeled_train_op_mmd_combine(self, update_enc=True):
    ''' Trains the MMD model '''
    batch_size = self.args['batch_size']
    model_ctx = self.model_ctx
    eps = 1e-10

    # Retrieve data
    docs = self.data.get_documents(key='train')
    y_true = np.random.dirichlet(
        np.ones(self.ndim_y) * self.args['dirich_alpha'], size=batch_size)
    y_true = nd.array(y_true, ctx=model_ctx)

    with autograd.record():
        ### reconstruction phase ###
        y_onehot_u = self.Enc(docs)
        y_onehot_u_softmax = nd.softmax(y_onehot_u)
        if self.args['latent_noise'] > 0:
            y_noise = np.random.dirichlet(
                np.ones(self.ndim_y) * self.args['dirich_alpha'], size=batch_size)
            y_noise = nd.array(y_noise, ctx=model_ctx)
            y_onehot_u_softmax = (1 - self.args['latent_noise']) * y_onehot_u_softmax \
                + self.args['latent_noise'] * y_noise
        x_reconstruction_u = self.Dec(y_onehot_u_softmax)

        logits = nd.log_softmax(x_reconstruction_u)
        loss_reconstruction = nd.mean(nd.sum(-docs * logits, axis=1))
        loss_total = loss_reconstruction * self.args['recon_alpha']

        ### mmd phase ###
        if self.args['adverse']:
            y_fake = self.Enc(docs)
            y_fake = nd.softmax(y_fake)
            loss_mmd = mmd_loss(y_true, y_fake, ctx_model=model_ctx,
                                t=self.args['kernel_alpha'])
            loss_total = loss_total + loss_mmd
        if self.args['l2_alpha'] > 0:
            loss_total = loss_total + self.args['l2_alpha'] * nd.mean(
                nd.sum(nd.square(y_onehot_u), axis=1))

    loss_total.backward()

    self.optimizer_enc.step(1)
    self.optimizer_dec.step(1)

    # self.m.args['batch_size']
    latent_max = nd.zeros(self.args['ndim_y'], ctx=model_ctx)
    for max_ind in nd.argmax(y_onehot_u, axis=1):
        latent_max[max_ind] += 1.0
    latent_max /= batch_size
    latent_entropy = nd.mean(
        nd.sum(-y_onehot_u_softmax * nd.log(y_onehot_u_softmax + eps), axis=1))
    latent_v = nd.mean(y_onehot_u_softmax, axis=0)
    dirich_entropy = nd.mean(nd.sum(-y_true * nd.log(y_true + eps), axis=1))

    if self.args['adverse']:
        loss_mmd_return = loss_mmd.asscalar()
    else:
        loss_mmd_return = 0.0

    return nd.mean(loss_reconstruction).asscalar(), loss_mmd_return, \
        latent_max.asnumpy(), latent_entropy.asscalar(), \
        latent_v.asnumpy(), dirich_entropy.asscalar()
                                           even_split=False)
angs_list = mx.gluon.utils.split_and_load(angs, ctx_list=devices, even_split=False)
cate_list = mx.gluon.utils.split_and_load(cate, ctx_list=devices, even_split=False)
loss_list = []
with mx.autograd.record():
    for data, lmks, angs, cate in zip(data_list, lmks_list, angs_list, cate_list):
        lmks_regs = net(data)
        lmks_regs = nd.Flatten(lmks_regs)
        lmks_loss = nd.square(lmks_regs - lmks)
        lmks_loss = nd.sum(lmks_loss, axis=1)
        # angs_loss = 1 - mx.nd.cos((angs_regs - angs))
        # angs_loss = mx.nd.sum(angs_loss, axis=1)
        loss = lmks_loss
        # if with_angle:
        #     loss = loss * angs_loss
        if with_category:
            loss = loss * cate
        loss_list.append(loss)
def RecLoss(rec_x, x):
    x_reshape = x.reshape((0, -1))
    diff = nd.square(x_reshape - rec_x)
    return nd.mean(nd.sum(diff, axis=1))
def flowdr(dem_fill, NoData, rows, cols, ctx, switch):
    ingrid = np.indices((rows, cols))
    ingrid[0]  # row indices
    ingrid[1]  # column indices
    ingridxmx = nd.array(ingrid[1], ctx[0]).reshape((1, 1, rows, cols))
    ingridymx = nd.array(ingrid[0], ctx[0]).reshape((1, 1, rows, cols))
    dem_fillmx = nd.array(dem_fill, ctx[0])
    demmx = dem_fillmx.reshape((1, 1, rows, cols))
    res = 1
    l = [0, 1, 2, 3, 4, 5, 6, 7, 0]
    direct = [1, 2, 4, 8, 16, 32, 64, 128]
    direct_d = [[1, 3], [2, 6], [4, 12], [8, 24], [16, 48], [32, 96], [64, 192], [128, 129]]
    weight = [None] * 8
    weight1 = [None] * 8
    convx = [None] * 8
    convy = [None] * 8
    convz = [None] * 8
    runlen = [1, ma.pow(2, 0.5), 1, ma.pow(2, 0.5), 1, ma.pow(2, 0.5), 1, ma.pow(2, 0.5)] * res
    n = [[[] for x in range(3)] for x in range(8)]  # list to store normal vectors for each facet
    s = [None] * 8
    d = [None] * 8
    weight[0] = nd.array([[0, 0, 0], [0, 1, -1], [0, 0, 0]], gpu(0))
    weight[1] = nd.array([[0, 0, -1], [0, 1, 0], [0, 0, 0]], gpu(0))
    weight[2] = nd.array([[0, -1, 0], [0, 1, 0], [0, 0, 0]], gpu(0))
    weight[3] = nd.array([[-1, 0, 0], [0, 1, 0], [0, 0, 0]], gpu(0))
    weight[4] = nd.array([[0, 0, 0], [-1, 1, 0], [0, 0, 0]], gpu(0))
    weight[5] = nd.array([[0, 0, 0], [0, 1, 0], [-1, 0, 0]], gpu(0))
    weight[6] = nd.array([[0, 0, 0], [0, 1, 0], [0, -1, 0]], gpu(0))
    weight[7] = nd.array([[0, 0, 0], [0, 1, 0], [0, 0, -1]], gpu(0))
    weight1[0] = nd.array([[0, 0, 0], [0, 1, -10], [0, 0, 0]], gpu(0))
    weight1[1] = nd.array([[0, 0, -10], [0, 1, 0], [0, 0, 0]], gpu(0))
    weight1[2] = nd.array([[0, -10, 0], [0, 1, 0], [0, 0, 0]], gpu(0))
    weight1[3] = nd.array([[-10, 0, 0], [0, 1, 0], [0, 0, 0]], gpu(0))
    weight1[4] = nd.array([[0, 0, 0], [-10, 1, 0], [0, 0, 0]], gpu(0))
    weight1[5] = nd.array([[0, 0, 0], [0, 1, 0], [-10, 0, 0]], gpu(0))
    weight1[6] = nd.array([[0, 0, 0], [0, 1, 0], [0, -10, 0]], gpu(0))
    weight1[7] = nd.array([[0, 0, 0], [0, 1, 0], [0, 0, -10]], gpu(0))
    d0 = nd.zeros((rows, cols), ctx[0], dtype='float32')
    dd = nd.zeros((rows, cols), ctx[0], dtype='float32')
    d_flat = nd.zeros((rows, cols), ctx[0], dtype='float32')
    flat = nd.zeros((rows, cols), ctx[0], dtype='float32')
    dep = nd.zeros((rows, cols), ctx[0], dtype='float32')
    high = nd.zeros((rows, cols), ctx[0], dtype='float32')
    fd = nd.zeros((rows, cols), ctx[0], dtype='float32') - 999
    d_compact = nd.zeros((rows, cols), ctx[0], dtype='float32') - 1

    for i in range(0, 8):
        w = weight[i].reshape((1, 1, 3, 3))
        convz[i] = nd.Convolution(data=demmx, weight=w, kernel=(3, 3), no_bias=True,
                                  num_filter=1, pad=(1, 1), cudnn_tune='off')
        convz[i] = convz[i][0, 0, :, :]
        if switch == 1 or switch == 3:
            convx[i] = nd.Convolution(data=ingridxmx, weight=w, kernel=(3, 3), no_bias=True,
                                      num_filter=1, pad=(1, 1), cudnn_tune='off')
            convy[i] = nd.Convolution(data=ingridymx, weight=w, kernel=(3, 3), no_bias=True,
                                      num_filter=1, pad=(1, 1), cudnn_tune='off')
            convx[i] = convx[i][0, 0, :, :]
            convy[i] = convy[i][0, 0, :, :]

    if switch == 1 or switch == 3:
        for p in range(0, 8):  # 8 facets from N-NE clockwise
            l0 = l[p]
            l1 = l[p + 1]
            d[l0] = d0 - 999  # NoData value
            dmax = d0 - 999
            smax = d0 - 999
            n[l0][0] = convz[l0] * convy[l1] - convz[l1] * convy[l0]  # nx
            n[l0][1] = convz[l0] * convx[l1] - convz[l1] * convx[l0]  # ny
            n[l0][2] = convy[l0] * convx[l1] - convy[l1] * convx[l0]  # nz
            # make boolean masks to determine direction d and slope s
            d[l0] = nd.where(condition=((n[l0][0] == 0) * (n[l0][1] >= 0)), x=d0, y=d[l0])
            d[l0] = nd.where(condition=((n[l0][0] == 0) * (n[l0][1] < 0)), x=d0 + ma.pi, y=d[l0])
            d[l0] = nd.where(condition=(n[l0][0] > 0), x=ma.pi / 2 - nd.arctan(n[l0][1] / n[l0][0]), y=d[l0])
            d[l0] = nd.where(condition=(n[l0][0] < 0), x=3 * ma.pi / 2 - nd.arctan(n[l0][1] / n[l0][0]), y=d[l0])
            d[l0] = nd.where(condition=((convz[l0] <= 0) * (convz[l1] <= 0)), x=dmax, y=d[l0])
            # slope of the triangular facet
            s[l0] = -nd.tan(nd.arccos(n[l0][2] / (nd.sqrt(nd.square(n[l0][0]) + nd.square(n[l0][1]) + nd.square(n[l0][2])))))
            s[l0] = nd.where(condition=((convz[l0] <= 0) * (convz[l1] <= 0)), x=smax, y=s[l0])
            # modify the scenario when the steepest slope is outside the 45 degree range of each facet
            dmax = nd.where(condition=((convz[l0] / runlen[l0] >= convz[l1] / runlen[l0]) * (convz[l0] > 0)), x=d0 + ma.pi * l0 / 4, y=dmax)
            dmax = nd.where(condition=((convz[l0] / runlen[l0] < convz[l1] / runlen[l0]) * (convz[l1] > 0)), x=d0 + ma.pi * (l0 + 1) / 4, y=dmax)
            smax = nd.where(condition=((convz[l0] >= convz[l1]) * (convz[l0] > 0)), x=convz[l0] / runlen[l0], y=smax)
            smax = nd.where(condition=((convz[l0] < convz[l1]) * (convz[l1] > 0)), x=convz[l1] / runlen[l1], y=smax)
            d[l0] = nd.where(condition=((d[l0] < ma.pi * l0 / 4) + (d[l0] > ma.pi * l1 / 4)), x=dmax, y=d[l0])
            s[l0] = nd.where(condition=((d[l0] < ma.pi * l0 / 4) + (d[l0] > ma.pi * l1 / 4)), x=smax, y=s[l0])
            if switch == 1:
                # flat and depression indicator grids
                flat = (convz[l0] == 0) + flat
                dep = (convz[l0] < 0) + dep
                high = (convz[l0] > 0) + high

        for q in range(0, 8):  # check if the 45 degree range angles need to be maintained, otherwise delete (set to NoData)
            l0 = l[q]
            l1 = l[q + 1]
            l2 = l[q - 1]
            dmax = d0 - 999
            if q == 0:
                dmax = nd.where(condition=(d[0] == d[1]), x=d[0], y=dmax)
                dmax = nd.where(condition=(d[0] == d[7]), x=d[0], y=dmax)
                d[0] = nd.where(condition=((d[0] == ma.pi * l0 / 4) + (d[0] == ma.pi * l1 / 4)), x=dmax, y=d[0])
            else:
                dmax = nd.where(condition=(d[l0] == d[l1]), x=d[l0], y=dmax)
                dmax = nd.where(condition=(d[l0] == d[l2]), x=d[l0], y=dmax)
                d[l0] = nd.where(condition=((d[l0] == ma.pi * l0 / 4) + (d[l0] == ma.pi * l1 / 4)), x=dmax, y=d[l0])

    # check if flat or surface depression area, then label with -1 or -10 respectively
    if switch == 1:
        fd = nd.where(condition=(flat == 8), x=d0 - 2, y=fd)  # flats
        fd = nd.where(condition=(dep >= 1) * (high == 0), x=d0 - 3, y=fd)  # high edge
        high_zero = nd.where(condition=(high == 0), x=d0 + 1, y=d0)
    for j in range(0, 8):
        if switch == 1 or switch == 2:
            d_flat = nd.where(condition=(convz[j] == 0), x=d0 + direct[j], y=d0) + d_flat
        if switch == 1:
            flat_near = nd.where(condition=(convz[j] == 0), x=d0 + 5, y=d0)
            dd1 = high_zero + flat_near
            w = weight1[j].reshape((1, 1, 3, 3))
            dd1 = dd1.reshape((1, 1, rows, cols))
            conv_near = nd.Convolution(data=dd1, weight=w, kernel=(3, 3), no_bias=True,
                                       num_filter=1, pad=(1, 1), cudnn_tune='off')
            conv_near = conv_near[0, 0, :, :]
            dd = nd.where(condition=(conv_near == -5) + (conv_near == -59) + (conv_near == -54) + (conv_near == -4),
                          x=d0 + 1, y=d0) + dd
        if switch == 1 or switch == 3:
            d_compact = nd.where(condition=(d[j] == ma.pi * j / 4), x=d0 + direct_d[j][0], y=d_compact)
            d_compact = nd.where(condition=(d[j] > j * ma.pi / 4) * (d[j] < (j + 1) * ma.pi / 4), x=d0 + direct_d[j][1], y=d_compact)
    if switch == 1 or switch == 3:
        d_compact = nd.where(condition=(dem_fillmx == d0 + NoData), x=d0 - 999, y=d_compact)  # NoData
    if switch == 1:
        fd = nd.where(condition=(dd >= 1) * (high >= 1), x=d0 - 1, y=fd)  # low edge
        fd = nd.where(condition=(dep == 8), x=d0 - 10, y=fd)  # lowest points in depressions
        return (fd.asnumpy(), d_compact.asnumpy(), d_flat.asnumpy())
    if switch == 2:
        return (d_flat.asnumpy())
    if switch == 3:
        return (d_compact.asnumpy())
def Norm(x):
    return nd.sqrt(nd.sum(nd.square(x), axis=1, keepdims=True))
def _variance(a: nd.NDArray) -> nd.NDArray:
    """Compute variance of a of shape [n_samples, ...]."""
    mean = nd.mean(a, 0, keepdims=True)
    return nd.mean(nd.square(a - mean), 0)
def CapLoss(y_pred, y_true):
    L = y_true * nd.square(nd.maximum(0., 0.9 - y_pred)) + \
        0.5 * (1 - y_true) * nd.square(nd.maximum(0., y_pred - 0.1))
    return nd.mean(nd.sum(L, 1))
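# Usage sketch for the margin loss above, assuming y_pred holds per-class capsule
# lengths in [0, 1] and y_true is one-hot, both of shape (batch, num_classes);
# the toy values below are illustrative only.
import mxnet.ndarray as nd

y_true = nd.array([[1, 0, 0], [0, 1, 0]])
y_pred = nd.array([[0.95, 0.05, 0.10], [0.20, 0.80, 0.30]])
print(CapLoss(y_pred, y_true))  # small loss: predicted lengths agree with the labels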
def total_variation_loss(x):
    """ regularize convolutional masks (not currently in use) """
    a = nd.square(x[:, :, :-1, :-1] - x[:, :, 1:, :-1])
    b = nd.square(x[:, :, :-1, :-1] - x[:, :, :-1, 1:])
    return nd.sum(nd.mean(nd.power(a + b, 1.25), axis=(2, 3)))
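# Usage sketch for the total-variation term above, assuming NCHW mask tensors;
# a constant image has no spatial variation, so the loss below is exactly 0.
import mxnet.ndarray as nd

masks = nd.ones((1, 3, 8, 8))           # (batch, channels, height, width)
print(total_variation_loss(masks))      # 0: nothing to penalize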
def diffusion_kernel(a, tmpt, dim):
    # return (4 * np.pi * tmpt)**(-dim / 2) * nd.exp(- nd.square(nd.arccos(a)) / tmpt)
    return nd.exp(-nd.square(nd.arccos(a)) / tmpt)
def loss(predictions, targets):
    # return -nd.mean(targets * nd.log(predictions))
    # return -nd.mean((targets * nd.log(predictions)) + ((1 - targets) * nd.log(1 - predictions)))
    return nd.mean(nd.square(predictions - targets))
def loss_fn(y_pred, y):
    return nd.mean(nd.square(y_pred - y))
def fit(self, num_steps=1):
    """
    Fit the models
    Returns:
        Loss Functions (Q1-mse, Q2-mse, alpha-entropy, Policy-kl)
    """
    logger_data = {k: [] for k in ["LossPi", "LossQ1", "LossQ2", "LossV"]}
    for step in range(num_steps):
        # sample a batch from memory
        minibatch = self.memory.sample(self.batch_size)
        obs = nd.array(minibatch["obs"], self.ctx)
        acts = nd.array(minibatch["act"], self.ctx)
        rewards = nd.array(minibatch["rew"], self.ctx)
        next_obs = nd.array(minibatch["next_obs"], self.ctx)
        nonterm = nd.array(minibatch["nt"], self.ctx)
        lr = self.lr(self.steps) * self.lrmult

        # update the policy function
        with autograd.record():
            _mu, _pi, _logp_pi = self.policy(obs)
            _obspi = nd.concat(obs, _pi, dim=-1)
            _q1_pi = self.qfn1(_obspi)
            pi_loss = nd.mean(self.alpha * _logp_pi - _q1_pi)
        pi_loss.backward()
        self.mu.update(lr)
        self.logstd.update(lr)
        self.policy_base.update(lr)

        # update the value functions
        logp_pi = nd.stop_gradient(_logp_pi)
        obspi = nd.stop_gradient(_obspi)
        obsact = nd.concat(obs, acts, dim=-1)
        q1_pi = self.qfn1(obspi)
        q2_pi = self.qfn2(obspi)
        min_q_pi = nd.minimum(q1_pi, q2_pi)
        v_targ = self.vfn_targ(next_obs)
        q_backup = nd.stop_gradient(rewards + self.gamma * nonterm * v_targ)
        v_backup = nd.stop_gradient(min_q_pi - self.alpha * logp_pi)
        with autograd.record():
            _q1 = self.qfn1(obsact)
            _q2 = self.qfn2(obsact)
            _v = self.vfn(obs)
            q1_loss = 0.5 * nd.mean(nd.square(q_backup - _q1))
            q2_loss = 0.5 * nd.mean(nd.square(q_backup - _q2))
            v_loss = 0.5 * nd.mean(nd.square(v_backup - _v))
            total_loss = q1_loss + q2_loss + v_loss
        total_loss.backward()
        self.qfn1.update(lr)
        self.qfn2.update(lr)
        self.vfn.update(lr)

        # update the target network
        for i in range(len(self.vfn.weights)):
            self.vfn_targ.weights[i][:] = \
                self.polyak * self.vfn_targ.weights[i][:] + \
                (1 - self.polyak) * self.vfn.weights[i][:]

        logger_data["LossPi"].append(pi_loss.asnumpy()[0])
        logger_data["LossQ1"].append(q1_loss.asnumpy()[0])
        logger_data["LossQ2"].append(q2_loss.asnumpy()[0])
        logger_data["LossV"].append(v_loss.asnumpy()[0])
    return logger_data
def forward(self, x):
    x = nd.sqrt(nd.sum(nd.square(x), 1))
    return x
def neglogp(action, mean, logstd):
    assert (mean.shape[-1] == logstd.shape[-1])
    std = nd.exp(logstd) + 1e-8
    return 0.5 * nd.sum(nd.square((action - mean) / std), axis=-1) \
        + 0.5 * np.log(2.0 * np.pi) * action.shape[-1] \
        + nd.sum(logstd, axis=-1)
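# Usage sketch for neglogp above, assuming a diagonal Gaussian with per-dimension
# log standard deviations and shapes (batch, act_dim); values are illustrative only.
import numpy as np
import mxnet.ndarray as nd

mean = nd.zeros((2, 3))
logstd = nd.zeros((2, 3))                          # std = 1 everywhere
action = nd.array([[0., 0., 0.], [1., -1., 0.5]])
print(neglogp(action, mean, logstd))               # first entry equals 1.5 * log(2 * pi)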
    if len(grads_list) >= args.nworkers:
        accumulate_param = 0
        accumulate_grad = 0
        if model_idx != len(params_prev_list) - 1:
            grads_prev = grads_list[-1]
            params_prev = params_prev_list[-1]
        else:
            grads_prev = grads_list[-2]
            params_prev = params_prev_list[-2]
        for param, param_prev, grad_prev in zip(
                net.collect_params().values(), params_prev, grads_prev):
            if param.grad_req != 'null':
                grad_current = param.grad()
                param_current = param.data()
                accumulate_param = accumulate_param + nd.square(
                    param_current - param_prev).sum()
                accumulate_grad = accumulate_grad + nd.square(
                    grad_current - grad_prev).sum()
        lips = math.sqrt(accumulate_grad.asscalar()) / math.sqrt(
            accumulate_param.asscalar())
        if lips <= np.quantile(lips_list, quantile_q):
            byz_flag = False
            accept_counter = accept_counter + 1
        nd.waitall()
    else:
        byz_flag = False
        accept_counter = accept_counter + 1
elif args.byz_test == 'zeno++':
    zeno_max_delay = args.zeno_delay
    zeno_rho = args.rho
    zeno_epsilon = args.epsilon
def squash(x, axis):
    s_squared_norm = nd.sum(nd.square(x), axis, keepdims=True)
    scale = s_squared_norm / (1 + s_squared_norm) / nd.sqrt(s_squared_norm + 1e-5)
    return scale * x
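# Usage sketch for squash above, assuming capsule vectors lie along the last axis;
# squashing keeps direction but maps every vector length into (0, 1).
import mxnet.ndarray as nd

caps = nd.random.uniform(shape=(2, 10, 16))          # (batch, num_capsules, capsule_dim)
squashed = squash(caps, axis=-1)
lengths = nd.sqrt(nd.sum(nd.square(squashed), axis=-1))
print(lengths.max())                                 # strictly below 1 after squashing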
def _update_params(self, accumulated_grads):
    # scale gradients by lot size, add noise, and update the parameters
    for param_name, param in self._params.items():
        # average the clipped gradients and then add noise to each averaged gradient
        param_grad_update = (accumulated_grads[param_name] / self._hyperparams['lot_size']) + \
            mx.random.normal(0, self._hyperparams['sigma'], param.shape, ctx=self._model_ctx)
        # update biased first moment estimate
        self._m[param_name] = self._hyperparams['beta_1'] * self._m[param_name] + \
            (1 - self._hyperparams['beta_1']) * param_grad_update
        # update biased second raw moment estimate
        self._v[param_name] = self._hyperparams['beta_2'] * self._v[param_name] + \
            (1 - self._hyperparams['beta_2']) * nd.square(param_grad_update)
        # compute bias-corrected first moment estimate
        m_hat = self._m[param_name] / (1 - nd.power(self._hyperparams['beta_1'], self._step + 1))
        # compute bias-corrected second raw moment estimate
        v_hat = self._v[param_name] / (1 - nd.power(self._hyperparams['beta_2'], self._step + 1))
        # update params with ADAM
        param[:] = param - self._hyperparams['lr'] * m_hat / (nd.sqrt(v_hat) + 1e-8)
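# Minimal sketch of the same bias-corrected Adam step on a single toy parameter,
# with assumed hyperparameters (beta_1=0.9, beta_2=0.999, lr=0.01) and the noise
# term omitted so that only the moment updates are shown.
from mxnet import nd

param = nd.array([1.0, -2.0])
grad = nd.array([0.5, -0.5])
m = nd.zeros_like(param)
v = nd.zeros_like(param)
beta_1, beta_2, lr, step = 0.9, 0.999, 0.01, 0

m = beta_1 * m + (1 - beta_1) * grad              # biased first moment estimate
v = beta_2 * v + (1 - beta_2) * nd.square(grad)   # biased second raw moment estimate
m_hat = m / (1 - beta_1 ** (step + 1))            # bias correction
v_hat = v / (1 - beta_2 ** (step + 1))
param[:] = param - lr * m_hat / (nd.sqrt(v_hat) + 1e-8)
print(param)                                      # roughly [0.99, -1.99]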
def Squash(vector, axis):
    norm = nd.sum(nd.square(vector), axis, keepdims=True)
    v_j = norm / (1 + norm) / nd.sqrt(norm) * vector
    return v_j
def my_l2_loss(X, Y):
    num_instances = X.shape[0]
    return nd.sum(nd.square(X - Y)) / (2 * num_instances)
def forward(self, x):
    out = nd.sqrt(nd.sum(nd.square(x), self.axis))
    return out
def forward(self, cls_pred, ori_pred, box_pred, cls_target, ori_target, box_target):
    """Compute loss in entire batch across devices."""
    # require results across different devices at this time
    cls_pred, ori_pred, box_pred, cls_target, box_target = [
        _as_list(x) for x in (cls_pred, ori_pred, box_pred, cls_target, box_target)]
    # cross device reduction to obtain positive samples in entire batch
    num_pos = []
    for cp, op, bp, ct, bt in zip(
            *[cls_pred, ori_pred, box_pred, cls_target, box_target]):
        pos_samples = (ct > 0)
        num_pos.append(pos_samples.sum())
    num_pos_all = sum([p.asscalar() for p in num_pos])
    if num_pos_all < 1 and self._min_hard_negatives < 1:
        # no positive samples and no hard negatives, return dummy losses
        cls_losses = [nd.sum(cp * 0) for cp in cls_pred]
        ori_losses = [nd.sum(op * 0) for op in ori_pred]
        box_losses = [nd.sum(bp * 0) for bp in box_pred]
        sum_losses = [
            nd.sum(cp * 0) + nd.sum(op * 0) + nd.sum(bp * 0)
            for cp, op, bp in zip(cls_pred, ori_pred, box_pred)
        ]
        return sum_losses, cls_losses, ori_losses, box_losses

    # compute element-wise cross entropy loss and sort, then perform negative mining
    cls_losses = []
    ori_losses = []
    box_losses = []
    sum_losses = []
    for cp, op, bp, ct, ot, bt in zip(
            *[cls_pred, ori_pred, box_pred, cls_target, ori_target, box_target]):
        pred = nd.log_softmax(cp, axis=-1)
        pos = ct > 0
        cls_loss = -nd.pick(pred, ct, axis=-1, keepdims=False)
        rank = (cls_loss * (pos - 1)).argsort(axis=1).argsort(axis=1)
        hard_negative = rank < nd.maximum(
            self._min_hard_negatives,
            pos.sum(axis=1) * self._negative_mining_ratio).expand_dims(-1)
        # mask out if not positive or negative
        cls_loss = nd.where((pos + hard_negative) > 0, cls_loss,
                            nd.zeros_like(cls_loss))
        cls_losses.append(
            nd.sum(cls_loss, axis=0, exclude=True) / max(1., num_pos_all))

        pred = nd.log_softmax(op, axis=-1)
        pos = ot > 0
        ori_loss = -nd.pick(pred, ot, axis=-1, keepdims=False)
        rank = (ori_loss * (pos - 1)).argsort(axis=1).argsort(axis=1)
        hard_negative = rank < nd.maximum(
            self._min_hard_negatives,
            pos.sum(axis=1) * self._negative_mining_ratio).expand_dims(-1)
        # mask out if not positive or negative
        ori_loss = nd.where((pos + hard_negative) > 0, ori_loss,
                            nd.zeros_like(ori_loss))
        ori_losses.append(
            nd.sum(ori_loss, axis=0, exclude=True) / max(1., num_pos_all))

        bp = _reshape_like(nd, bp, bt)
        box_loss = nd.abs(bp - bt)
        box_loss = nd.where(box_loss > self._rho, box_loss - 0.5 * self._rho,
                            (0.5 / self._rho) * nd.square(box_loss))
        # box loss only applies to positive samples
        box_loss = box_loss * pos.expand_dims(axis=-1)
        box_losses.append(
            nd.sum(box_loss, axis=0, exclude=True) / max(1., num_pos_all))
        sum_losses.append(cls_losses[-1] + self._lambd * box_losses[-1])
        sum_losses.append(ori_losses[-1] + self._lambd * box_losses[-1])
    return sum_losses, cls_losses, ori_losses, box_losses
def forward(self, cls_pred, box_pred, coef_center_pred, coef_pred, cls_target,
            box_target, coef_center_target, coef_target):
    """Compute loss in entire batch across devices."""
    # require results across different devices at this time
    # print(cls_pred[0].shape, box_pred[0].shape, coef_center_pred[0].shape)
    cls_pred, box_pred, coef_center_pred, coef_pred, cls_target, box_target, \
        coef_center_target, coef_target = [
            _as_list(x) for x in (cls_pred, box_pred, coef_center_pred, coef_pred,
                                  cls_target, box_target, coef_center_target,
                                  coef_target)]
    # cross device reduction to obtain positive samples in entire batch
    num_pos = []
    for cp, bp, ct, bt in zip(*[cls_pred, box_pred, cls_target, box_target]):
        pos_samples = (ct > 0)
        num_pos.append(pos_samples.sum())
    num_pos_all = sum([p.asscalar() for p in num_pos])
    if num_pos_all < 1:
        # no positive samples found, return dummy losses
        return nd.zeros((1,)), nd.zeros((1,)), nd.zeros((1,)), nd.zeros((1,)), nd.zeros((1,))

    # compute element-wise cross entropy loss and sort, then perform negative mining
    cls_losses = []
    box_losses = []
    coef_center_losses = []
    coef_losses = []
    sum_losses = []
    for cp, bp, coefcp, coefp, ct, bt, coefct, coeft in zip(*[
            cls_pred, box_pred, coef_center_pred, coef_pred, cls_target,
            box_target, coef_center_target, coef_target]):
        pred = nd.log_softmax(cp, axis=-1)
        pos = ct > 0
        cls_loss = -nd.pick(pred, ct, axis=-1, keepdims=False)
        rank = (cls_loss * (pos - 1)).argsort(axis=1).argsort(axis=1)
        hard_negative = rank < (
            pos.sum(axis=1) * self._negative_mining_ratio).expand_dims(-1)
        # mask out if not positive or negative
        cls_loss = nd.where((pos + hard_negative) > 0, cls_loss,
                            nd.zeros_like(cls_loss))
        cls_losses.append(nd.sum(cls_loss, axis=0, exclude=True) / num_pos_all)

        # print(bp.shape, bt.shape)
        bp = _reshape_like(nd, bp, bt)
        box_loss = nd.abs(bp - bt)
        box_loss = nd.where(box_loss > self._rho, box_loss - 0.5 * self._rho,
                            (0.5 / self._rho) * nd.square(box_loss))
        # box loss only applies to positive samples
        box_loss = box_loss * pos.expand_dims(axis=-1)
        box_losses.append(nd.sum(box_loss, axis=0, exclude=True) / num_pos_all)

        coefcp = _reshape_like(nd, coefcp, coefct)
        coef_center_loss = nd.abs(coefcp - coefct)
        coef_center_loss = nd.where(coef_center_loss > self._rho,
                                    coef_center_loss - 0.5 * self._rho,
                                    (0.5 / self._rho) * nd.square(coef_center_loss))
        coef_center_loss = coef_center_loss * pos.expand_dims(axis=-1)
        coef_center_losses.append(
            nd.sum(coef_center_loss, axis=0, exclude=True) / num_pos_all)

        coefp = _reshape_like(nd, coefp, coeft)
        coef_loss = nd.abs(coefp - coeft)
        coef_loss = nd.where(coef_loss > self._rho, coef_loss - 0.5 * self._rho,
                             (0.5 / self._rho) * nd.square(coef_loss))
        coef_loss = coef_loss * pos.expand_dims(axis=-1)
        coef_losses.append(nd.sum(coef_loss, axis=0, exclude=True) / num_pos_all)

        sum_losses.append(cls_losses[-1] + self._lambd * box_losses[-1] +
                          coef_losses[-1] + coef_center_losses[-1])
    return sum_losses, cls_losses, box_losses, coef_center_losses, coef_losses