# Assumed imports for the snippets below (not part of the original excerpt):
import numpy as np
import mxnet as mx
from mxnet import nd
import horovod.mxnet as hvd
from mxnet.ndarray.linalg import gemm2  # used by ISSM()


def compute_retrospective_loss(self, observed_arr, encoded_arr, decoded_arr, re_encoded_arr):
    '''
    Compute retrospective loss.

    Returns:
        `mx.nd.NDArray` of losses, one per batch sample
        (the clipped delta averaged over all non-batch axes).
    '''
    if self.__output_neuron_count == self.__hidden_neuron_count:
        target_arr = nd.broadcast_sub(
            encoded_arr,
            nd.expand_dims(observed_arr.mean(axis=2), axis=2)
        )
        summary_delta_arr = nd.sqrt(nd.power(decoded_arr - target_arr, 2))
    else:
        # For each batch, draw a subset of dimensions uniformly at random.
        if self.__output_neuron_count > self.__hidden_neuron_count:
            all_dim_arr = np.arange(self.__output_neuron_count)
            np.random.shuffle(all_dim_arr)
            choiced_dim_arr = all_dim_arr[:self.__hidden_neuron_count]
            target_arr = nd.broadcast_sub(
                encoded_arr,
                nd.expand_dims(observed_arr[:, :, choiced_dim_arr].mean(axis=2), axis=2)
            )
            summary_delta_arr = nd.sqrt(
                nd.power(decoded_arr[:, :, choiced_dim_arr] - target_arr, 2)
            )
        else:
            all_dim_arr = np.arange(self.__hidden_neuron_count)
            np.random.shuffle(all_dim_arr)
            choiced_dim_arr = all_dim_arr[:self.__output_neuron_count]
            target_arr = nd.broadcast_sub(
                encoded_arr[:, :, choiced_dim_arr],
                nd.expand_dims(observed_arr.mean(axis=2), axis=2)
            )
            summary_delta_arr = nd.sqrt(nd.power(decoded_arr - target_arr, 2))

    # Mismatch between the final encoding and the re-encoding of the decoding.
    match_delta_arr = None
    for i in range(self.__batch_size):
        arr = nd.sqrt(nd.power(encoded_arr[i, -1] - re_encoded_arr[i, -1], 2))
        if match_delta_arr is None:
            match_delta_arr = nd.expand_dims(arr, axis=0)
        else:
            match_delta_arr = nd.concat(
                match_delta_arr,
                nd.expand_dims(arr, axis=0),
                dim=0
            )

    delta_arr = summary_delta_arr + nd.expand_dims(
        self.__retrospective_lambda * match_delta_arr,
        axis=1
    )

    # Clip the delta by its global L2 norm.
    v = nd.norm(delta_arr)
    if v > self.__grad_clip_threshold:
        delta_arr = delta_arr * self.__grad_clip_threshold / v

    loss = nd.mean(delta_arr, axis=0, exclude=True)
    return loss
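# --- Illustrative sketch (not part of the original code) ---
# The tail of compute_retrospective_loss() rescales the whole delta tensor
# by its global L2 norm, the same scheme as gradient-norm clipping. A
# minimal standalone sketch, with an arbitrary threshold and random input:
#
#     import mxnet as mx
#     from mxnet import nd
#
#     delta_arr = nd.random.normal(shape=(4, 10, 8))
#     threshold = 1.0
#
#     v = nd.norm(delta_arr)                     # global L2 norm over all elements
#     if v > threshold:
#         delta_arr = delta_arr * threshold / v  # rescale so the norm equals the threshold
#
#     print(nd.norm(delta_arr))                  # <= threshold (up to float error)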
def backward_sample(self, total_feature, label):
    this_rank_classes = int(self.memory_bank.num_sample)
    local_index, unique_sorted_global_label = self.memory_bank.sample(label)

    # Map global labels to local sampled indices (-1 if not sampled here)
    _mapping_dict = {}
    local_sampled_class = local_index + self.rank * self.memory_bank.num_local
    global_label_set = set(unique_sorted_global_label)
    for idx, absolute_label in enumerate(local_sampled_class):
        if absolute_label in global_label_set:
            _mapping_dict[absolute_label] = idx + self.rank * self.memory_bank.num_sample

    label_list = list(label.asnumpy())
    mapping_label = []
    for i in range(len(label_list)):
        absolute_label = label_list[i]
        if absolute_label in _mapping_dict:
            mapping_label.append(_mapping_dict[absolute_label])
        else:
            mapping_label.append(-1)
    mapping_label = nd.array(mapping_label, dtype=np.int32)

    # Get weight
    local_index = nd.array(local_index)
    local_index = self.get_ndarray2(self.gpu, "local_index", local_index)
    sample_weight, sample_weight_mom = self.memory_bank.get(local_index)

    # Sync to gpu (both branches of the original if/else were identical)
    _data = self.get_ndarray2(self.gpu, "data_%d" % self.rank, total_feature)
    _weight = self.get_ndarray2(self.gpu, 'weight_%d' % self.rank, sample_weight)
    _weight_mom = self.get_ndarray2(self.gpu, 'weight_mom_%d' % self.rank,
                                    sample_weight_mom)

    # Attach grad
    _data.attach_grad()
    _weight.attach_grad()

    # Convert label
    _label = self.get_ndarray2(self.gpu, 'mapping_label_%d' % self.rank, mapping_label)
    _label = _label - int(self.rank * self.memory_bank.num_sample)
    _fc7, _one_hot = self.fc7_model.forward(_data, _weight,
                                            mapping_label=_label,
                                            depth=this_rank_classes)

    # Sync max across ranks for a numerically stable softmax
    max_fc7 = nd.max(_fc7, axis=1, keepdims=True)
    max_fc7 = nd.reshape(max_fc7, -1)
    total_max_fc7 = self.get_ndarray(context=self.gpu, name='total_max_fc7',
                                     shape=(max_fc7.shape[0], self.size),
                                     dtype='float32')
    total_max_fc7[:] = 0
    total_max_fc7[:, self.rank] = max_fc7
    hvd.allreduce_(total_max_fc7, average=False)

    global_max_fc7 = self.get_ndarray(context=self.gpu, name='global_max_fc7',
                                      shape=(max_fc7.shape[0], 1),
                                      dtype='float32')
    nd.max(total_max_fc7, axis=1, keepdims=True, out=global_max_fc7)

    # Calculate exp(logits)
    _fc7_grad = nd.broadcast_sub(_fc7, global_max_fc7)
    _fc7_grad = nd.exp(_fc7_grad)

    # Calculate sum
    sum_fc7 = nd.sum(_fc7_grad, axis=1, keepdims=True)
    global_sum_fc7 = hvd.allreduce(sum_fc7, average=False)

    # Calculate grad
    _fc7_grad = nd.broadcast_div(_fc7_grad, global_sum_fc7)

    # Calculate loss
    tmp = _fc7_grad * _one_hot
    tmp = nd.sum(tmp, axis=1, keepdims=True)
    tmp = self.get_ndarray2(self.gpu, 'ctx_loss', tmp)
    tmp = hvd.allreduce(tmp, average=False)
    global_loss = -nd.mean(nd.log(tmp + 1e-30))

    _fc7_grad = _fc7_grad - _one_hot

    # Backward
    _fc7.backward(out_grad=_fc7_grad)

    # Update center
    _weight_grad = _weight.grad
    self.memory_optimizer.update(weight=_weight, grad=_weight_grad,
                                 state=_weight_mom,
                                 learning_rate=self.memory_lr)
    if self.memory_bank.gpu:
        self.memory_bank.set(index=local_index,
                             updated_weight=_weight,
                             updated_weight_mom=_weight_mom)
    else:
        self.memory_bank.set(index=local_index,
                             updated_weight=self.get_ndarray2(
                                 mx.cpu(), "cpu_weight_%d" % self.rank, _weight),
                             updated_weight_mom=self.get_ndarray2(
                                 mx.cpu(), "cpu_weight_mom_%d" % self.rank, _weight_mom))
    return _data.grad, global_loss
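# --- Illustrative sketch (not part of the original code) ---
# Toy version of the label-mapping step in backward_sample(): each global
# label that was sampled on this rank maps to its position among the
# sampled classes (offset by rank), and everything else becomes -1 so its
# one-hot row is all zeros. All values below are made up for illustration.
#
#     import numpy as np
#
#     rank, num_local, num_sample = 0, 10, 4
#     local_index = np.array([2, 5, 7, 9])                 # classes sampled on this rank
#     local_sampled_class = local_index + rank * num_local
#
#     mapping_dict = {c: i + rank * num_sample
#                     for i, c in enumerate(local_sampled_class)}
#
#     labels = [5, 3, 9, 2]
#     mapping_label = [mapping_dict.get(l, -1) for l in labels]
#     print(mapping_label)                                 # [1, -1, 3, 0]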
def backward(self, total_feature, label):
    memory_bank = self.memory_bank
    assert memory_bank.num_local == memory_bank.num_sample, \
        "backward() expects the full class set (no sampling); use backward_sample() otherwise"
    _data = self.get_ndarray2(self.gpu, "data_%d" % self.rank, total_feature)

    # Attach grad
    _data.attach_grad()
    memory_bank.weight.attach_grad()

    # Convert label
    _label = self.get_ndarray2(self.gpu, 'label_%d' % self.rank, label)
    _label = _label - int(self.rank * memory_bank.num_local)
    _fc7, _one_hot = self.fc7_model.forward(_data, memory_bank.weight,
                                            mapping_label=_label,
                                            depth=memory_bank.num_local)

    # Sync max across ranks for a numerically stable softmax
    max_fc7 = nd.max(_fc7, axis=1, keepdims=True)
    max_fc7 = nd.reshape(max_fc7, -1)
    total_max_fc7 = self.get_ndarray(context=self.gpu, name='total_max_fc7',
                                     shape=(max_fc7.shape[0], self.size),
                                     dtype='float32')
    total_max_fc7[:] = 0
    total_max_fc7[:, self.rank] = max_fc7
    hvd.allreduce_(total_max_fc7, average=False)

    global_max_fc7 = self.get_ndarray(context=self.gpu, name='global_max_fc7',
                                      shape=(max_fc7.shape[0], 1),
                                      dtype='float32')
    nd.max(total_max_fc7, axis=1, keepdims=True, out=global_max_fc7)

    # Calculate exp(logits)
    _fc7_grad = nd.broadcast_sub(_fc7, global_max_fc7)
    _fc7_grad = nd.exp(_fc7_grad)

    # Calculate sum
    sum_fc7 = nd.sum(_fc7_grad, axis=1, keepdims=True)
    global_sum_fc7 = hvd.allreduce(sum_fc7, average=False)

    # Calculate prob
    _fc7_grad = nd.broadcast_div(_fc7_grad, global_sum_fc7)

    # Calculate loss
    tmp = _fc7_grad * _one_hot
    tmp = nd.sum(tmp, axis=1, keepdims=True)
    tmp = self.get_ndarray2(self.gpu, 'ctx_loss', tmp)
    tmp = hvd.allreduce(tmp, average=False)
    global_loss = -nd.mean(nd.log(tmp + 1e-30))

    # Calculate fc7 grad
    _fc7_grad = _fc7_grad - _one_hot

    # Backward
    _fc7.backward(out_grad=_fc7_grad)

    # Update center
    _weight_grad = memory_bank.weight.grad
    self.memory_optimizer.update(weight=memory_bank.weight,
                                 grad=_weight_grad,
                                 state=memory_bank.weight_mom,
                                 learning_rate=self.memory_lr)
    return _data.grad, global_loss
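# --- Illustrative sketch (not part of the original code) ---
# Single-device version of the softmax trick both backward() variants use:
# subtract the (all-reduced) max for numerical stability, exponentiate,
# divide by the (all-reduced) sum, and the fc7 gradient is simply p - one_hot,
# the standard softmax cross-entropy gradient. With one rank the two
# allreduces collapse to a local max and sum:
#
#     import mxnet as mx
#     from mxnet import nd
#
#     logits = nd.random.normal(shape=(3, 5))             # stand-in for _fc7
#     one_hot = nd.one_hot(nd.array([1, 4, 2]), depth=5)
#
#     max_l = nd.max(logits, axis=1, keepdims=True)       # "global" max
#     p = nd.exp(nd.broadcast_sub(logits, max_l))
#     p = nd.broadcast_div(p, nd.sum(p, axis=1, keepdims=True))
#
#     loss = -nd.mean(nd.log(nd.sum(p * one_hot, axis=1) + 1e-30))
#     grad = p - one_hot                                  # gradient w.r.t. the logits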
def backward(self, out_grads=None):
    assert self.binded and self.params_initialized
    tmp_ctx = self._ctx_single_gpu
    fc7_outs = []
    ctx_fc7_max = self.get_ndarray(tmp_ctx, 'ctx_fc7_max',
                                   (self._batch_size, len(self._context)))
    arcface_module_outputs = []
    for i, _module in enumerate(self._arcface_modules):
        out = _module.get_outputs(merge_multi_context=True)
        arcface_module_outputs.append(out)
        _fc7 = out[0]
        fc7_outs.append(_fc7)
        _fc7_max = nd.max(_fc7, axis=1).as_in_context(tmp_ctx)
        ctx_fc7_max[:, i] = _fc7_max

    local_fc7_max = self.get_ndarray(tmp_ctx, 'local_fc7_max', (self._batch_size, 1))
    nd.max(ctx_fc7_max, axis=1, keepdims=True, out=local_fc7_max)
    global_fc7_max = local_fc7_max

    local_fc7_sum = self.get_ndarray(tmp_ctx, 'local_fc7_sum', (self._batch_size, 1))
    local_fc7_sum[:, :] = 0.0
    for i, _module in enumerate(self._arcface_modules):
        _max = self.get_ndarray2(fc7_outs[i].context, 'fc7_max', global_fc7_max)
        fc7_outs[i] = nd.broadcast_sub(fc7_outs[i], _max)
        fc7_outs[i] = nd.exp(fc7_outs[i])
        _sum = nd.sum(fc7_outs[i], axis=1, keepdims=True).as_in_context(tmp_ctx)
        local_fc7_sum += _sum
    global_fc7_sum = local_fc7_sum

    if self._iter % self._verbose == 0:
        _ctx = self._ctx_cpu
        _probs = []
        for i, _module in enumerate(self._arcface_modules):
            _prob = self.get_ndarray2(_ctx, '_fc7_prob_%d' % i, fc7_outs[i])
            _probs.append(_prob)
        fc7_prob = self.get_ndarray(_ctx, 'test_fc7_prob',
                                    (self._batch_size,
                                     self._ctx_num_classes * len(self._context)))
        nd.concat(*_probs, dim=1, out=fc7_prob)
        fc7_pred = nd.argmax(fc7_prob, axis=1)
        local_label = self.global_label - self._local_class_start
        _pred = nd.equal(fc7_pred, local_label)
        print('{fc7_acc}', self._iter, nd.mean(_pred).asnumpy()[0])

    fc1_grad_ctx = self._ctx_single_gpu
    local_fc1_grad = self.get_ndarray(fc1_grad_ctx, 'local_fc1_grad',
                                      (self._batch_size, self._emb_size))
    local_fc1_grad[:, :] = 0.0
    total_eloss = []
    celoss_verbose = 1000
    if self._iter % celoss_verbose == 0:
        fc7_celoss = self.get_ndarray(tmp_ctx, 'test_fc7_celoss', (self._batch_size,))
        fc7_celoss[:] = 0.0

    for i, _module in enumerate(self._arcface_modules):
        _sum = self.get_ndarray2(fc7_outs[i].context, 'fc7_sum', global_fc7_sum)
        fc7_outs[i] = nd.broadcast_div(fc7_outs[i], _sum)
        _label = self.global_label - self._ctx_class_start[i]
        _label = self.get_ndarray2(fc7_outs[i].context, 'label', _label)
        onehot_label = self.get_ndarray(fc7_outs[i].context, 'label_onehot',
                                        (self._batch_size, self._ctx_num_classes))
        nd.one_hot(_label, depth=self._ctx_num_classes,
                   on_value=1.0, off_value=0.0, out=onehot_label)
        if self._iter % celoss_verbose == 0:
            _ce_loss = fc7_outs[i] * onehot_label
            _ce_loss = nd.sum(_ce_loss, axis=1)
            fc7_celoss += _ce_loss.as_in_context(tmp_ctx)
        fc7_outs[i] -= onehot_label

        # Seed the extra (auxiliary) loss outputs with all-ones gradients,
        # i.e. backpropagate d(sum(eloss)).
        out = arcface_module_outputs[i]
        out_grads = [fc7_outs[i]]
        for j in range(1, len(out)):
            eloss = out[j]
            egrad_shape = eloss.shape
            egrad = self.get_ndarray(fc7_outs[i].context, 'egrad%d' % j, egrad_shape)
            egrad[:] = 1.0
            out_grads.append(egrad)
            if self._iter % self._verbose == 0:
                total_eloss.append(np.mean(eloss.asnumpy()))

        _module.backward(out_grads=out_grads)
        ctx_fc1_grad = self.get_ndarray2(fc1_grad_ctx, 'ctx_fc1_grad_%d' % i,
                                         _module.get_input_grads()[0])
        local_fc1_grad += ctx_fc1_grad

    if self._iter % self._verbose == 0 and len(total_eloss) > 0:
        print('{eloss}', self._iter, np.mean(total_eloss))
    if self._iter % celoss_verbose == 0:
        ce_loss = nd.log(fc7_celoss) * -1.0
        ce_loss = nd.mean(ce_loss)
        print('CELOSS,%d,%f' % (self._iter, ce_loss.asnumpy()[0]))
    global_fc1_grad = local_fc1_grad
    self._curr_module.backward(out_grads=[global_fc1_grad])
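# --- Illustrative sketch (not part of the original code) ---
# Why egrad is filled with ones: seeding backward() with a ones tensor for
# an auxiliary loss output differentiates sum(eloss), so every element of
# the extra loss contributes with weight 1.
#
#     import mxnet as mx
#     from mxnet import nd, autograd
#
#     x = nd.array([1.0, 2.0, 3.0])
#     x.attach_grad()
#     with autograd.record():
#         eloss = x * x                       # stand-in per-element auxiliary loss
#     eloss.backward(out_grad=nd.ones_like(eloss))
#     print(x.grad)                           # [2. 4. 6.] == d(sum(x^2))/dx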
def ISSM(z, b, F, a, g, sigma, m_prior, S_prior):
    '''
    Kalman filtering for an innovation state space model (ISSM).
    The documentation for this code can be found at:
    https://gluon.mxnet.io/chapter12_time-series/issm-scratch.html
    '''
    H = F.shape[0]  # dim of latent state
    T = z.shape[0]  # num of observations

    eye_h = nd.array(np.eye(H))

    mu_seq = []
    S_seq = []
    log_p_seq = []

    for t in range(T):
        if t == 0:
            # At the first time step, use the prior
            mu_h = m_prior
            S_hh = S_prior
        else:
            # Otherwise compute using the update equations
            F_t = F[:, :, t]
            g_t = g[:, t].reshape((H, 1))
            mu_h = gemm2(F_t, mu_t)
            S_hh = gemm2(F_t, gemm2(S_t, F_t, transpose_b=1)) + \
                   gemm2(g_t, g_t, transpose_b=1)

        a_t = a[:, t].reshape((H, 1))
        mu_v = gemm2(mu_h, a_t, transpose_a=1)

        # Compute the Kalman gain (vector)
        S_hh_x_a_t = gemm2(S_hh, a_t)
        sigma_t = sigma[t]
        S_vv = gemm2(a_t, S_hh_x_a_t, transpose_a=1) + nd.square(sigma_t)
        kalman_gain = nd.broadcast_div(S_hh_x_a_t, S_vv)

        # Compute the error (delta)
        delta = z[t] - b[t] - mu_v

        # Filtered estimates
        mu_t = mu_h + gemm2(kalman_gain, delta)

        # Joseph's symmetrized update for the covariance
        ImKa = nd.broadcast_sub(eye_h, gemm2(kalman_gain, a_t, transpose_b=1))
        S_t = gemm2(gemm2(ImKa, S_hh), ImKa, transpose_b=1) + \
              nd.broadcast_mul(gemm2(kalman_gain, kalman_gain, transpose_b=1),
                               nd.square(sigma_t))

        # Likelihood term
        log_p = -0.5 * (delta * delta / S_vv
                        + np.log(2.0 * np.pi)
                        + nd.log(S_vv))

        mu_seq.append(mu_t)
        S_seq.append(S_t)
        log_p_seq.append(log_p)

    # Return the filtered means and covariances along with the per-step
    # log-likelihood terms. (The original returned only log_p_seq, leaving
    # mu_seq and S_seq unused.)
    return log_p_seq, mu_seq, S_seq
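# --- Illustrative sketch (not part of the original code) ---
# Hypothetical usage of ISSM() for a local-level model (H = 1): the latent
# level carries over each step, alpha scales the innovation, and sigma is
# the observation noise. All values are made up; shapes follow the indexing
# conventions used inside ISSM().
#
#     import numpy as np
#     import mxnet as mx
#     from mxnet import nd
#
#     T, H, alpha, noise = 50, 1, 0.5, 0.3
#     z = nd.array(np.cumsum(0.1 * np.random.randn(T)) + noise * np.random.randn(T))
#     b = nd.zeros(T)                  # no deterministic bias term
#     F = nd.ones((H, H, T))           # transition: level_t = level_{t-1} + innovation
#     a = nd.ones((H, T))              # observation picks out the level
#     g = alpha * nd.ones((H, T))      # innovation coefficients
#     sigma = noise * nd.ones(T)       # observation noise std per step
#     m_prior = nd.zeros((H, 1))
#     S_prior = nd.array(np.eye(H))
#
#     log_p_seq, mu_seq, S_seq = ISSM(z, b, F, a, g, sigma, m_prior, S_prior)
#     neg_log_lik = -nd.add_n(*log_p_seq)   # the training objective in the tutorial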
def backward(self, out_grads=None):
    assert self.binded and self.params_initialized
    tmp_ctx = self._ctx_single_gpu
    fc7_outs = []
    ctx_fc7_max = self.get_ndarray(tmp_ctx, 'ctx_fc7_max',
                                   (self._batch_size, len(self._context)))
    for i, _module in enumerate(self._arcface_modules):
        _fc7 = _module.get_outputs(merge_multi_context=True)[0]
        fc7_outs.append(_fc7)
        _fc7_max = nd.max(_fc7, axis=1).as_in_context(tmp_ctx)
        ctx_fc7_max[:, i] = _fc7_max

    local_fc7_max = self.get_ndarray(tmp_ctx, 'local_fc7_max', (self._batch_size, 1))
    nd.max(ctx_fc7_max, axis=1, keepdims=True, out=local_fc7_max)
    global_fc7_max = local_fc7_max

    local_fc7_sum = self.get_ndarray(tmp_ctx, 'local_fc7_sum', (self._batch_size, 1))
    local_fc7_sum[:, :] = 0.0
    for i, _module in enumerate(self._arcface_modules):
        _max = self.get_ndarray2(fc7_outs[i].context, 'fc7_max', global_fc7_max)
        fc7_outs[i] = nd.broadcast_sub(fc7_outs[i], _max)
        fc7_outs[i] = nd.exp(fc7_outs[i])
        _sum = nd.sum(fc7_outs[i], axis=1, keepdims=True).as_in_context(tmp_ctx)
        local_fc7_sum += _sum
    global_fc7_sum = local_fc7_sum

    if self._iter % self._verbose == 0:
        _ctx = self._ctx_cpu
        _probs = []
        for i, _module in enumerate(self._arcface_modules):
            _prob = self.get_ndarray2(_ctx, '_fc7_prob_%d' % i, fc7_outs[i])
            _probs.append(_prob)
        fc7_prob = self.get_ndarray(_ctx, 'test_fc7_prob',
                                    (self._batch_size,
                                     self._ctx_num_classes * len(self._context)))
        nd.concat(*_probs, dim=1, out=fc7_prob)
        fc7_pred = nd.argmax(fc7_prob, axis=1)
        local_label = self.global_label - self._local_class_start
        _pred = nd.equal(fc7_pred, local_label)
        print('{fc7_acc}', self._iter, nd.mean(_pred).asnumpy()[0])

    fc1_grad_ctx = self._ctx_single_gpu
    local_fc1_grad = self.get_ndarray(fc1_grad_ctx, 'local_fc1_grad',
                                      (self._batch_size, self._emb_size))
    local_fc1_grad[:, :] = 0.0
    loss = nd.zeros(shape=(self._batch_size,), ctx=self._ctx_cpu)

    for i, _module in enumerate(self._arcface_modules):
        _sum = self.get_ndarray2(fc7_outs[i].context, 'fc7_sum', global_fc7_sum)
        fc7_outs[i] = nd.broadcast_div(fc7_outs[i], _sum)
        _label = self.global_label - self._ctx_class_start[i]
        _label = self.get_ndarray2(fc7_outs[i].context, 'label', _label)
        onehot_label = self.get_ndarray(fc7_outs[i].context, 'label_onehot',
                                        (self._batch_size, self._ctx_num_classes))
        nd.one_hot(_label, depth=self._ctx_num_classes,
                   on_value=1.0, off_value=0.0, out=onehot_label)
        # For debugging: accumulate the cross-entropy loss on CPU
        loss -= (mx.nd.sum(mx.nd.log(fc7_outs[i]) * onehot_label,
                           axis=1)).as_in_context(self._ctx_cpu)
        fc7_outs[i] -= onehot_label
        _module.backward(out_grads=[fc7_outs[i]])
        print('for debug, fc7 outs max is ', i, mx.nd.max(fc7_outs[i]))
        print('for debug, fc7 outs min is ', i, mx.nd.min(fc7_outs[i]))
        ctx_fc1_grad = self.get_ndarray2(fc1_grad_ctx, 'ctx_fc1_grad_%d' % i,
                                         _module.get_input_grads()[0])
        local_fc1_grad += ctx_fc1_grad
        print('for debug, ctx fc1_grad max is ', i, mx.nd.max(ctx_fc1_grad))
        print('for debug, ctx fc1 grad shape, ', ctx_fc1_grad.shape)

    global_fc1_grad = local_fc1_grad
    # Gradient clipping is currently disabled:
    # global_fc1_grad = mx.nd.clip(local_fc1_grad, a_min=-15, a_max=15)
    print('for debug, global fc1_grad max is ', mx.nd.max(global_fc1_grad))
    self._curr_module.backward(out_grads=[global_fc1_grad])
    # for debug
    return mx.nd.sum(loss)
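# --- Illustrative sketch (not part of the original code) ---
# The class-sharding trick shared by these backward passes: each shard
# subtracts its class-range start, and labels outside the shard (negative
# or >= depth) produce an all-zero one-hot row, so only the shard that owns
# a label contributes the "- one_hot" term to the gradient. This assumes
# nd.one_hot zeroes out-of-range indices, which is also what the -1 mapping
# labels in backward_sample() rely on.
#
#     import mxnet as mx
#     from mxnet import nd
#
#     ctx_num_classes = 5
#     global_label = nd.array([3, 7, 12])
#     for shard, class_start in enumerate([0, 5, 10]):
#         local = global_label - class_start
#         onehot = nd.one_hot(local, depth=ctx_num_classes)
#         print(shard, onehot.asnumpy())  # each label is non-zero in exactly one shard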