def hybrid_forward(self, F, fts, ys, ftt, yt):
    """
    Semantic Alignment Loss
    :param F: Function
    :param yt: label for the target domain [N]
    :param ftt: features for the target domain [N, K]
    :param ys: label for the source domain [M]
    :param fts: features for the source domain [M, K]
    :return: per-target-sample hinge loss [N]
    """
    if self._fn:
        # Normalize ft
        fts = F.L2Normalization(fts, mode='instance')
        ftt = F.L2Normalization(ftt, mode='instance')
    fts_rpt = F.broadcast_to(fts.expand_dims(axis=0),
                             shape=(self._bs_tgt, self._bs_src, self._embed_size))
    ftt_rpt = F.broadcast_to(ftt.expand_dims(axis=1),
                             shape=(self._bs_tgt, self._bs_src, self._embed_size))

    dists = F.sum(F.square(ftt_rpt - fts_rpt), axis=2)

    yt_rpt = F.broadcast_to(yt.expand_dims(axis=1),
                            shape=(self._bs_tgt, self._bs_src)).astype('int32')
    ys_rpt = F.broadcast_to(ys.expand_dims(axis=0),
                            shape=(self._bs_tgt, self._bs_src)).astype('int32')

    y_same = F.equal(yt_rpt, ys_rpt).astype('float32')
    y_diff = F.not_equal(yt_rpt, ys_rpt).astype('float32')

    intra_cls_dists = dists * y_same
    inter_cls_dists = dists * y_diff

    max_dists = F.max(dists, axis=1, keepdims=True)
    max_dists = F.broadcast_to(max_dists, shape=(self._bs_tgt, self._bs_src))
    revised_inter_cls_dists = F.where(y_same, max_dists, inter_cls_dists)

    max_intra_cls_dist = F.max(intra_cls_dists, axis=1)
    min_inter_cls_dist = F.min(revised_inter_cls_dists, axis=1)

    loss = F.relu(max_intra_cls_dist - min_inter_cls_dist + self._margin)
    return loss
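A minimal shape sketch of the pairwise-distance trick used above, run directly with mx.nd (batch sizes and embedding size here are made up for illustration):

import mxnet as mx
from mxnet import nd

M, N, K = 4, 3, 8  # assumed source batch, target batch, embedding size
fts = nd.L2Normalization(nd.random.normal(shape=(M, K)), mode='instance')
ftt = nd.L2Normalization(nd.random.normal(shape=(N, K)), mode='instance')
# broadcast to (N, M, K) and reduce over K: dists[i, j] = ||ftt[i] - fts[j]||^2
dists = nd.sum(nd.square(ftt.expand_dims(1) - fts.expand_dims(0)), axis=2)
print(dists.shape)  # (3, 4): one squared distance per target/source pair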
def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
    data = in_data[0]
    rois = in_data[1]
    BS, C, H, W = data.shape
    N = rois.shape[0]
    dout = out_grad[0]
    ddata = nd.zeros_like(data)
    rois = rois.asnumpy()
    for i in range(N):
        roi = rois[i]
        batch_id = roi[0].astype(np.int64)
        x1, y1, x2, y2 = roi[1:] * self.spatial_scale
        x1, y1, x2, y2 = np.floor(x1), np.floor(y1), np.ceil(x2), np.ceil(y2)
        x1, y1, x2, y2 = np.clip(x1, 0, W), np.clip(y1, 0, H), np.clip(x2, 0, W), np.clip(y2, 0, H)
        x1, y1, x2, y2 = x1.astype(np.int64), y1.astype(np.int64), x2.astype(np.int64), y2.astype(np.int64)
        if x1 >= x2 or y1 >= y2:
            continue
        h = y2 - y1
        w = x2 - x1
        # (C, h, w)
        roi_data = data[batch_id, :, y1:y2, x1:x2]
        # (h*w, C)
        roi_data = roi_data.reshape((C, -1)).transpose((1, 0))
        # (h*w, C, 1)
        roi_data = roi_data.reshape((0, 0, 1))
        # (h*w, C, C)
        out_product = nd.batch_dot(roi_data, roi_data.transpose((0, 2, 1)))
        # (C, C)
        if self.type == "max":
            reduce_product = nd.max(out_product, axis=0)
            max_mask = out_product == reduce_product
            # max_index = nd.argmax(out_product, axis=0)
            # max_index = max_index.reshape((C * C))
            # d_max = nd.eye(h*w)[max_index].transpose((1, 0)).reshape((h*w, C, C))
            dout_product = nd.stack(*[dout[i] for _ in range(h * w)]) * max_mask
        elif self.type == "mean":
            dout_product = nd.stack(*[dout[i] for _ in range(h * w)]) / (h * w)
        else:
            raise NotImplementedError()
        droi_data = []
        for j in range(C):
            droi_data.append(
                nd.sum(dout_product[:, j, :] * roi_data[:, :, 0], axis=1) +
                nd.sum(dout_product[:, :, j] * roi_data[:, :, 0], axis=1))
        # (h*w, C)
        droi_data = nd.stack(*droi_data, axis=1)
        droi_data = droi_data.transpose((1, 0)).reshape((C, h, w))
        ddata[batch_id, :, y1:y2, x1:x2] = droi_data
    self.assign(in_grad[0], req[0], ddata)
    self.assign(in_grad[1], req[1], nd.zeros_like(in_data[1]))
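The j-loop above implements, for S = x xᵀ with upstream gradient G = dL/dS, the identity dL/dx = (G + Gᵀ) x at every spatial position. A quick NumPy finite-difference check of that identity (C and the random tensors are purely illustrative):

import numpy as np

C = 3
x = np.random.randn(C)
G = np.random.randn(C, C)  # upstream gradient dL/dS for S = np.outer(x, x)
dx = (G + G.T) @ x         # closed form used by the backward loop

def L(v):
    return np.sum(G * np.outer(v, v))

eps = 1e-6
e0 = np.eye(C)[0]
num = (L(x + eps * e0) - L(x - eps * e0)) / (2 * eps)
print(np.allclose(dx[0], num))  # True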
def hybrid_forward(self, F, x):
    x = self.first_conv(x)
    x = self.feature(x)
    x = self.conv_last(x)
    x = self.globalpool(x)
    x = self.LastSE(x)
    x = x.reshape(-1, 1280)
    x = self.fc(x)
    x = self.dropout(x)
    x = self.classifier(x)
    # note: F.max with no axis reduces over all axes to a single scalar
    x = F.max(x)
    return x
def rgb_to_lab(image_srgb, ctx=None):
    if ctx is None:
        raise ValueError("ctx can not be None")
    if image_srgb is None:
        raise ValueError("image_srgb can not be None")

    with mx.Context(ctx):
        srgb = __check_image(image_srgb)
        if nd.max(srgb).asscalar() > 1:
            srgb = __normalize_rgb_image(srgb)
        srgb_pixels = nd.reshape(srgb, [-1, 3])

        linear_mask = nd.cast(srgb_pixels <= 0.04045, dtype='float32')
        exponential_mask = nd.cast(srgb_pixels > 0.04045, dtype='float32')
        rgb_pixels = (srgb_pixels / 12.92 * linear_mask) + \
                     (((srgb_pixels + 0.055) / 1.055) ** 2.4) * exponential_mask

        rgb_to_xyz = nd.array([
            #    X         Y         Z
            [0.412453, 0.212671, 0.019334],  # R
            [0.357580, 0.715160, 0.119193],  # G
            [0.180423, 0.072169, 0.950227],  # B
        ])
        xyz_pixels = nd.linalg_gemm2(rgb_pixels, rgb_to_xyz)

        # https://en.wikipedia.org/wiki/Lab_color_space#CIELAB-CIEXYZ_conversions
        # convert to fx = f(X/Xn), fy = f(Y/Yn), fz = f(Z/Zn)
        # normalize for D65 white point
        xyz_normalized_pixels = nd.multiply(xyz_pixels,
                                            nd.array([1 / 0.950456, 1.0, 1 / 1.088754]))

        epsilon = 6 / 29
        linear_mask = nd.cast(xyz_normalized_pixels <= (epsilon ** 3), dtype='float32')
        exponential_mask = nd.cast(xyz_normalized_pixels > (epsilon ** 3), dtype='float32')
        fxfyfz_pixels = (xyz_normalized_pixels / (3 * epsilon ** 2) + 4 / 29) * linear_mask + \
                        (xyz_normalized_pixels ** (1 / 3)) * exponential_mask

        # convert to lab
        fxfyfz_to_lab = nd.array([
            #   l      a       b
            [0.0,   500.0,    0.0],  # fx
            [116.0, -500.0, 200.0],  # fy
            [0.0,     0.0, -200.0],  # fz
        ])
        lab_pixels = nd.linalg_gemm2(fxfyfz_pixels, fxfyfz_to_lab) + nd.array([-16.0, 0.0, 0.0])

        return nd.reshape(lab_pixels, srgb.shape)
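A hedged smoke test, assuming __check_image merely validates and passes through a well-formed float HxWx3 image: sRGB white should land near L=100, a=0, b=0.

import mxnet as mx
from mxnet import nd

white = nd.ones((1, 1, 3))          # sRGB white, already in [0, 1]
lab = rgb_to_lab(white, ctx=mx.cpu())
print(lab.asnumpy())                # expect approximately [[[100., 0., 0.]]]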
def forward(self, is_train=False):
    """Run forward on the current executor."""
    # self.curr_execgrp.forward(is_train=is_train)
    self.get_each_gpu_label()

    # l2-norm forward
    self.weight_norm = nd.L2Normalization(self.weight, mode='instance')

    # fc forward
    no_bias = True
    if no_bias:
        nd.FullyConnected(data=self.data_batch,
                          weight=self.weight_norm,
                          no_bias=True,
                          num_hidden=self.classes,
                          out=self.fc_output)
    else:
        nd.FullyConnected(data=self.data_batch,
                          weight=self.weight_norm,
                          bias=self.bias,
                          num_hidden=self.classes,
                          out=self.fc_output)

    # margin forward
    self.get_each_gpu_label()
    if self.data_of_cur_gpu.size > 0:
        margin_temp = self.fc_output[self.data_of_cur_gpu, self.label_of_cur_gpu]
        self.pick_fc_of_cur_gpu = margin_temp.copy()
        tem_data = self.margin_loss(self.pick_fc_of_cur_gpu)
        self.fc_output[self.data_of_cur_gpu, self.label_of_cur_gpu] = tem_data[:]
    else:
        self.pick_fc_of_cur_gpu = None

    # softmax forward
    # first allreduce sum
    sum_fc = nd.sum(nd.exp(self.fc_output), axis=1)
    sum_fc = self.allreduce('global_sum_fc', sum_fc)
    assert len(sum_fc) > 0, "rank:{}, sum_fc".format(self.rank)
    self.global_sum_fc[:] = sum_fc[:]
    # second allreduce max
    max_fc = nd.max(self.fc_output, axis=1)
    max_fc = self.allreduce('global_max_fc', max_fc, op=perseus.PerseusOp.Max)
    assert len(max_fc) > 0, "rank:{}, max_fc".format(self.rank)
    self.global_max_fc[:] = max_fc[:]
def _forward_alg(self, feats, lens_):
    batch_size = feats.shape[0]
    tagset_size = feats.shape[2]
    length = feats.shape[1]

    init_alphas = nd.full((self.tagset_size,), -10000.)
    init_alphas[self.tag_dictionary.get_idx_for_item(START_TAG)] = 0.

    forward_var_list = [init_alphas.tile((feats.shape[0], 1))]
    transitions = self.transitions.data().expand_dims(0).tile((feats.shape[0], 1, 1))

    for i in range(feats.shape[1]):
        emit_score = feats[:, i, :]
        tag_var = \
            emit_score.expand_dims(2).tile((1, 1, transitions.shape[2])) + \
            transitions + \
            forward_var_list[i].expand_dims(2).tile(
                (1, 1, transitions.shape[2])).transpose([0, 2, 1])
        max_tag_var = nd.max(tag_var, axis=2)
        new_tag_var = tag_var - max_tag_var.expand_dims(2).tile(
            (1, 1, transitions.shape[2]))
        agg_ = nd.log(nd.sum(nd.exp(new_tag_var), axis=2))
        # nd.full expects a scalar fill value; the recursion result is already
        # an NDArray of shape (batch_size, tagset_size), so append it directly
        forward_var_list.append(max_tag_var + agg_)
        # cloned = forward_var.clone()
        # forward_var[:, i + 1, :] = max_tag_var + agg_
        # forward_var = cloned

    forward_var = nd.stack(*forward_var_list)[
        lens_, nd.array(list(range(feats.shape[0])), dtype='int32'), :]
    terminal_var = forward_var + \
        self.transitions.data()[
            self.tag_dictionary.get_idx_for_item(STOP_TAG)].expand_dims(0).tile(
                (forward_var.shape[0], 1))
    alpha = log_sum_exp_batch(terminal_var)
    return alpha
def make_grid(image_tensor, rows):
    cols = image_tensor.shape[0] // rows
    if image_tensor.ndim == 2:
        image_tensor = image_tensor.reshape(-1, 1, 28, 28)
    if image_tensor.ndim != 4:
        raise ValueError(f"Image tensor has wrong dimension. Expected 4, actual {image_tensor.ndim}")
    n, c, h, w = image_tensor.shape
    # map pixel values from [-1, 1] to [0, 1]
    image_tensor = (image_tensor + 1) / 2
    assert nd.max(image_tensor) <= 1
    assert nd.min(image_tensor) >= 0
    grid = image_tensor.reshape(rows, cols, c, h, w)
    grid = grid.transpose(axes=(0, 3, 1, 4, 2))
    grid = grid.reshape(rows * h, cols * w, c).asnumpy()
    if grid.ndim == 3 and grid.shape[2] == 1:
        grid = grid.squeeze()
    return grid
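A minimal usage sketch: 16 single-channel 28x28 images in [-1, 1], arranged four per row.

from mxnet import nd

batch = nd.random.uniform(-1, 1, shape=(16, 1, 28, 28))
grid = make_grid(batch, rows=4)
print(grid.shape)  # (112, 112): the channel axis is squeezed away for grayscale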
def bilinear_roi_pooling(data, rois, spatial_scale, type="max"):
    """
    :param data: (BS, C, H, W)
    :param rois: (N, 5)
    :param spatial_scale: float
    :param type:
    :return:
    """
    assert isinstance(spatial_scale, float)
    BS, C, H, W = data.shape
    N = rois.shape[0]
    out_data = []
    rois = rois.asnumpy()
    for i in range(N):
        roi = rois[i]
        batch_id = roi[0].astype(np.int64)
        x1, y1, x2, y2 = roi[1:] * spatial_scale
        x1, y1, x2, y2 = np.floor(x1), np.floor(y1), np.ceil(x2), np.ceil(y2)
        x1, y1, x2, y2 = np.clip(x1, 0, W), np.clip(y1, 0, H), np.clip(x2, 0, W), np.clip(y2, 0, H)
        x1, y1, x2, y2 = x1.astype(np.int64), y1.astype(np.int64), x2.astype(np.int64), y2.astype(np.int64)
        if x1 >= x2 or y1 >= y2:
            out_data.append(nd.zeros((C, C), ctx=data.context, dtype=data.dtype))
            continue
        # (C, h, w)
        roi_data = data[batch_id, :, y1:y2, x1:x2]
        # (h*w, C)
        roi_data = roi_data.reshape((C, -1)).transpose((1, 0))
        # (h*w, C, 1)
        roi_data = roi_data.reshape((0, 0, 1))
        # (h*w, C, C)
        out_product = nd.batch_dot(roi_data, roi_data.transpose((0, 2, 1)))
        # (C, C)
        if type == "max":
            reduce_product = nd.max(out_product, axis=0)
        elif type == "mean":
            reduce_product = nd.mean(out_product, axis=0)
        else:
            raise NotImplementedError()
        out_data.append(reduce_product)
    out_data = nd.stack(*out_data)
    return out_data
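A shape walk-through with assumed toy sizes: each RoI yields one C×C second-order (bilinear) feature.

from mxnet import nd

data = nd.random.uniform(shape=(1, 4, 16, 16))  # (BS, C, H, W)
rois = nd.array([[0, 0, 0, 31, 31]])            # (batch_id, x1, y1, x2, y2) in image coords
pooled = bilinear_roi_pooling(data, rois, spatial_scale=0.5, type="max")
print(pooled.shape)  # (1, 4, 4): one CxC matrix per RoI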
def update(self, data, batch_size, episode_num, discount_factor):
    with autograd.record():
        observations = nd.zeros((batch_size, 1, 128, 128))
        actions = nd.zeros(batch_size)
        rewards = nd.zeros_like(actions)
        next_obs = nd.zeros_like(observations)
        dones = nd.zeros_like(actions)
        for i in range(batch_size):
            observations[i] = data[i].obs
            actions[i] = data[i].action
            rewards[i] = data[i].reward
            next_obs[i] = data[i].next_obs
            dones[i] = data[i].done
        actions = actions.reshape((-1, 1))
        rewards = rewards.reshape((-1, 1))
        dones = dones.reshape((-1, 1))
        print('observations:', observations.shape)
        print('actions:', actions.shape)
        print('rewards:', rewards.shape)
        print('next observations:', next_obs.shape)
        print('dones:', dones.shape)
        # dones holds 0/1 flags, so 1 - dones is the logical negation
        # (np.logical_not does not operate on MXNet NDArrays directly)
        not_dones = 1 - dones
        with autograd.predict_mode():
            # reshape to (batch, 1) so the TD target broadcasts elementwise
            next_max_action_values = nd.max(self.model(next_obs), 1).reshape((-1, 1))
        target = rewards + discount_factor * next_max_action_values * not_dones
        del next_max_action_values
        obs_values = self.model(observations)
        obs_actions_values = nd.zeros_like(actions)
        for i in range(len(obs_actions_values)):
            obs_actions_values[i] = obs_values[i][actions[i]]
        del obs_values
        loss = self.loss(obs_actions_values, target)
    loss.backward()
    self.trainer.step(batch_size, True)
    return loss
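For reference, a worked example of the TD target this update computes, target = r + γ·max_a Q(s′, a)·(1 − done), with made-up numbers:

from mxnet import nd

rewards = nd.array([[1.0], [0.5]])
next_q = nd.array([[0.2, 0.8], [0.4, 0.1]])
dones = nd.array([[0], [1]])
target = rewards + 0.9 * nd.max(next_q, 1).reshape((-1, 1)) * (1 - dones)
print(target.asnumpy())  # [[1.72], [0.5]]: a terminal transition keeps only the reward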
def collect(self, name, arr):
    """Callback function for collecting min and max values from an NDArray."""
    name = py_str(name)
    if self.include_layer is not None and not self.include_layer(name):
        return
    handle = ctypes.cast(arr, NDArrayHandle)
    arr = NDArray(handle, writable=False)
    min_range = ndarray.min(arr).asscalar()
    max_range = ndarray.max(arr).asscalar()
    if name in self.min_max_dict:
        cur_min_max = self.min_max_dict[name]
        self.min_max_dict[name] = (min(cur_min_max[0], min_range),
                                   max(cur_min_max[1], max_range))
    else:
        self.min_max_dict[name] = (min_range, max_range)
    if self.logger is not None:
        self.logger.info("Collecting layer %s min_range=%f, max_range=%f"
                         % (name, min_range, max_range))
def _predict_scores_batch(self, sentences: List[Sentence]):
    all_feats, tags, lengths = self.forward(sentences)
    overall_score = 0
    all_tags_seqs = []
    for feats in all_feats:
        # viterbi to get tag_seq
        if self.use_crf:
            score, tag_seq = self.viterbi_decode(feats)
        else:
            # nd.max returns only the maxima; the best-tag indices come from nd.argmax
            score = nd.max(feats, axis=1)
            tag_seq = list(nd.argmax(feats, axis=1).asnumpy())
        # overall_score += score
        all_tags_seqs.extend(tag_seq)
    return overall_score, all_tags_seqs
def _quantize_params(qsym, params, th_dict):
    """Given a quantized symbol and a dict of params that have not been quantized,
    generate quantized params. Currently only supports quantizing the arg_params
    with names of `weight` or `bias`, not aux_params. If `qsym` contains symbols
    that are excluded from being quantized, their corresponding params will not
    be quantized, but saved together with quantized params of the symbols that
    have been quantized.

    Parameters
    ----------
    qsym : Symbol
        Quantized symbol from FP32 symbol.
    params : dict of str->NDArray
    th_dict : dict of min/max pairs of layers' output
    """
    inputs_name = qsym.list_arguments()
    quantized_params = {}
    for name in inputs_name:
        if name.endswith(('weight_quantize', 'bias_quantize')):
            original_name = name[:-len('_quantize')]
            param = params[original_name]
            # pylint: disable=unbalanced-tuple-unpacking
            val, vmin, vmax = ndarray.contrib.quantize(
                data=param,
                min_range=ndarray.min(param),
                max_range=ndarray.max(param),
                out_type='int8')
            quantized_params[name] = val
            quantized_params[name + '_min'] = vmin
            quantized_params[name + '_max'] = vmax
        elif name in params:
            quantized_params[name] = params[name]
        elif name.endswith('_min'):
            output = name[:-len('_min')]
            if output in th_dict:
                quantized_params[name] = ndarray.array([th_dict[output][0]])
        elif name.endswith('_max'):
            # strip the '_max' suffix (the original sliced by len('_min'),
            # which only happened to work because both suffixes are 4 chars)
            output = name[:-len('_max')]
            if output in th_dict:
                quantized_params[name] = ndarray.array([th_dict[output][1]])
    return quantized_params
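For reference, a minimal sketch of the contrib quantization call used above (assuming an MXNet build that ships mxnet.ndarray.contrib.quantize; it returns the quantized tensor plus the float min/max it encodes):

from mxnet import nd

param = nd.array([-1.0, -0.5, 0.0, 0.5, 1.0])
q, qmin, qmax = nd.contrib.quantize(data=param,
                                    min_range=nd.min(param),
                                    max_range=nd.max(param),
                                    out_type='int8')
print(q.asnumpy(), qmin.asscalar(), qmax.asscalar())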
def forward(self, is_train, req, in_data, out_data, aux):
    data = in_data[0]
    rois = in_data[1]
    BS, C, H, W = data.shape
    N = rois.shape[0]
    out = []
    rois = rois.asnumpy()
    for i in range(N):
        roi = rois[i]
        batch_id = roi[0].astype(np.int64)
        x1, y1, x2, y2 = roi[1:] * self.spatial_scale
        x1, y1, x2, y2 = np.floor(x1), np.floor(y1), np.ceil(x2), np.ceil(y2)
        x1, y1, x2, y2 = np.clip(x1, 0, W), np.clip(y1, 0, H), np.clip(x2, 0, W), np.clip(y2, 0, H)
        x1, y1, x2, y2 = x1.astype(np.int64), y1.astype(np.int64), x2.astype(np.int64), y2.astype(np.int64)
        if x1 >= x2 or y1 >= y2:
            out.append(nd.zeros((C, C), ctx=data.context, dtype=data.dtype))
            continue
        # (C, h, w)
        roi_data = data[batch_id, :, y1:y2, x1:x2]
        # (h*w, C)
        roi_data = roi_data.reshape((C, -1)).transpose((1, 0))
        # (h*w, C, 1)
        roi_data = roi_data.reshape((0, 0, 1))
        # (h*w, C, C)
        out_product = nd.batch_dot(roi_data, roi_data.transpose((0, 2, 1)))
        if self.type == "max":
            reduce_product = nd.max(out_product, axis=0)
        elif self.type == "mean":
            reduce_product = nd.mean(out_product, axis=0)
        else:
            raise NotImplementedError()
        out.append(reduce_product)
    out = nd.stack(*out)
    self.assign(out_data[0], req[0], out)
def predict_multi(self, imgs):
    loader = DataLoader(imgs.as_in_context(self.ctx), self.batch_size, last_batch='keep')
    max_sims = []
    labels = []
    features = []
    # normalize the class centers once, outside the batch loop; guarded so a
    # missing center matrix does not crash before the None check below
    cls_center = nd.L2Normalization(self.cls_center) if self.cls_center is not None else None
    for data in loader:
        data_batch = mx.io.DataBatch(data=(data,), pad=self.batch_size - data.shape[0])
        self.model.forward(data_batch, is_train=False)
        embeddings = self.model.get_outputs()[0]
        features.append(embeddings)
        embeddings = nd.L2Normalization(embeddings, mode='instance')
        if self.cls_center is not None:
            temp1 = embeddings.expand_dims(axis=1)
            temp2 = cls_center.expand_dims(axis=0)
            dis_mat = nd.sum(temp1 * temp2, axis=2)
            max_sim = nd.max(dis_mat, axis=1)
            label = nd.argmax(dis_mat, axis=1)
            labels += list(label.asnumpy())
            max_sims += list(max_sim.asnumpy())
        else:
            label = None
    features = nd.concatenate(features, axis=0)
    if self.label_map is not None:
        labels = [self.label_map[int(x)] for x in labels]
    return (max_sims, labels), features
def backward_sample(self, total_feature, label):
    this_rank_classes = int(self.memory_bank.num_sample)
    local_index, unique_sorted_global_label = self.memory_bank.sample(label)

    # Get local index
    _mapping_dict = {}
    local_sampled_class = local_index + self.rank * self.memory_bank.num_local
    global_label_set = set(unique_sorted_global_label)
    for idx, absolute_label in enumerate(local_sampled_class):
        if absolute_label in global_label_set:
            _mapping_dict[absolute_label] = idx + self.rank * self.memory_bank.num_sample

    label_list = list(label.asnumpy())
    mapping_label = []
    for i in range(len(label_list)):
        absolute_label = label_list[i]
        if absolute_label in _mapping_dict.keys():
            mapping_label.append(_mapping_dict[absolute_label])
        else:
            mapping_label.append(-1)
    mapping_label = nd.array(mapping_label, dtype=np.int32)

    # Get weight
    local_index = nd.array(local_index)
    local_index = self.get_ndarray2(self.gpu, "local_index", local_index)
    sample_weight, sample_weight_mom = self.memory_bank.get(local_index)

    # Sync to gpu (the original branched on self.memory_bank.gpu here, but both
    # branches were identical, so the branch is collapsed)
    _data = self.get_ndarray2(self.gpu, "data_%d" % self.rank, total_feature)
    _weight = self.get_ndarray2(self.gpu, 'weight_%d' % self.rank, sample_weight)
    _weight_mom = self.get_ndarray2(self.gpu, 'weight_mom_%d' % self.rank, sample_weight_mom)

    # Attach grad
    _data.attach_grad()
    _weight.attach_grad()

    # Convert label
    _label = self.get_ndarray2(self.gpu, 'mapping_label_%d' % self.rank, mapping_label)
    _label = _label - int(self.rank * self.memory_bank.num_sample)
    _fc7, _one_hot = self.fc7_model.forward(_data, _weight,
                                            mapping_label=_label,
                                            depth=this_rank_classes)

    # Sync max
    max_fc7 = nd.max(_fc7, axis=1, keepdims=True)
    max_fc7 = nd.reshape(max_fc7, -1)
    total_max_fc7 = self.get_ndarray(context=self.gpu, name='total_max_fc7',
                                     shape=(max_fc7.shape[0], self.size), dtype='float32')
    total_max_fc7[:] = 0
    total_max_fc7[:, self.rank] = max_fc7
    hvd.allreduce_(total_max_fc7, average=False)
    global_max_fc7 = self.get_ndarray(context=self.gpu, name='global_max_fc7',
                                      shape=(max_fc7.shape[0], 1), dtype='float32')
    nd.max(total_max_fc7, axis=1, keepdims=True, out=global_max_fc7)

    # Calculate exp(logits)
    _fc7_grad = nd.broadcast_sub(_fc7, global_max_fc7)
    _fc7_grad = nd.exp(_fc7_grad)

    # Calculate sum
    sum_fc7 = nd.sum(_fc7_grad, axis=1, keepdims=True)
    global_sum_fc7 = hvd.allreduce(sum_fc7, average=False)

    # Calculate prob
    _fc7_grad = nd.broadcast_div(_fc7_grad, global_sum_fc7)

    # Calculate loss
    tmp = _fc7_grad * _one_hot
    tmp = nd.sum(tmp, axis=1, keepdims=True)
    tmp = self.get_ndarray2(self.gpu, 'ctx_loss', tmp)
    tmp = hvd.allreduce(tmp, average=False)
    global_loss = -nd.mean(nd.log(tmp + 1e-30))

    _fc7_grad = _fc7_grad - _one_hot

    # Backward
    _fc7.backward(out_grad=_fc7_grad)

    # Update center
    _weight_grad = _weight.grad
    self.memory_optimizer.update(weight=_weight, grad=_weight_grad,
                                 state=_weight_mom, learning_rate=self.memory_lr)
    if self.memory_bank.gpu:
        self.memory_bank.set(index=local_index,
                             updated_weight=_weight,
                             updated_weight_mom=_weight_mom)
    else:
        self.memory_bank.set(index=local_index,
                             updated_weight=self.get_ndarray2(
                                 mx.cpu(), "cpu_weight_%d" % self.rank, _weight),
                             updated_weight_mom=self.get_ndarray2(
                                 mx.cpu(), "cpu_weight_mom_%d" % self.rank, _weight_mom))
    return _data.grad, global_loss
def log_sum_exp_batch(vecs):
    maxi = nd.max(vecs, 1)
    maxi_bc = maxi.expand_dims(1).tile((1, vecs.shape[1]))
    recti_ = nd.log(nd.sum(nd.exp(vecs - maxi_bc), 1))
    return maxi + recti_
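A quick numerical check of the max-shift trick, with values chosen to overflow a naive implementation:

from mxnet import nd

vecs = nd.array([[1000., 1000., 1000.],
                 [-1000., 0., 1000.]])
print(log_sum_exp_batch(vecs).asnumpy())
# naive nd.log(nd.sum(nd.exp(vecs), 1)) overflows to [inf, inf];
# the max-shifted version returns approximately [1001.1, 1000.]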
def test_max():
    print("test max")
    tmp_dir = DIR + "max/"
    # one sub-directory per axis configuration, same flow for each case
    cases = [
        {'axis': [1, 3]},
        {},
        {'axis': [0]},
        {'axis': [2]},
        {'axis': [3]},
        {'axis': [1, 2, 3]},
        {'axis': [0, 1, 2, 3]},
    ]
    for idx, params in enumerate(cases):
        case_dir = tmp_dir + "%d/" % idx
        os.makedirs(case_dir, exist_ok=True)
        shape = np.random.randint(low=2, high=10, size=(4))
        print(shape)
        a = np.random.randint(low=-127, high=127, size=shape)
        np.save(case_dir + "in_0.npy", a.astype("int32"))
        save_dict(params, case_dir + "attr.txt")
        b = nd.max(nd.array(a), **params)
        np.save(case_dir + "out_0.npy", b.asnumpy().astype("int32"))
        # print(b.asnumpy().astype("int32").flatten())
def backward(self, total_feature, label):
    memory_bank = self.memory_bank
    assert memory_bank.num_local == memory_bank.num_sample, "pass"

    _data = self.get_ndarray2(self.gpu, "data_%d" % self.rank, total_feature)

    # Attach grad
    _data.attach_grad()
    memory_bank.weight.attach_grad()

    # Convert label
    _label = self.get_ndarray2(self.gpu, 'label_%d' % self.rank, label)
    _label = _label - int(self.rank * memory_bank.num_local)
    _fc7, _one_hot = self.fc7_model.forward(_data, memory_bank.weight,
                                            mapping_label=_label,
                                            depth=memory_bank.num_local)

    # Sync max
    max_fc7 = nd.max(_fc7, axis=1, keepdims=True)
    max_fc7 = nd.reshape(max_fc7, -1)
    total_max_fc7 = self.get_ndarray(context=self.gpu, name='total_max_fc7',
                                     shape=(max_fc7.shape[0], self.size), dtype='float32')
    total_max_fc7[:] = 0
    total_max_fc7[:, self.rank] = max_fc7
    hvd.allreduce_(total_max_fc7, average=False)
    global_max_fc7 = self.get_ndarray(context=self.gpu, name='global_max_fc7',
                                      shape=(max_fc7.shape[0], 1), dtype='float32')
    nd.max(total_max_fc7, axis=1, keepdims=True, out=global_max_fc7)

    # Calculate exp(logits)
    _fc7_grad = nd.broadcast_sub(_fc7, global_max_fc7)
    _fc7_grad = nd.exp(_fc7_grad)

    # Calculate sum
    sum_fc7 = nd.sum(_fc7_grad, axis=1, keepdims=True)
    global_sum_fc7 = hvd.allreduce(sum_fc7, average=False)

    # Calculate prob
    _fc7_grad = nd.broadcast_div(_fc7_grad, global_sum_fc7)

    # Calculate loss
    tmp = _fc7_grad * _one_hot
    tmp = nd.sum(tmp, axis=1, keepdims=True)
    tmp = self.get_ndarray2(self.gpu, 'ctx_loss', tmp)
    tmp = hvd.allreduce(tmp, average=False)
    global_loss = -nd.mean(nd.log(tmp + 1e-30))

    # Calculate fc7 grad
    _fc7_grad = _fc7_grad - _one_hot

    # Backward
    _fc7.backward(out_grad=_fc7_grad)

    # Update center
    _weight_grad = memory_bank.weight.grad
    self.memory_optimizer.update(weight=memory_bank.weight,
                                 grad=_weight_grad,
                                 state=memory_bank.weight_mom,
                                 learning_rate=self.memory_lr)
    return _data.grad, global_loss
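The max/sum syncs above implement a numerically stable softmax over class shards spread across ranks: each rank subtracts the global row maximum before exponentiating. A single-process sketch of the same arithmetic with two simulated shards (the hvd allreduces replaced by explicit concats):

from mxnet import nd

# two shards of one logit row, as if the classes were split across two ranks
shard0, shard1 = nd.array([[2.0, 1.0]]), nd.array([[3.0, 0.5]])
global_max = nd.max(nd.concat(nd.max(shard0, axis=1, keepdims=True),
                              nd.max(shard1, axis=1, keepdims=True), dim=1),
                    axis=1, keepdims=True)
exp0, exp1 = nd.exp(shard0 - global_max), nd.exp(shard1 - global_max)
denom = nd.sum(exp0, axis=1, keepdims=True) + nd.sum(exp1, axis=1, keepdims=True)
probs = nd.concat(exp0 / denom, exp1 / denom, dim=1)
print(nd.sum(probs, axis=1).asnumpy())  # [1.]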
def forward(self, inputs, loss=None, training=True, commtype='average'):
    assert len(inputs) == self.slots + 1
    if self.non_local_mode:
        return self.forward_non_local(inputs, loss, training)
    if self.message_embedding:
        return self.forward_message_embedding(inputs, loss, training)

    local_drop_vec = nd.ones_like(inputs[0])
    local_drop_vec = self.local_dropout_op(local_drop_vec)
    for i in range(self.slots):
        inputs[i] = inputs[i] * local_drop_vec
    inputs[-1] = self.global_dropout_op(inputs[-1])

    # local_share_vec = []
    # local_private_vec = []
    # if self.concrete_share_rate:
    #     raise ValueError('no share_private!!!')
    #     for i in range(self.slots):
    #         proba = nd.sigmoid(data=self.share_rate[i].data())
    #         proba = nd.broadcast_axis(data=proba, axis=(0, 1), size=inputs[0].shape)
    #         u_vec = nd.random_uniform(low=1e-5, high=1. - 1e-5, shape=inputs[0].shape, ctx=CTX)
    #         local_share_vec.append(nd.sigmoid(10. * (
    #                 nd.log(proba) - nd.log(1. - proba) +
    #                 nd.log(u_vec) - nd.log(1. - u_vec)
    #         )))
    #         local_private_vec.append(1. - local_share_vec[i])
    #         if loss is not None:
    #             loss.append(self.dropout_regularizer *
    #                         nd.sum(proba * nd.log(proba) + (1. - proba) * nd.log(1. - proba)))
    #     if random.random() < 0.01:
    #         for i in range(self.slots):
    #             proba = nd.sigmoid(data=self.share_rate[i].data())
    #             print(proba.asnumpy())
    # else:
    #     local_share_vec = [nd.ones_like(inputs[0]), ] * self.slots
    #     local_private_vec = [nd.zeros_like(inputs[0]), ] * self.slots
    # local_share_vec = (1. - self.private_rate) * nd.Dropout(
    #     nd.ones(shape=(inputs[0].shape[0], self.local_units)), p=self.private_rate, mode='always')
    # local_private_vec = 1. - local_share_vec

    comm_rate = nd.ones(shape=(self.slots + 1, self.slots + 1))
    if self.use_comm and self.topo_learning_mode:
        proba = nd.sigmoid(self.topo.data())
        if random.random() < 1e-2:
            print('---------------------------------------------')
            print(proba.asnumpy())
            print('---------------------------------------------')
        u_vec = nd.random_uniform(low=1e-5, high=1. - 1e-5,
                                  shape=(self.slots + 1, self.slots + 1))
        comm_rate = nd.sigmoid(10. * (
                nd.log(proba) - nd.log(1. - proba) +
                nd.log(u_vec) - nd.log(1. - u_vec)
        ))
        if loss is not None:
            loss.append(4e-4 * nd.sum(proba * nd.log(proba) + (1. - proba) * nd.log(1. - proba)))

    results = []
    for i in range(self.slots):
        results.append(self.local_share_trans.forward(inputs[i], training=training))
    results.append(self.global_trans.forward(inputs[-1], training=training))

    if self.use_comm:
        if self.topo_learning_mode:
            assert self.concrete_share_rate is False
            for i in range(self.slots):
                tmp = nd.zeros_like(results[i])
                norm = nd.zeros_like(comm_rate[0][0])
                for j in range(self.slots):
                    if i != j:
                        tmp = tmp + self.local2local_share_comm(inputs[j], training=training) * comm_rate[j][i]
                        norm = norm + comm_rate[j][i]
                # results[i] = results[i] + self.global2local_comm(inputs[-1]) * comm_rate[-1][i]
                tmp = tmp + self.global2local_comm(inputs[-1], training=training) * comm_rate[-1][i]
                norm = norm + comm_rate[-1][i]
                if nd.sum(norm) > 1e-5:
                    results[i] = results[i] + tmp / norm
            tmp = nd.zeros_like(results[-1])
            norm = nd.zeros_like(comm_rate[0][0])
            for j in range(self.slots):
                tmp = tmp + self.local2global_comm(inputs[j], training=training) * comm_rate[j][-1]
                norm = norm + comm_rate[j][-1]
            if nd.sum(norm) > 1e-5:
                results[-1] = results[-1] + tmp / norm
        else:
            if commtype == 'average':
                for i in range(self.slots):
                    tmp = nd.zeros_like(results[i])
                    for j in range(self.slots):
                        if j != i:
                            tmp = tmp + self.local2local_share_comm.forward(inputs[j], training=training)
                    tmp = tmp + self.global2local_comm.forward(inputs[-1], training=training)
                    results[i] = results[i] + (tmp / float(self.slots))
                tmp = nd.zeros_like(results[-1])
                for i in range(self.slots):
                    tmp = tmp + self.local2global_comm.forward(inputs[i], training=training)
                results[-1] = results[-1] + (tmp / float(self.slots))
            elif commtype == 'maxpooling':
                for i in range(self.slots):
                    tmp = []
                    for j in range(self.slots):
                        if j != i:
                            tmp.append(self.local2local_share_comm.forward(inputs[j], training=training))
                    tmp.append(self.global2local_comm.forward(inputs[-1], training=training))
                    for k in range(len(tmp)):
                        tmp[k] = tmp[k].reshape((tmp[k].shape[0], 1, tmp[k].shape[1]))
                    tmp = nd.concat(*tmp, dim=1)
                    maxcomm = nd.max(tmp, axis=1)
                    results[i] = results[i] + maxcomm
                tmp = []
                for i in range(self.slots):
                    tmp.append(self.local2global_comm.forward(inputs[i], training=training))
                for k in range(len(tmp)):
                    tmp[k] = tmp[k].reshape((tmp[k].shape[0], 1, tmp[k].shape[1]))
                tmp = nd.concat(*tmp, dim=1)
                maxcomm = nd.max(tmp, axis=1)
                results[-1] = results[-1] + maxcomm

    if self.block_mode:
        assert self.local_in_units == self.local_units
        assert self.global_in_units == self.global_units
        for i in range(self.slots):
            results[i] = self.yz_weight_local(results[i], training=training) + inputs[i]
        results[-1] = self.yz_weight_global(results[-1], training=training) + inputs[-1]

    return results
def forward(self, input_vec, loss=None, training=True):
    # print('************* ' + str(input_vec.shape[1]) + ' *************')
    # print('############# ' + str(input_vec.shape) + ' #############')
    assert input_vec.shape[1] == self.input_dimension

    # get inputs for every slot (including global)
    inputs = {}
    for slot in self.slots:
        inputs[slot] = input_vec[:, self.slot_dimension[slot][0]:self.slot_dimension[slot][1]]
    input_global = []
    for seg in self.global_dimension:
        input_global.append(input_vec[:, seg[0]:seg[1]])
    inputs['global'] = nd.concat(*input_global, dim=1)

    layer = []
    # inputs -> first_hidden_layer
    if (not self.sort_input_vec) and self.state_feature != 'dip':
        layer.append([])
        for slot in self.slots:
            layer[0].append(self.input_trans[slot](inputs[slot]))
        layer[0].append(self.input_trans['global'](inputs['global']))
    elif self.state_feature == 'dip':
        sorted_inputs = []
        for slot in self.slots:
            sorted_inputs.append(inputs[slot])
        sorted_inputs.append(inputs['global'])
        layer.append(self.input_trans.forward(sorted_inputs, loss, training=training))
    elif self.sort_input_vec:
        sorted_inputs = []
        for slot in self.slots:
            tmp = inputs[slot][:, :-2].sort(is_ascend=False)
            if tmp.shape[1] < 20:
                tmp = nd.concat(tmp, nd.zeros((tmp.shape[0], 20 - tmp.shape[1]), ctx=CTX), dim=1)
            else:
                tmp = nd.slice_axis(tmp, axis=1, begin=0, end=20)
            sorted_inputs.append(nd.concat(tmp, inputs[slot][:, -2:], dim=1))
        sorted_inputs.append(inputs['global'])
        layer.append(self.input_trans.forward(sorted_inputs, loss, training=training))

    # hidden_layers
    for i in range(self.hidden_layers - 1):
        if self.recurrent_mode is False:
            # equal to 'layer.append(self.ma_trans[i](layer[-1], loss))'
            layer.append(self.ma_trans[i](layer[i], loss))
        else:
            layer.append(self.ma_trans(layer[i], loss))

    if self.share_last_layer is False:
        # dropout of last hidden layer
        for j in range(len(self.slots)):
            layer[-1][j] = self.local_out_drop_op.forward(layer[-1][j])
        layer[-1][-1] = self.global_out_drop_op.forward(layer[-1][-1])

    # last_hidden_layer -> outputs
    outputs = []
    slotv_probs = []
    slotqs = []
    slot_probs = []
    top_decision = []
    for i in range(len(self.slots) + 1):
        if self.use_dueling is False:
            outputs.append(self.output_trans[i](layer[-1][i]))
        else:
            if i < len(self.slots):
                cur_slotv_prob = self.output_trans_local_valueP.forward(layer[-1][i], training=training)
                cur_slotv_prob = nd.softmax(cur_slotv_prob)
            else:
                cur_slotv_prob = self.output_trans_global_valueP.forward(layer[-1][i], training=training)
                cur_slotv_prob = nd.softmax(cur_slotv_prob)
            if self.dueling_share_last:
                if i < len(self.slots):
                    cur_slotq = self.output_trans_local_slotQ.forward(layer[-1][i], training=training)
                    cur_slot_prob = self.output_trans_local_slotP.forward(
                        layer[-1][i], training=training).reshape(-1, 1)
                    cur_slotv_prob = cur_slotv_prob * cur_slot_prob
                    # cur_slot_prob = nd.softmax(cur_slot_prob)
                    if self.shared_last_layer_use_bias:
                        cur_slotq = cur_slotq + nd.slice(self.value_bias_local.data(),
                                                         begin=(i,), end=(i + 1,))
                else:
                    cur_slotq = self.output_trans_global_slotQ.forward(layer[-1][i], training=training)
                    cur_slot_prob = self.output_trans_global_slotP.forward(
                        layer[-1][i], training=training).reshape(-1, 1)
                    cur_slotv_prob = cur_slotv_prob * cur_slot_prob
                    # cur_slot_prob = nd.softmax(cur_slot_prob)
                top_decision.append(cur_slot_prob)
            else:
                cur_slotq = self.output_trans_value[i](layer[-1][i])
            slotv_probs.append(cur_slotv_prob)
            slot_probs.append(cur_slot_prob)
            slotqs.append(cur_slotq)

    # batch_slotv_probs_list = []
    # slot_prob_softmax = nd.softmax(nd.concat(*slot_probs, dim=1))
    # slot_prob_split = nd.split(slot_prob_softmax, axis=1, num_outputs=len(self.slots)+1)
    # assert len(slotv_probs) == len(self.slots)+1
    # for i in range(len(slotv_probs)):
    #     tmp = slot_prob_split[i].reshape(-1,1)*slotv_probs[i]
    #     batch_slotv_probs_list.append(tmp)
    batch_slot_prob = nd.softmax(nd.concat(*slot_probs, dim=1))
    batch_slot_slotq = nd.concat(*slotqs, dim=1)
    batch_slotv_prob = nd.softmax(nd.concat(*slotv_probs, dim=1))
    batch_top_decision = nd.softmax(nd.concat(*top_decision, dim=1))
    # print(batch_slotv_prob)
    # print(batch_slot_prob.shape)
    # print(batch_slot_slotq.shape)
    # print(batch_slotv_prob.shape)
    prob = batch_slotv_prob
    value = nd.max(batch_slot_slotq, axis=1)
    top_decision = batch_top_decision
    # CTname = threading.currentThread().getName()
    # print(CTname + ' top decision is : ')
    # print(top_decision)
    return prob, value, top_decision
def softmax(y_linear):
    # subtract the global maximum for numerical stability before exponentiating
    exp = nd.exp(y_linear - nd.max(y_linear))
    partition = nd.sum(exp, axis=0, exclude=True).reshape((-1, 1))
    return exp / partition
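A small sanity check; note this variant subtracts the single global maximum, so it stays stable as long as all logits are of comparable magnitude (the per-row variant appears in transform_softmax below):

from mxnet import nd

logits = nd.array([[1., 2., 3.],
                   [2., 2., 2.]])
probs = softmax(logits)
print(nd.sum(probs, axis=1).asnumpy())  # [1. 1.]: each row sums to one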
def transform_softmax(x):
    # subtract the row-wise max for numerical stability; the double transpose
    # broadcasts the (N, 1) row sums over the columns
    max_of_dim1 = nd.max(x, axis=1, keepdims=True)
    return (nd.exp(x - max_of_dim1).T /
            nd.exp(x - max_of_dim1).sum(axis=1, keepdims=True).T).T
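Since the row sums are kept as an (N, 1) column, the transposes can be avoided with ordinary broadcasting; an equivalent sketch:

from mxnet import nd

def transform_softmax_direct(x):
    # same result as transform_softmax: (N, K) / (N, 1) broadcasts over columns
    shifted = nd.exp(x - nd.max(x, axis=1, keepdims=True))
    return shifted / shifted.sum(axis=1, keepdims=True)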
def backward(self, out_grads=None):
    # print('in backward')
    assert self.binded and self.params_initialized
    # tmp_ctx = self._ctx_cpu
    tmp_ctx = self._ctx_single_gpu
    fc7_outs = []
    ctx_fc7_max = self.get_ndarray(tmp_ctx, 'ctx_fc7_max',
                                   (self._batch_size, len(self._context)))
    # local_fc7_max = nd.zeros((self.global_label.shape[0], 1), ctx=mx.cpu())
    arcface_module_outputs = []
    for i, _module in enumerate(self._arcface_modules):
        # _fc7 = _module.get_outputs(merge_multi_context=True)[0]
        out = _module.get_outputs(merge_multi_context=True)
        # print(out[0].shape)
        # print(out[1].shape)
        arcface_module_outputs.append(out)
        _fc7 = out[0]
        fc7_outs.append(_fc7)
        _fc7_max = nd.max(_fc7, axis=1).as_in_context(tmp_ctx)
        ctx_fc7_max[:, i] = _fc7_max

    local_fc7_max = self.get_ndarray(tmp_ctx, 'local_fc7_max', (self._batch_size, 1))
    nd.max(ctx_fc7_max, axis=1, keepdims=True, out=local_fc7_max)
    global_fc7_max = local_fc7_max
    # local_fc7_sum = None
    local_fc7_sum = self.get_ndarray(tmp_ctx, 'local_fc7_sum', (self._batch_size, 1))
    local_fc7_sum[:, :] = 0.0
    for i, _module in enumerate(self._arcface_modules):
        _max = self.get_ndarray2(fc7_outs[i].context, 'fc7_max', global_fc7_max)
        fc7_outs[i] = nd.broadcast_sub(fc7_outs[i], _max)
        fc7_outs[i] = nd.exp(fc7_outs[i])
        _sum = nd.sum(fc7_outs[i], axis=1, keepdims=True).as_in_context(tmp_ctx)
        local_fc7_sum += _sum
    global_fc7_sum = local_fc7_sum

    if self._iter % self._verbose == 0:
        # _ctx = self._context[-1]
        _ctx = self._ctx_cpu
        _probs = []
        for i, _module in enumerate(self._arcface_modules):
            _prob = self.get_ndarray2(_ctx, '_fc7_prob_%d' % i, fc7_outs[i])
            _probs.append(_prob)
        fc7_prob = self.get_ndarray(_ctx, 'test_fc7_prob',
                                    (self._batch_size, self._ctx_num_classes * len(self._context)))
        nd.concat(*_probs, dim=1, out=fc7_prob)
        fc7_pred = nd.argmax(fc7_prob, axis=1)
        local_label = self.global_label - self._local_class_start
        # local_label = self.get_ndarray2(_ctx, 'test_label', local_label)
        _pred = nd.equal(fc7_pred, local_label)
        print('{fc7_acc}', self._iter, nd.mean(_pred).asnumpy()[0])

    # local_fc1_grad = []
    # fc1_grad_ctx = self._ctx_cpu
    fc1_grad_ctx = self._ctx_single_gpu
    local_fc1_grad = self.get_ndarray(fc1_grad_ctx, 'local_fc1_grad',
                                      (self._batch_size, self._emb_size))
    local_fc1_grad[:, :] = 0.0
    total_eloss = []
    celoss_verbose = 1000
    if self._iter % celoss_verbose == 0:
        fc7_celoss = self.get_ndarray(tmp_ctx, 'test_fc7_celoss', (self._batch_size,))
        fc7_celoss[:] = 0.0

    for i, _module in enumerate(self._arcface_modules):
        _sum = self.get_ndarray2(fc7_outs[i].context, 'fc7_sum', global_fc7_sum)
        fc7_outs[i] = nd.broadcast_div(fc7_outs[i], _sum)
        a = i * self._ctx_num_classes
        b = (i + 1) * self._ctx_num_classes
        _label = self.global_label - self._ctx_class_start[i]
        _label = self.get_ndarray2(fc7_outs[i].context, 'label', _label)
        onehot_label = self.get_ndarray(fc7_outs[i].context, 'label_onehot',
                                        (self._batch_size, self._ctx_num_classes))
        nd.one_hot(_label, depth=self._ctx_num_classes, on_value=1.0, off_value=0.0,
                   out=onehot_label)
        # print(fc7_outs[i].shape, onehot_label.shape)
        if self._iter % celoss_verbose == 0:
            _ce_loss = fc7_outs[i] * onehot_label
            _ce_loss = nd.sum(_ce_loss, axis=1)
            fc7_celoss += _ce_loss.as_in_context(tmp_ctx)
        fc7_outs[i] -= onehot_label

        out = arcface_module_outputs[i]
        out_grads = [fc7_outs[i]]
        for j in range(1, len(out)):
            eloss = out[j]
            # print('eloss%d:' % j, eloss.shape)
            # print(out_grads[0].shape)
            # egrad_shape = (out_grads[0].shape[0], eloss.shape[0])
            egrad_shape = eloss.shape
            egrad = self.get_ndarray(fc7_outs[i].context, 'egrad%d' % j, egrad_shape)
            # egrad[:][:] = 1.0 / egrad_shape[0]
            egrad[:][:] = 1.0
            out_grads.append(egrad)
            if self._iter % self._verbose == 0:
                total_eloss.append(np.mean(eloss.asnumpy()))

        _module.backward(out_grads=out_grads)
        # ctx_fc1_grad = _module.get_input_grads()[0].as_in_context(mx.cpu())
        ctx_fc1_grad = self.get_ndarray2(fc1_grad_ctx, 'ctx_fc1_grad_%d' % i,
                                         _module.get_input_grads()[0])
        local_fc1_grad += ctx_fc1_grad

    if self._iter % self._verbose == 0 and len(total_eloss) > 0:
        print('{eloss}', self._iter, np.mean(total_eloss))
    # if self._iter % self._verbose == 0:
    if self._iter % celoss_verbose == 0:
        ce_loss = nd.log(fc7_celoss) * -1.0
        ce_loss = nd.mean(ce_loss)
        print('CELOSS,%d,%f' % (self._iter, ce_loss.asnumpy()))
    global_fc1_grad = local_fc1_grad
    self._curr_module.backward(out_grads=[global_fc1_grad])
def forward(self, inputs, loss=None, training=True, commtype='average', topo='FC'):
    assert len(inputs) == self.slots + 1

    local_drop_vec = nd.ones_like(inputs[0])
    local_drop_vec = self.local_dropout_op(local_drop_vec)
    for i in range(self.slots):
        inputs[i] = inputs[i] * local_drop_vec
    inputs[-1] = self.global_dropout_op(inputs[-1])

    if topo == 'FC':
        comm_rate = nd.ones(shape=(self.slots + 1, self.slots + 1))
    elif topo == 'FUC':
        comm_rate = nd.zeros(shape=(self.slots + 1, self.slots + 1))
    elif topo == 'Master':
        comm_rate = nd.ones(shape=(self.slots + 1, self.slots + 1))
        for i in range(self.slots):
            for j in range(self.slots):
                comm_rate[i][j] = 0

    if self.use_comm and self.topo_learning_mode:
        proba = nd.sigmoid(self.topo.data())
        if random.random() < 1e-2:
            print('---------------------------------------------')
            print(proba.asnumpy())
            print('---------------------------------------------')
        u_vec = nd.random_uniform(low=1e-5, high=1. - 1e-5,
                                  shape=(self.slots + 1, self.slots + 1))
        comm_rate = nd.sigmoid(10. * (
                nd.log(proba) - nd.log(1. - proba) +
                nd.log(u_vec) - nd.log(1. - u_vec)
        ))
        if loss is not None:
            loss.append(4e-4 * nd.sum(proba * nd.log(proba) + (1. - proba) * nd.log(1. - proba)))

    results = []
    for i in range(self.slots):
        results.append(self.local_share_trans.forward(inputs[i], training=training))
    results.append(self.global_trans.forward(inputs[-1], training=training))

    if commtype == 'average':
        for i in range(self.slots):
            tmp = nd.zeros_like(results[i])
            norm = nd.zeros_like(comm_rate[0][0])
            for j in range(self.slots):
                if i != j:
                    tmp = tmp + self.local2local_share_comm.forward(
                        nd.concat(inputs[j], dim=1), training=training) * comm_rate[j][i]
                    norm = norm + comm_rate[j][i]
            # results[i] = results[i] + self.global2local_comm(inputs[-1]) * comm_rate[-1][i]
            tmp = tmp + self.global2local_comm.forward(
                nd.concat(inputs[-1], dim=1), training=training) * comm_rate[-1][i]
            norm = norm + comm_rate[-1][i]
            if nd.sum(norm) > 1e-5:
                results[i] = results[i] + tmp / norm
        tmp = nd.zeros_like(results[-1])
        norm = nd.zeros_like(comm_rate[0][0])
        for j in range(self.slots):
            tmp = tmp + self.local2global_comm.forward(
                nd.concat(inputs[j], dim=1), training=training) * comm_rate[j][-1]
            norm = norm + comm_rate[j][-1]
        if nd.sum(norm) > 1e-5:
            results[-1] = results[-1] + tmp / norm
    elif commtype == 'maxpooling':
        for i in range(self.slots):
            tmp = []
            for j in range(self.slots):
                if j != i:
                    tmp.append(self.local2local_share_comm.forward(inputs[j], training=training))
            tmp.append(self.global2local_comm.forward(inputs[-1], training=training))
            for k in range(len(tmp)):
                tmp[k] = tmp[k].reshape((tmp[k].shape[0], 1, tmp[k].shape[1]))
            tmp = nd.concat(*tmp, dim=1)
            maxcomm = nd.max(tmp, axis=1)
            results[i] = results[i] + maxcomm
        tmp = []
        for i in range(self.slots):
            tmp.append(self.local2global_comm.forward(inputs[i], training=training))
        for k in range(len(tmp)):
            tmp[k] = tmp[k].reshape((tmp[k].shape[0], 1, tmp[k].shape[1]))
        tmp = nd.concat(*tmp, dim=1)
        maxcomm = nd.max(tmp, axis=1)
        results[-1] = results[-1] + maxcomm

    return results
def forward(self, input_vec, loss=None, training=True):
    assert input_vec.shape[1] == self.input_dimension

    # get inputs for every slot (including global)
    inputs = {}
    for slot in self.slots:
        inputs[slot] = input_vec[:, self.slot_dimension[slot][0]:self.slot_dimension[slot][1]]
    input_global = []
    for seg in self.global_dimension:
        input_global.append(input_vec[:, seg[0]:seg[1]])
    inputs['global'] = nd.concat(*input_global, dim=1)

    layer = []
    # inputs -> first_hidden_layer
    sorted_inputs = []
    for slot in self.slots:
        sorted_inputs.append(inputs[slot])
    sorted_inputs.append(inputs['global'])
    layer.append(self.input_trans.forward(sorted_inputs, loss, training=training))

    # hidden_layers
    for i in range(self.hidden_layers - 1):
        layer.append(self.ma_trans[i](layer[i], loss))

    if self.share_last_layer is False:
        # dropout of last hidden layer
        for j in range(len(self.slots)):
            layer[-1][j] = self.local_out_drop_op.forward(layer[-1][j])
        layer[-1][-1] = self.global_out_drop_op.forward(layer[-1][-1])

    # last_hidden_layer -> outputs
    slotv_probs = []
    slotqs = []
    slot_probs = []
    top_decision = []
    for i in range(len(self.slots) + 1):
        if i < len(self.slots):
            cur_slotv_prob = self.output_trans_local_valueP.forward(layer[-1][i], training=training)
        else:
            cur_slotv_prob = self.output_trans_global_valueP.forward(layer[-1][i], training=training)
        cur_slotv_prob_adv = cur_slotv_prob - nd.max(cur_slotv_prob, axis=1, keepdims=True)
        if i < len(self.slots):
            cur_slotq = self.output_trans_local_slotQ.forward(layer[-1][i], training=training)
            cur_slot_prob = self.output_trans_local_slotP.forward(
                layer[-1][i], training=training).reshape(-1, 1)
            if self.shared_last_layer_use_bias:
                cur_slotq = cur_slotq + nd.slice(self.value_bias_local.data(),
                                                 begin=(i,), end=(i + 1,))
        else:
            cur_slotq = self.output_trans_global_slotQ.forward(layer[-1][i], training=training)
            cur_slot_prob = self.output_trans_global_slotP.forward(
                layer[-1][i], training=training).reshape(-1, 1)
        cur_slotv_prob = cur_slot_prob + cur_slotv_prob_adv
        top_decision.append(cur_slot_prob)
        slotv_probs.append(cur_slotv_prob)
        slot_probs.append(cur_slot_prob)
        slotqs.append(cur_slotq)

    batch_slot_slotq = nd.concat(*slotqs, dim=1)
    batch_slotv_prob = nd.softmax(nd.concat(*slotv_probs, dim=1))
    batch_top_decision = nd.softmax(nd.concat(*top_decision, dim=1))

    prob = batch_slotv_prob
    value = nd.sum(batch_top_decision * batch_slot_slotq, axis=1)
    top_decision = batch_top_decision
    return prob, value, top_decision
def backward(self, out_grads=None):
    # print('in backward')
    assert self.binded and self.params_initialized

    ## ============= forward classifier layer ===========
    fc7_outs = []
    for i, _module in enumerate(self._arcface_modules):
        _fc7 = _module.get_outputs(merge_multi_context=True)[0]
        fc7_outs.append(_fc7)

    ctx_max = map(
        lambda fc7_out: nd.max(fc7_out, axis=1, keepdims=True).as_in_context(self._ctx_single_gpu),
        fc7_outs)
    local_fc7_max = nd.max(nd.concat(*ctx_max, dim=1), axis=1, keepdims=True)
    fc7_exps = list(
        map(lambda fc7_out: nd.exp(fc7_out - local_fc7_max.as_in_context(fc7_out.context)),
            fc7_outs))
    ctx_sum = map(
        lambda fc7_out: nd.sum(fc7_out, axis=1, keepdims=True).as_in_context(self._ctx_single_gpu),
        fc7_exps)
    exp_sum = nd.sum(nd.concat(*ctx_sum, dim=1), axis=1, keepdims=True)
    softmax_outs = list(
        map(lambda fc7_exp: nd.broadcast_div(fc7_exp, exp_sum.as_in_context(fc7_exp.context)),
            fc7_exps))

    onehot_device_labels = [
        nd.one_hot((self.global_label).as_in_context(device) - self._ctx_class_start[i],
                   depth=self._ctx_num_classes,
                   on_value=1.0,
                   off_value=0.0) for i, device in enumerate(self._context)
    ]

    ## ============= verbose train accuracy and loss ===========
    if self._iter % self._verbose == 0:
        local_label = self.global_label - self._local_class_start
        fc7_pred = self.parall_argmax(softmax_outs, self._ctx_single_gpu)
        _pred = nd.equal(fc7_pred, local_label).asnumpy()[0]
        loss = self.parall_loss(softmax_outs, onehot_device_labels,
                                self._ctx_single_gpu).asscalar()
        assert not math.isnan(loss)
        self.logger.info('[Iter {}] train acc : {}, total loss : {}'.format(
            self._iter, np.mean(_pred), loss))

    ## ============= backward large weight classifier layer with gradient ===========
    local_fc1_grad = self.get_ndarray_by_shape(self._ctx_single_gpu, 'local_fc1_grad',
                                               (self._batch_size, self._emb_size))
    local_fc1_grad[:, :] = 0.0
    for i, _module in enumerate(self._arcface_modules):
        _module.backward(out_grads=[softmax_outs[i] - onehot_device_labels[i]])
        ctx_fc1_grad = self.get_ndarray_by_v_arr(self._ctx_single_gpu, 'ctx_fc1_grad_%d' % i,
                                                 _module.get_input_grads()[0])
        local_fc1_grad += ctx_fc1_grad

    ## ============= backward backbone ===============
    global_fc1_grad = local_fc1_grad
    self._backbone_module.backward(out_grads=[global_fc1_grad])
def log_sum_exp(vec):
    max_score = nd.max(vec).asscalar()
    return nd.log(nd.sum(nd.exp(vec - max_score))) + max_score
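The same stability check as the batch version, on a single vector:

from mxnet import nd

vec = nd.array([1000., 1000., 1000.])
print(log_sum_exp(vec))  # ~1001.1, whereas nd.log(nd.sum(nd.exp(vec))) returns inf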
def hybrid_forward(self, F, preds, label):
    label = label.astype('float32')
    dist = F.sqrt(F.sum(F.square(preds), axis=1))
    # the hinge needs an elementwise max against zero; F.max(x, 0) would
    # reduce along axis 0 instead, so use F.maximum here
    return label * F.square(dist) + \
        (1 - label) * F.square(F.maximum(self._m - dist, 0))
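A hedged worked example of this contrastive loss with mx.nd, where preds are assumed to be difference vectors between pair embeddings, label is 1 for similar and 0 for dissimilar pairs, and the margin self._m is taken as 1.0:

from mxnet import nd

m = 1.0
preds = nd.array([[0.3, 0.4], [0.0, 2.0]])
label = nd.array([1, 0])
dist = nd.sqrt(nd.sum(nd.square(preds), axis=1))  # [0.5, 2.0]
loss = label * nd.square(dist) + (1 - label) * nd.square(nd.maximum(m - dist, 0))
print(loss.asnumpy())  # [0.25, 0.]: the dissimilar pair is already beyond the margin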
def backward(self, out_grads=None):
    # print('in backward')
    assert self.binded and self.params_initialized
    # tmp_ctx = self._ctx_cpu
    tmp_ctx = self._ctx_single_gpu
    fc7_outs = []
    ctx_fc7_max = self.get_ndarray(tmp_ctx, 'ctx_fc7_max',
                                   (self._batch_size, len(self._context)))
    # local_fc7_max = nd.zeros((self.global_label.shape[0], 1), ctx=mx.cpu())
    for i, _module in enumerate(self._arcface_modules):
        _fc7 = _module.get_outputs(merge_multi_context=True)[0]
        fc7_outs.append(_fc7)
        _fc7_max = nd.max(_fc7, axis=1).as_in_context(tmp_ctx)
        ctx_fc7_max[:, i] = _fc7_max

    local_fc7_max = self.get_ndarray(tmp_ctx, 'local_fc7_max', (self._batch_size, 1))
    nd.max(ctx_fc7_max, axis=1, keepdims=True, out=local_fc7_max)
    global_fc7_max = local_fc7_max
    # local_fc7_sum = None
    local_fc7_sum = self.get_ndarray(tmp_ctx, 'local_fc7_sum', (self._batch_size, 1))
    local_fc7_sum[:, :] = 0.0
    for i, _module in enumerate(self._arcface_modules):
        _max = self.get_ndarray2(fc7_outs[i].context, 'fc7_max', global_fc7_max)
        fc7_outs[i] = nd.broadcast_sub(fc7_outs[i], _max)
        fc7_outs[i] = nd.exp(fc7_outs[i])
        _sum = nd.sum(fc7_outs[i], axis=1, keepdims=True).as_in_context(tmp_ctx)
        local_fc7_sum += _sum
    global_fc7_sum = local_fc7_sum

    if self._iter % self._verbose == 0:
        # _ctx = self._context[-1]
        _ctx = self._ctx_cpu
        _probs = []
        for i, _module in enumerate(self._arcface_modules):
            _prob = self.get_ndarray2(_ctx, '_fc7_prob_%d' % i, fc7_outs[i])
            _probs.append(_prob)
        fc7_prob = self.get_ndarray(_ctx, 'test_fc7_prob',
                                    (self._batch_size, self._ctx_num_classes * len(self._context)))
        nd.concat(*_probs, dim=1, out=fc7_prob)
        fc7_pred = nd.argmax(fc7_prob, axis=1)
        local_label = self.global_label - self._local_class_start
        # local_label = self.get_ndarray2(_ctx, 'test_label', local_label)
        _pred = nd.equal(fc7_pred, local_label)
        print('{fc7_acc}', self._iter, nd.mean(_pred).asnumpy()[0])

    # local_fc1_grad = []
    # fc1_grad_ctx = self._ctx_cpu
    fc1_grad_ctx = self._ctx_single_gpu
    local_fc1_grad = self.get_ndarray(fc1_grad_ctx, 'local_fc1_grad',
                                      (self._batch_size, self._emb_size))
    local_fc1_grad[:, :] = 0.0
    loss = nd.zeros(shape=(self._batch_size), ctx=self._ctx_cpu)

    for i, _module in enumerate(self._arcface_modules):
        _sum = self.get_ndarray2(fc7_outs[i].context, 'fc7_sum', global_fc7_sum)
        fc7_outs[i] = nd.broadcast_div(fc7_outs[i], _sum)
        a = i * self._ctx_num_classes
        b = (i + 1) * self._ctx_num_classes
        _label = self.global_label - self._ctx_class_start[i]
        _label = self.get_ndarray2(fc7_outs[i].context, 'label', _label)
        onehot_label = self.get_ndarray(fc7_outs[i].context, 'label_onehot',
                                        (self._batch_size, self._ctx_num_classes))
        nd.one_hot(_label, depth=self._ctx_num_classes, on_value=1.0, off_value=0.0,
                   out=onehot_label)
        # for debug
        loss -= (mx.nd.sum(mx.nd.log(fc7_outs[i]) * onehot_label,
                           axis=1)).as_in_context(self._ctx_cpu)
        fc7_outs[i] -= onehot_label
        _module.backward(out_grads=[fc7_outs[i]])
        print('for debug, fc7 outs max is ', i, mx.nd.max(fc7_outs[i]))
        print('for debug, fc7 outs min is ', i, mx.nd.min(fc7_outs[i]))
        # ctx_fc1_grad = _module.get_input_grads()[0].as_in_context(mx.cpu())
        ctx_fc1_grad = self.get_ndarray2(fc1_grad_ctx, 'ctx_fc1_grad_%d' % i,
                                         _module.get_input_grads()[0])
        local_fc1_grad += ctx_fc1_grad
        print('for debug, global fc1_grad max is ', i, mx.nd.max(ctx_fc1_grad))
        print('for debug, ctx fc1 grad shape, ', ctx_fc1_grad.shape)

    global_fc1_grad = local_fc1_grad
    # global_fc1_grad = mx.nd.clip(local_fc1_grad, a_min=-15, a_max=15)
    print('for debug, after clip global fc1_grad max is ', mx.nd.max(global_fc1_grad))
    self._curr_module.backward(out_grads=[global_fc1_grad])

    # for debug
    return mx.nd.sum(loss)
def max(input, dim):
    # PyTorch-style wrapper: exposes a dim argument over nd.max's axis
    # (note that this shadows the builtin max in its module)
    return nd.max(input, axis=dim)
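Usage note for the wrapper above:

from mxnet import nd

x = nd.array([[1., 5.], [4., 2.]])
print(max(x, 0).asnumpy())  # [4. 5.]: column-wise maxima via the dim argument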