def _forward_alg(self, feats):
    # Do the forward algorithm to compute the partition function
    alphas = [[0.] * self.tagset_size]
    alphas[0][self.tag2idx[START_TAG]] = 1
    alphas = nd.array(alphas)

    # Iterate through the sentence
    for feat in feats:
        alphas_t = []  # The forward variables at this timestep
        for next_tag in range(self.tagset_size):
            # broadcast the emission score: it is the same regardless of
            # the previous tag
            emit_score = feat[next_tag].reshape((1, -1))
            # the ith entry of trans_score is the score of transitioning to
            # next_tag from i
            trans_score = self.transitions.data()[next_tag].reshape((1, -1))
            # The ith entry of next_tag_var is the value for the
            # edge (i -> next_tag) before we do log-sum-exp
            next_tag_var = alphas * nd.exp(trans_score + emit_score)
            # The forward variable for this tag is log-sum-exp of all the
            # scores.
            alphas_t.append(nsum(next_tag_var))
        alphas = nd.concat(*alphas_t, dim=0).reshape((1, -1))
    terminal_var = alphas * nd.exp(
        self.transitions.data()[self.tag2idx[STOP_TAG]])
    alpha = log_sum(terminal_var)
    return alpha
def regression_student_grad(student_outputs, teacher_pred,
                            teacher_noise_precision):
    student_mean = student_outputs[0]
    student_var = student_outputs[1]
    grad_mean = nd.exp(-student_var) * (student_mean - teacher_pred)
    grad_var = (1 - nd.exp(-student_var) *
                (nd.square(student_mean - teacher_pred) +
                 1.0 / teacher_noise_precision)) / 2
    return [grad_mean, grad_var]
def check_unary_func(x):
    f_exp = lambda x: nd.exp(x)
    f_exp_grad = lambda x: [nd.exp(x)]
    autograd_assert(x, func=f_exp, grad_func=f_exp_grad)
    f_half = lambda x: x/2
    f_half_grad = lambda x: [nd.ones(x.shape) * 0.5]
    autograd_assert(x, func=f_half, grad_func=f_half_grad)
    f_square = lambda x: x**2
    f_square_grad = lambda x: [2*x]
    autograd_assert(x, func=f_square, grad_func=f_square_grad)
def check_unary_func(x):
    f_exp = lambda x: nd.exp(x)
    f_exp_grad = lambda x: [nd.exp(x)]
    autograd_assert(x, func=f_exp, grad_func=f_exp_grad)
    f_half = lambda x: x / 2
    f_half_grad = lambda x: [nd.ones(x.shape) * 0.5]
    autograd_assert(x, func=f_half, grad_func=f_half_grad)
    f_square = lambda x: x**2
    f_square_grad = lambda x: [2 * x]
    autograd_assert(x, func=f_square, grad_func=f_square_grad)
def test_unary_func():
    x = nd.uniform(shape=(4, 5))
    f_exp = lambda x: nd.exp(x)
    f_exp_grad = lambda x: [nd.exp(x)]
    autograd_assert(x, func=f_exp, grad_func=f_exp_grad)
    f_half = lambda x: x/2
    f_half_grad = lambda x: [nd.ones(x.shape) * 0.5]
    autograd_assert(x, func=f_half, grad_func=f_half_grad)
    f_square = lambda x: x**2
    f_square_grad = lambda x: [2*x]
    autograd_assert(x, func=f_square, grad_func=f_square_grad)
def regression_student_grad(student_outputs, teacher_pred,
                            teacher_noise_precision):
    student_mean = student_outputs[0]
    student_var = student_outputs[1]
    grad_mean = nd.exp(-student_var) * (student_mean - teacher_pred)
    grad_var = (1 - nd.exp(-student_var) *
                (nd.square(student_mean - teacher_pred) +
                 1.0 / teacher_noise_precision)) / 2
    # print student_mean
    # print teacher_pred
    # print grad_mean.asnumpy(), grad_var.asnumpy()
    # ch = raw_input()
    return [grad_mean, grad_var]
def inference_g(self, observed_arr):
    '''
    Inference with generator.

    Args:
        observed_arr:   `mxnet.ndarray` of observed data points.

    Returns:
        Tuple data.
        - re-parametric data.
        - encoded data points.
        - re-encoded data points.
    '''
    encoded_arr = self.model.encoder(observed_arr)
    decoded_arr = self.model.decoder(encoded_arr)
    re_encoded_arr = self.re_encoder_model(decoded_arr)

    anomaly_arr = nd.square(encoded_arr - re_encoded_arr)
    anomaly_arr = nd.expand_dims(nd.exp(anomaly_arr.mean(axis=1)), axis=1)
    mean_arr = nd.expand_dims(decoded_arr.mean(axis=1), axis=1)
    gauss_arr = nd.random.normal_like(data=observed_arr, loc=0, scale=3.0)

    re_param_arr = mean_arr + (gauss_arr * anomaly_arr)
    kl_arr = -0.5 * (1 + nd.log(anomaly_arr) - mean_arr + anomaly_arr)
    re_param_arr = re_param_arr + kl_arr

    return re_param_arr, encoded_arr, re_encoded_arr
def my_loss(data, nc, ns, nq):
    data = data.astype('float64')
    cls_data = nd.reshape(data[0:nc * ns], (nc, ns, -1))
    cls_center = nd.mean(cls_data, axis=1) + 1e-10
    data_center_dis = nd.norm(data[nc * ns:].expand_dims(axis=1) -
                              cls_center.expand_dims(axis=0),
                              axis=2)**2

    weight = nd.zeros((nc * nq, nc), ctx=data.context, dtype='float64')
    for i in range(0, nc):
        weight[i * nq:i * nq + nq, i] = 1
    weight2 = 1 - weight

    temp1 = nd.log_softmax(-data_center_dis, axis=1)
    temp2 = nd.sum(temp1, axis=1)
    temp3 = nd.sum(-temp2)
    label = nd.argmin(data_center_dis, axis=1)
    return temp3 / (nc * nq), label

    # NOTE: everything below is unreachable because of the return above;
    # it is kept as it appears in the original source.
    loss1 = nd.sum(data_center_dis * weight)
    temp = nd.sum(nd.exp(-data_center_dis), axis=1)
    loss2 = nd.sum(nd.log(temp))
    if loss1 is np.nan or loss2 is np.nan:
        raise StopIteration
    return (loss1 + loss2) / (nc * nq), label
def forward(self, signal: nd.NDArray, teacher_forcing_prob: float,
            latent_space_override: nd.NDArray = None):
    """
    Args:
        signal: Sin signal (m, signal_length), m - num of signals (batch_size)
        teacher_forcing_prob: The probability of activating the teacher forcing
        latent_space_override: The override value for the latent space.

    Returns:
    """
    sig_embedding = self.encoder(signal)  # (m,s), s - dim of the encoder embedding

    # Posterior of the latent space.
    # Gaussian variance must be positive, therefore using log variance parametrization.
    ls_mean, ls_log_var = self.latent_space(sig_embedding).split(axis=1, num_outputs=2)
    ls_std = nd.exp(ls_log_var * 0.5, axis=0)

    # Sampling from the unit gaussian instead of sampling from the latent space posterior
    # allows for gradient flow via latent_space_mean / latent_space_log_var parameters.
    # z = (x - mu) / std, thus: x = mu + z * std
    normal_sample = nd.random_normal(0, 1, shape=ls_mean.shape)
    ls_val = ls_mean + ls_std * normal_sample

    if isinstance(latent_space_override, nd.NDArray):
        ls_val = latent_space_override

    length = signal.shape[1]
    reconstructed_sig = self.decoder(ls_val, length, signal, teacher_forcing_prob)  # (m, length)

    return SinBAEOutput(ls_mean, ls_log_var, ls_val, reconstructed_sig)
def reparametrize(self, mu, logvar):
    '''
    mu is a number and logvar is an ndarray
    '''
    std = nd.exp(0.5 * logvar)
    eps = nd.random_normal(loc=0, scale=1, shape=std.shape).as_in_context(ctx)
    return mu + eps * std
def softmax(X):
    # X.shape = (256, 10)
    exp = nd.exp(X)
    # exp is a matrix; sum over each row while keeping axis 1,
    # i.e. return a matrix of shape (nrows, 1)
    # partition.shape = (256, 1)
    partition = exp.sum(axis=1, keepdims=True)
    # a[i, j] = exp[i, j] / partition[i, 0]
    a = exp / partition
    return a
def goodness_of_function_loss_function(self):
    # Exponentiate so that every value is > 0
    self.__batch_y_hat_exp = nd.exp(self.__batch_y_hat)
    # Compute the partition term used to normalize into probabilities
    self.__batch_y_hat_partition = self.__batch_y_hat_exp.sum(
        axis=1, keepdims=True)
    self.__batch_y_hat_exp_divided_partition = \
        self.__batch_y_hat_exp / self.__batch_y_hat_partition
    return -nd.log(
        nd.pick(self.__batch_y_hat_exp_divided_partition, self.__batch_y))
def test1():
    x = nd.zeros((3, 4))
    print(x)
    print(nd.ones((4, 4)))
    print(nd.array([[1, 2, 3], [4, 5, 6]]))
    tmp1 = nd.random_normal(0, 1, shape=(3, 4))
    print(tmp1)
    print(tmp1.shape)
    print(tmp1.size)
    print(x + tmp1)
    print(nd.exp(tmp1))
def refine_anchor_generator(arm_anchor_boxes, arm_loc_preds):
    '''
    function:
    input:
        arm_anchor_boxes: shape (1, h*w*num_anchors[:layers], 4)
        arm_loc_preds: shape (batch, h*w*num_loc_pred[:layers])
    '''
    batch_size = arm_loc_preds.shape[0]
    arm_anchor_boxes = nd.concat(*[arm_anchor_boxes] * batch_size, dim=0)  # (batch, h*w*num_anchors[:layers], 4)
    arm_anchor_boxes_bs = nd.split(data=arm_anchor_boxes, axis=2, num_outputs=4)  # (batch, all_anchors, 1) * 4
    al = arm_anchor_boxes_bs[0]  # left-top x
    at = arm_anchor_boxes_bs[1]  # left-top y
    ar = arm_anchor_boxes_bs[2]  # right-bottom x
    ab = arm_anchor_boxes_bs[3]  # right-bottom y
    aw = ar - al
    ah = ab - at
    ax = (al + ar) / 2.0
    ay = (at + ab) / 2.0

    arm_loc_preds = nd.reshape(data=arm_loc_preds, shape=(0, -1, 4))  # (batch, h*w*num_anchors[:layers], 4)
    arm_loc_preds_bs = nd.split(data=arm_loc_preds, axis=2, num_outputs=4)
    # The offsets must come from the location predictions, not the anchors
    # (the original code indexed arm_anchor_boxes_bs here by mistake).
    ox_preds = arm_loc_preds_bs[0]
    oy_preds = arm_loc_preds_bs[1]
    ow_preds = arm_loc_preds_bs[2]
    oh_preds = arm_loc_preds_bs[3]

    # TODO: R-CNN paper box decoding
    ox = ox_preds * aw * 0.1 + ax
    oy = oy_preds * ah * 0.1 + ay
    ow = nd.exp(ow_preds * 0.2) * aw
    oh = nd.exp(oh_preds * 0.2) * ah

    out0 = ox - ow / 2.0
    out1 = oy - oh / 2.0
    out2 = ox + ow / 2.0
    out3 = oy + oh / 2.0
    refine_anchor = nd.concat(out0, out1, out2, out3, dim=2)
    # refine_anchor = nd.split(data=refine_anchor, axis=0, num_outputs=batch_size)
    return refine_anchor  # (batch, h*w*num_anchors[:layers], 4)
def cvt_output_for_predict(self, pred):
    # how to interpret the net output according to format_groundtruth()
    predCls, predObj, XYWH = self.format_net_output(pred)
    batchSize, height, width, boxNum, _ = XYWH.shape
    X, Y, W, H = XYWH.split(num_outputs=4, axis=-1)
    # pdb.set_trace()
    DY = nd.tile(
        nd.arange(0, height, repeat=width * boxNum,
                  ctx=XYWH.context).reshape((1, height, width, boxNum, 1)),
        (batchSize, 1, 1, 1, 1))
    DX = nd.tile(
        nd.arange(0, width, repeat=boxNum,
                  ctx=XYWH.context).reshape((1, 1, width, boxNum, 1)),
        (batchSize, height, 1, 1, 1))
    X = (X + DX) / width
    Y = (Y + DY) / height
    # pdb.set_trace()
    W = nd.exp(W) - 1
    H = nd.exp(H) - 1
    W = nd.clip(W, 0, 1)
    H = nd.clip(H, 0, 1)
    X = nd.clip(X, 0, 1)
    Y = nd.clip(Y, 0, 1)
    left = X
    top = Y
    right = nd.clip(left + W, 0, 1)
    bottom = nd.clip(top + H, 0, 1)
    corners = nd.concat(left, top, right, bottom, dim=-1)  # nms requires corner format
    return predCls, predObj, corners
def _forward_alg(self, feats, lens_):
    batch_size = feats.shape[0]
    tagset_size = feats.shape[2]
    length = feats.shape[1]

    init_alphas = nd.full((self.tagset_size,), -10000.)
    init_alphas[self.tag_dictionary.get_idx_for_item(START_TAG)] = 0.

    forward_var_list = [init_alphas.tile((feats.shape[0], 1))]
    transitions = self.transitions.data().expand_dims(0).tile(
        (feats.shape[0], 1, 1))

    for i in range(feats.shape[1]):
        emit_score = feats[:, i, :]
        tag_var = \
            emit_score.expand_dims(2).tile((1, 1, transitions.shape[2])) + \
            transitions + \
            forward_var_list[i].expand_dims(2).tile(
                (1, 1, transitions.shape[2])).transpose([0, 2, 1])
        max_tag_var = nd.max(tag_var, axis=2)
        new_tag_var = tag_var - max_tag_var.expand_dims(2).tile(
            (1, 1, transitions.shape[2]))
        agg_ = nd.log(nd.sum(nd.exp(new_tag_var), axis=2))
        forward_var_list.append(
            nd.full((feats.shape[0], feats.shape[2]), val=max_tag_var + agg_))
        # cloned = forward_var.clone()
        # forward_var[:, i + 1, :] = max_tag_var + agg_
        # forward_var = cloned

    forward_var = nd.stack(*forward_var_list)[
        lens_, nd.array(list(range(feats.shape[0])), dtype='int32'), :]
    terminal_var = forward_var + \
        self.transitions.data()[
            self.tag_dictionary.get_idx_for_item(STOP_TAG)].expand_dims(0).tile(
            (forward_var.shape[0], 1))
    alpha = log_sum_exp_batch(terminal_var)
    return alpha
def forward(self, is_train=False):
    """Run forward on the current executor."""
    # self.curr_execgrp.forward(is_train=is_train)
    self.get_each_gpu_label()

    # l2-norm forward
    self.weight_norm = nd.L2Normalization(self.weight, mode='instance')

    # fc forward
    no_bias = True
    if no_bias:
        nd.FullyConnected(data=self.data_batch,
                          weight=self.weight_norm,
                          no_bias=True,
                          num_hidden=self.classes,
                          out=self.fc_output)
    else:
        nd.FullyConnected(data=self.data_batch,
                          weight=self.weight_norm,
                          bias=self.bias,
                          num_hidden=self.classes,
                          out=self.fc_output)

    # margin forward
    self.get_each_gpu_label()
    if self.data_of_cur_gpu.size > 0:
        margin_temp = self.fc_output[self.data_of_cur_gpu,
                                     self.label_of_cur_gpu]
        self.pick_fc_of_cur_gpu = margin_temp.copy()
        tem_data = self.margin_loss(self.pick_fc_of_cur_gpu)
        self.fc_output[self.data_of_cur_gpu,
                       self.label_of_cur_gpu] = tem_data[:]
    else:
        self.pick_fc_of_cur_gpu = None

    # softmax forward
    # first allreduce: sum
    sum_fc = nd.sum(nd.exp(self.fc_output), axis=1)
    sum_fc = self.allreduce('global_sum_fc', sum_fc)
    assert len(sum_fc) > 0, "rank:{}, sum_fc".format(self.rank)
    self.global_sum_fc[:] = sum_fc[:]

    # second allreduce: max
    max_fc = nd.max(self.fc_output, axis=1)
    max_fc = self.allreduce('global_max_fc', max_fc,
                            op=perseus.PerseusOp.Max)
    assert len(max_fc) > 0, "rank:{}, max_fc".format(self.rank)
    self.global_max_fc[:] = max_fc[:]
def loss_function(recon_x, x, mu, logvar):
    """
    recon_x: generated images
    x: original images
    mu: latent mean
    logvar: latent log variance
    """
    BCE = reconstruction_function(recon_x, x)  # mse loss
    BCE = nd.sum(BCE)

    # KL divergence: KLD = 0.5 * sum(mu^2 + sigma^2 - log(sigma^2) - 1)
    KLD_element = (nd.power(mu, 2) + nd.exp(logvar)) * (-1) + 1 + logvar
    KLD = nd.sum(KLD_element) * (-0.5)
    # KLD_element = nd.exp(logvar) + nd.power(mu, 2) - logvar - 1
    # KLD = nd.sum(KLD_element) * 0.5

    return BCE + KLD
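# Sanity-check sketch (added as an assumption, not part of the original snippet):
# the KL term above is zero when the posterior equals the unit Gaussian,
# i.e. mu = 0 and logvar = 0.
from mxnet import nd

mu = nd.zeros((2, 3))
logvar = nd.zeros((2, 3))
KLD_element = (nd.power(mu, 2) + nd.exp(logvar)) * (-1) + 1 + logvar
print(nd.sum(KLD_element) * (-0.5))  # [0.]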
def calc_loss(self, signal: nd.NDArray,
              teacher_forcing_prob: float) -> (float, float):
    """
    Compute the loss terms used to optimize the model parameters.

    Args:
        signal: Sin signal: (m, signal_length), m - num of signals (batch_size)
        teacher_forcing_prob: TODO

    Returns:
        L2 loss between input and decoded signals, KLD loss
    """
    decoded_signal_output = self(signal, teacher_forcing_prob)
    latent_space_mean = decoded_signal_output.latent_space_mean
    latent_space_log_var = decoded_signal_output.latent_space_log_var

    l2_loss = self.l2loss(signal, decoded_signal_output.decoded_signal)
    negative_kld = 0.5 * nd.sum(
        1 + latent_space_log_var - latent_space_mean ** 2 -
        nd.exp(latent_space_log_var),
        axis=1)
    return l2_loss, -negative_kld
def rbf_kernels(self, x: NDArray, y: NDArray):
    """
    Computes exp(-c ||x - y||^2).

    ||x - y||^2 = x . x + y . y - 2 x . y, so compute each term separately.
    x are the original features, y are the features used for similarity.
    """
    cross_products = nd.dot(x, y)
    x_products = nd.sum(sqr(x), axis=1, keepdims=True)
    x_products = nd.broadcast_axis(x_products, axis=1, size=y.shape[1])
    y_products = nd.sum(sqr(y), axis=0, keepdims=True)
    y_products = nd.broadcast_axis(y_products, axis=0, size=x.shape[0])
    sqr_difs = x_products + y_products - 2 * cross_products
    print(nd.mean(x_products), nd.mean(y_products), nd.mean(cross_products))
    print(nd.mean(sqr_difs))
    res = nd.exp(-0.05 * sqr_difs)
    print(res.shape)
    return res
def softplus(x):
    return nd.log(1. + nd.exp(x))
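# A numerically stable variant (a sketch added here, not from the original
# snippet): nd.exp(x) overflows for large x, so compute softplus as
# max(x, 0) + log(1 + exp(-|x|)), which is algebraically identical.
from mxnet import nd

def softplus_stable(x):
    return nd.maximum(x, 0.) + nd.log(1. + nd.exp(-nd.abs(x)))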
def get_free_energy(self, v):
    x = nd.dot(v, self.W) + self.hb
    vt = nd.dot(v, self.vb)
    ht = nd.sum(nd.log(1.0 + nd.exp(x)), axis=1)
    fe = -ht - vt  # free energy; how to prevent scale
    return nd.mean(fe)
if i >= burn_in:
    if 0 == i % thinning_interval:
        if (i + 1) % (total_iter_num / sample_num) == 0:
            sgld_sample_list.append(copy_param(teacher_exe))
# print student_exe.grad_arrays
# print student_params
# print student_params_grad
# ch = raw_input()
X_student_batch = X_batch + numpy.random.normal(0, 0.05, X_batch.shape)
teacher_exe.arg_dict['data'][:] = X_student_batch
teacher_exe.forward(is_train=False)
teacher_exe.outputs[0].wait_to_read()
teacher_pred = teacher_exe.outputs[0]
student_exe.arg_dict['data'][:] = X_student_batch
student_exe.forward(is_train=True)
print numpy.hstack((X_batch * X_batch * X_batch,
                    teacher_exe.outputs[0].asnumpy(),
                    student_exe.outputs[0].asnumpy(),
                    nd.exp(student_exe.outputs[1]).asnumpy()))
print 'Student Loss:', student_loss(student_exe.outputs[0],
                                    student_exe.outputs[1],
                                    teacher_pred, teacher_noise_precision)
student_exe.backward(student_grad(student_exe.outputs[0],
                                  student_exe.outputs[1],
                                  teacher_pred, teacher_noise_precision))
for k in student_params:
    student_updater(k, student_params_grad[k], student_params[k])
distilled_sgld_mse, distilled_sgld_ret = \
    pred_test(testing_data=testing_data, exe=student_exe,
              save_path='toy-1d-distilled-sgld.txt')
sgld_mse, sgld_ret = \
    pred_test(testing_data=testing_data, exe=teacher_exe,
              param_list=sgld_sample_list, save_path='toy-1d-sgld.txt')
def student_grad(student_mean, student_var, teacher_pred,
                 teacher_noise_precision):
    grad_mean = nd.exp(-student_var) * (student_mean - teacher_pred)
    grad_var = (1 - nd.exp(-student_var) *
                (nd.square(student_mean - teacher_pred) +
                 1 / teacher_noise_precision)) / 2
    return [grad_mean, grad_var]
def student_loss(student_mean, student_var, teacher_pred,
                 teacher_noise_precision):
    return (0.5 * (student_var + nd.exp(-student_var) *
                   (nd.square(teacher_pred - student_mean) +
                    1 / teacher_noise_precision))).asnumpy()[0]
def log_sum_exp(vec):
    max_score = nd.max(vec).asscalar()
    return nd.log(nd.sum(nd.exp(vec - max_score))) + max_score
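# Usage sketch (assumed, not from the original snippet): subtracting the max
# keeps the result finite where a naive log(sum(exp(vec))) overflows float32.
from mxnet import nd

vec = nd.array([1000., 1001., 1002.])
print(log_sum_exp(vec))             # ~1002.41
print(nd.log(nd.sum(nd.exp(vec))))  # inf: exp(1000.) overflows float32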
def softmax(X):
    # Normalize an arbitrary input into valid probability values via softmax
    exp = nd.exp(X)  # exponentiate
    # sum over each row
    partition = exp.sum(axis=1, keepdims=True)  # keepdims=True preserves the 2-D shape
    return exp / partition
def forward(self, is_train, req, in_data, out_data, aux):
    self.assign(out_data[0], req[0], 1.0 / (1.0 + nd.exp(-in_data[0])))
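# A possible matching backward pass (a sketch under the assumption that the
# forward above sits in an mxnet.operator.CustomOp subclass): for the sigmoid
# output y, dy/dx = y * (1 - y), scaled by the incoming gradient.
def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
    y = out_data[0]
    self.assign(in_grad[0], req[0], out_grad[0] * y * (1.0 - y))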
def softmax(x):
    # softmax converts the raw outputs into probabilities
    exp = nd.exp(x)  # exp makes every entry of x positive
    partition = exp.sum(axis=1, keepdims=True)  # sum along axis 1, i.e. within each row
    return exp / partition
def exp(input):
    return nd.exp(input)
x = nd.array([[1, 2], [3, 4]])
print(x)

# Create a random array whose elements are drawn by random sampling;
# commonly used to initialize model parameters
y = nd.random_normal(0, 1, shape=(3, 4))
print(y)
print(y.shape)
print(y.size)

x = nd.random_normal(0, 1, shape=(3, 4))
print(x)
print(x + y)
print(x * y)

# Element-wise exponentiation
print(nd.exp(y))

# Transpose
print(nd.dot(x, y.T))

# Broadcasting
a = nd.arange(3).reshape((3, 1))
b = nd.arange(2).reshape((1, 2))
print('a:', a)
print('b:', b)
print('a+b:', a + b)

# Conversion to and from NumPy
x = np.ones((2, 3))
y = nd.array(x)
z = y.asnumpy()
print([z, y])
def softmax(X):
    exp = nd.exp(X)
    partition = exp.sum(axis=1, keepdims=True)  # returns an (nrows, 1) matrix
    return exp / partition
def softmax(X):
    exp = nd.exp(X)
    # exp is a matrix; sum over each row while keeping axis 1,
    # i.e. return a matrix of shape (nrows, 1)
    partition = exp.sum(axis=1, keepdims=True)
    return exp / partition
def logsigmoid(val):
    max_elem = nd.maximum(0., -val)
    z = nd.exp(-max_elem) + nd.exp(-val - max_elem)
    return -(max_elem + nd.log(z))
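# Quick equivalence check (assumed, not from the original snippet): the
# max-shifted form should match the naive log(sigmoid(x)) wherever the naive
# form does not overflow.
from mxnet import nd

x = nd.array([-3., 0., 3.])
print(logsigmoid(x))
print(nd.log(1. / (1. + nd.exp(-x))))  # same values for moderate x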
def softmax(x):
    exp = nd.exp(x)
    partition = exp.sum(axis=1, keepdims=True)
    return exp / partition
def softmax(X):
    exp = nd.exp(X)
    partition = exp.sum(axis=1, keepdims=True)
    return exp / partition
def evaluate_accuracy(data_iterator, net, W, b):
    acc = 0.
    for data, label in data_iterator:
        output = net(data, W, b)
        acc += accuracy(output, label)
    return acc / len(data_iterator)


if __name__ == '__main__':
    X = nd.random_normal(shape=(2, 5))
    X_prob = softmax(X)
    print(X)
    print(X_prob)
    print(nd.exp(X[0][0]) / (nd.exp(X[0][0]) + nd.exp(X[1][0])))

    # 1. Data
    mnist_train = gluon.data.vision.FashionMNIST(train=True, transform=transform)
    mnist_test = gluon.data.vision.FashionMNIST(train=False, transform=transform)
    batch_size = 256
    train_data = gluon.data.DataLoader(mnist_train, batch_size, shuffle=True)
    test_data = gluon.data.DataLoader(mnist_test, batch_size, shuffle=False)

    # 2. Model (linear model): W, b
    num_inputs = 28 * 28
    num_outputs = 10
    W = nd.random_normal(shape=(num_inputs, num_outputs))
    b = nd.random_normal(shape=num_outputs)
def softmax(X):
    X_exp = nd.exp(X)
    partition = X_exp.sum(axis=0, keepdims=True)
    return X_exp / partition
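# Usage sketch (assumed, not from the original snippet): with axis=0 the
# normalization runs over columns, so each column of the output sums to 1.
# This variant therefore expects the class scores to lie along axis 0.
from mxnet import nd

X = nd.random.normal(shape=(3, 4))
print(softmax(X).sum(axis=0))  # ~[1. 1. 1. 1.]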