from mxnet import nd


def f_fc(a, b, weight, bias):
    # Elementwise product of the two inputs, then a 32-unit fully connected layer.
    x = a * b
    fc = nd.FullyConnected(x, weight, bias, num_hidden=32)
    return fc
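# Quick usage sketch for f_fc: the shapes below are made up for illustration,
# not taken from the original code. FullyConnected expects weight of shape
# (num_hidden, input_dim) and bias of shape (num_hidden,).
a = nd.random.uniform(shape=(4, 16))
b = nd.random.uniform(shape=(4, 16))
weight = nd.random.normal(shape=(32, 16))  # (num_hidden, input_dim)
bias = nd.zeros((32,))
print(f_fc(a, b, weight, bias).shape)      # -> (4, 32)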
def backward(self):
    """Run backward on the current executor."""
    # softmax: normalize the local fc outputs by the globally-summed denominator
    self.get_each_gpu_label()
    self.logit = nd.exp(self.fc_output)[:]
    self.logit /= self.global_sum_fc.reshape((self.batchsize, 1))[:]
    self.grad[:] = self.logit[:]
    if self.data_of_cur_gpu.size > 0:
        # cross-entropy gradient (softmax - onehot), only for samples whose
        # label falls inside this GPU's class shard; clip before the log
        self.grad[self.data_of_cur_gpu, self.label_of_cur_gpu] -= 1.0
        self.loss[self.data_of_cur_gpu] = -nd.log(
            nd.maximum(
                self.logit[self.data_of_cur_gpu, self.label_of_cur_gpu],
                1e-32))[:]

    # margin: re-run the margin function under autograd to chain its gradient
    if self.data_of_cur_gpu.size > 0:
        grad_fc = self.pick_fc_of_cur_gpu
        grad_fc.attach_grad()
        with autograd.record():
            s = self.margin_loss(grad_fc)
        s.backward(self.grad[self.data_of_cur_gpu, self.label_of_cur_gpu])
        self.grad[self.data_of_cur_gpu,
                  self.label_of_cur_gpu] = grad_fc.grad.copy()
        self.pick_fc_of_cur_gpu = None

    # fc: backprop through the fully connected layer (bias currently disabled)
    self.data_batch.attach_grad()
    self.weight_norm.attach_grad()
    self.bias.attach_grad()
    no_bias = True
    with autograd.record():
        if no_bias:
            nd.FullyConnected(data=self.data_batch,
                              weight=self.weight_norm,
                              no_bias=True,
                              num_hidden=self.classes,
                              out=self.fc_output)
        else:
            nd.FullyConnected(data=self.data_batch,
                              weight=self.weight_norm,
                              bias=self.bias,
                              num_hidden=self.classes,
                              out=self.fc_output)
    self.fc_output.backward(self.grad)
    self.return_feature_grad = self.data_batch.grad.copy()
    self.weight_temp_grad[:] = self.weight_norm.grad[:]

    # allreduce grad: sum feature gradients across ranks, then slice out this
    # GPU's portion of the batch
    self.return_feature_grad = self.allreduce('return_feature_grad',
                                              self.return_feature_grad)
    assert len(self.return_feature_grad), "rank:{}, grad".format(self.rank)
    self.return_each_gpu_grad = self.return_feature_grad[
        self.each_gpu_batchsize * self.rank:self.each_gpu_batchsize *
        (self.rank + 1)]

    # l2-norm: backprop the weight gradient through the L2 normalization
    self.weight.attach_grad()
    with autograd.record():
        s2 = nd.L2Normalization(self.weight, mode='instance')
    s2.backward(self.weight_temp_grad)
    self.weight_grad += self.weight.grad
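# Standalone sketch of the "softmax" step in backward() above: for
# cross-entropy the gradient w.r.t. the logits is softmax(logits) minus the
# one-hot label, and the picked probability is clipped before the log. The
# shapes and labels are made up for illustration; this is not part of the class.
from mxnet import nd


def softmax_grad_sketch():
    fc_output = nd.random.normal(shape=(4, 10))        # hypothetical logits
    label = nd.array([1, 3, 0, 7], dtype='int32')
    rows = nd.arange(4, dtype='int32')
    logit = nd.exp(fc_output)
    logit /= logit.sum(axis=1, keepdims=True)          # row-wise softmax
    grad = logit.copy()
    grad[rows, label] -= 1.0                           # softmax - onehot
    loss = -nd.log(nd.maximum(logit[rows, label], 1e-32))
    return grad, loss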
def network(X, drop_rate=0.0):
    # Output-size formula: output_size = ((input - kernel + 2*padding) / stride) + 1
    # Input sizes: MNIST, FashionMNIST = (batch size, 1, 28, 28)
    #              CIFAR10             = (batch size, 3, 32, 32)
    # Note: the built-in BatchNorm's moving_mean / moving_var tracking does not
    # work in this imperative setting, so use_global_stats=True is used with
    # externally managed statistics (ma*, mv*).
    C_H1 = nd.Activation(data=nd.BatchNorm(
        data=nd.Convolution(data=X, weight=W1, bias=B1, kernel=(3, 3),
                            stride=(1, 1), num_filter=60),
        gamma=gamma1, beta=beta1, moving_mean=ma1, moving_var=mv1,
        momentum=0.9, fix_gamma=False, use_global_stats=True),
        act_type="relu")
    # MNIST: (batch size, 60, 26, 26), CIFAR10: (batch size, 60, 30, 30)
    P_H1 = nd.Pooling(data=C_H1, pool_type="avg", kernel=(2, 2), stride=(2, 2))
    # MNIST: (batch size, 60, 13, 13), CIFAR10: (batch size, 60, 15, 15)
    C_H2 = nd.Activation(data=nd.BatchNorm(
        data=nd.Convolution(data=P_H1, weight=W2, bias=B2, kernel=(6, 6),
                            stride=(1, 1), num_filter=30),
        gamma=gamma2, beta=beta2, moving_mean=ma2, moving_var=mv2,
        momentum=0.9, fix_gamma=False, use_global_stats=True),
        act_type="relu")
    # MNIST: (batch size, 30, 8, 8), CIFAR10: (batch size, 30, 10, 10)
    P_H2 = nd.Pooling(data=C_H2, pool_type="avg", kernel=(2, 2), stride=(2, 2))
    # MNIST: (batch size, 30, 4, 4), CIFAR10: (batch size, 30, 5, 5)
    P_H2 = nd.flatten(data=P_H2)
    '''FullyConnected parameters
    • data:   (batch_size, input_dim)
    • weight: (num_hidden, input_dim)
    • bias:   (num_hidden,)
    • out:    (batch_size, num_hidden)
    '''
    F_H1 = nd.Activation(nd.BatchNorm(
        data=nd.FullyConnected(data=P_H2, weight=W3, bias=B3, num_hidden=120),
        gamma=gamma3, beta=beta3, moving_mean=ma3, moving_var=mv3,
        momentum=0.9, fix_gamma=False, use_global_stats=True),
        act_type="relu")
    F_H1 = nd.Dropout(data=F_H1, p=drop_rate)
    F_H2 = nd.Activation(nd.BatchNorm(
        data=nd.FullyConnected(data=F_H1, weight=W4, bias=B4, num_hidden=64),
        gamma=gamma4, beta=beta4, moving_mean=ma4, moving_var=mv4,
        momentum=0.9, fix_gamma=False, use_global_stats=True),
        act_type="relu")
    F_H2 = nd.Dropout(data=F_H2, p=drop_rate)
    # Return raw logits; softmax is applied in the loss instead.
    out = nd.FullyConnected(data=F_H2, weight=W5, bias=B5, num_hidden=10)
    return out
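# Sanity check for the output-size formula quoted in network() above. This tiny
# helper is illustrative only; the asserts mirror the MNIST shape comments.
def conv_out_size(input_size, kernel, padding=0, stride=1):
    return (input_size - kernel + 2 * padding) // stride + 1


assert conv_out_size(28, 3) == 26             # conv1: 28 -> 26
assert conv_out_size(26, 2, stride=2) == 13   # 2x2/2 avg pool: 26 -> 13
assert conv_out_size(13, 6) == 8              # conv2: 13 -> 8
assert conv_out_size(8, 2, stride=2) == 4     # 2x2/2 avg pool: 8 -> 4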
def network(X, is_training=True, drop_rate=0.0):
    # Output-size formula: output_size = ((input - kernel + 2*padding) / stride) + 1
    # Input sizes: MNIST, FashionMNIST = (batch size, 1, 28, 28)
    #              CIFAR10             = (batch size, 3, 32, 32)
    C_H1 = nd.Activation(data=BatchNorm(
        nd.Convolution(data=X, weight=W1, bias=B1, kernel=(3, 3),
                       stride=(1, 1), num_filter=60),
        gamma1, beta1, scope_name=0, is_training=is_training),
        act_type="relu")
    # MNIST: (batch size, 60, 26, 26), CIFAR10: (batch size, 60, 30, 30)
    P_H1 = nd.Pooling(data=C_H1, pool_type="avg", kernel=(2, 2), stride=(2, 2))
    # MNIST: (batch size, 60, 13, 13), CIFAR10: (batch size, 60, 15, 15)
    C_H2 = nd.Activation(data=BatchNorm(
        nd.Convolution(data=P_H1, weight=W2, bias=B2, kernel=(6, 6),
                       stride=(1, 1), num_filter=30),
        gamma2, beta2, scope_name=1, is_training=is_training),
        act_type="relu")
    # MNIST: (batch size, 30, 8, 8), CIFAR10: (batch size, 30, 10, 10)
    P_H2 = nd.Pooling(data=C_H2, pool_type="avg", kernel=(2, 2), stride=(2, 2))
    # MNIST: (batch size, 30, 4, 4), CIFAR10: (batch size, 30, 5, 5)
    P_H2 = nd.flatten(data=P_H2)
    '''FullyConnected parameters
    • data:   (batch_size, input_dim)
    • weight: (num_hidden, input_dim)
    • bias:   (num_hidden,)
    • out:    (batch_size, num_hidden)
    '''
    F_H1 = nd.Activation(BatchNorm(
        nd.FullyConnected(data=P_H2, weight=W3, bias=B3, num_hidden=120),
        gamma3, beta3, scope_name=2, is_training=is_training),
        act_type="relu")
    F_H1 = nd.Dropout(data=F_H1, p=drop_rate)
    F_H2 = nd.Activation(BatchNorm(
        nd.FullyConnected(data=F_H1, weight=W4, bias=B4, num_hidden=64),
        gamma4, beta4, scope_name=3, is_training=is_training),
        act_type="relu")
    F_H2 = nd.Dropout(data=F_H2, p=drop_rate)
    softmax_Y = nd.softmax(
        nd.FullyConnected(data=F_H2, weight=W5, bias=B5, num_hidden=10))
    return softmax_Y
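# The custom BatchNorm called in the network() variant above is not defined in
# this section. The following is a hypothetical minimal sketch with the same
# call signature, train-mode only: scope_name and is_training are accepted but
# the running-statistics bookkeeping the real version presumably keys off
# scope_name is omitted here.
from mxnet import nd


def BatchNorm(data, gamma, beta, scope_name=0, is_training=True, eps=1e-5):
    if len(data.shape) == 4:                      # conv output (N, C, H, W)
        mean = data.mean(axis=(0, 2, 3), keepdims=True)
        var = ((data - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
        g = gamma.reshape((1, -1, 1, 1))
        b = beta.reshape((1, -1, 1, 1))
    else:                                         # fc output (N, D)
        mean = data.mean(axis=0, keepdims=True)
        var = ((data - mean) ** 2).mean(axis=0, keepdims=True)
        g, b = gamma, beta
    x_hat = (data - mean) / nd.sqrt(var + eps)    # normalize per feature
    return g * x_hat + b                          # scale and shift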
import numpy as np