import math

from torch import FloatTensor

# `Module` is assumed to be the mini-framework's base class, defined
# elsewhere in the project.


class Linear(Module):
    '''Linear module.'''

    def __init__(self, in_features, out_features):
        '''Initializes the weight, the bias, and their gradients.'''
        super(Linear, self).__init__()
        self.weight = FloatTensor(out_features, in_features)
        self.bias = FloatTensor(out_features).view(-1, 1)
        self.reset_parameters()
        self.bias_grad = FloatTensor(self.bias.size()).zero_()
        self.weight_grad = FloatTensor(self.weight.size()).zero_()
        self.previous_x = None

    def reset_parameters(self):
        '''Initializes the weight and the bias with a uniform distribution.
        Taken from Lecture 5 of Deep Learning.'''
        std = 1 / math.sqrt(self.weight.size(1))
        self.weight.uniform_(-std, std)
        self.bias.uniform_(-std, std)

    def forward(self, x):
        '''Computes the forward pass of the Linear module.'''
        self.previous_x = x
        return self.weight.matmul(x) + self.bias

    def backward(self, *gradwrtoutput):
        '''Computes the backward pass of the Linear module.'''
        # keepdim=True keeps the bias gradient shaped (out_features, 1)
        self.bias_grad.add_(gradwrtoutput[0].sum(1, keepdim=True))
        self.weight_grad.add_(gradwrtoutput[0].matmul(self.previous_x.t()))
        return self.weight.t().matmul(gradwrtoutput[0])

    def step(self, eta):
        '''Updates the weight and the bias after a gradient step.'''
        self.weight = self.weight - eta * self.weight_grad
        self.bias = self.bias - eta * self.bias_grad

    def grad_zero(self):
        '''Resets the gradients to zero after a gradient step.'''
        self.bias_grad.zero_()
        self.weight_grad.zero_()

    def param(self):
        '''Returns the weight and the bias, each paired with its gradient.'''
        return [(self.weight, self.weight_grad),
                (self.bias, self.bias_grad)]
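# A minimal usage sketch (an addition, not part of the original code): this
# variant is column-major, i.e. `forward` expects x of shape
# (in_features, batch_size) with one sample per column.
layer = Linear(3, 2)
x = FloatTensor(3, 5).normal_()                # 5 samples as columns
y = layer.forward(x)                           # shape (2, 5)
layer.backward(FloatTensor(2, 5).fill_(1.0))   # fake dL/dy, accumulates grads
layer.step(0.01)                               # one SGD update
layer.grad_zero()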
import math

from torch import Tensor


def dtanh(x):
    # Helper assumed by backward() below: the derivative of tanh.
    return 1 - x.tanh().pow(2)


class Linear(Module):
    def __init__(self, in_features, out_features, bias=True):
        super(Linear, self).__init__()
        self.epsilon = 200
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Tensor(out_features, in_features)
        self.dweight = Tensor(self.weight.size())
        # creating the bias gradient only when a bias exists avoids a crash
        # on bias=False
        if bias:
            self.bias = Tensor(out_features)
            self.dbias = Tensor(self.bias.size())
        else:
            self.bias = None
            self.dbias = None
        self.previous_input = Tensor()
        self.current_output = Tensor()
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.epsilon)
        self.weight.uniform_(0, stdv)
        if self.bias is not None:
            self.bias.uniform_(0, stdv)

    def reset_gradient(self):
        self.dweight.zero_()
        if self.dbias is not None:
            self.dbias.zero_()

    def forward(self, input):
        self.previous_input = input
        output = self.weight.mv(input)
        if self.bias is not None:
            output = output + self.bias
        self.current_output = output
        return output

    def backward(self, input):
        # The derivative of a tanh activation is folded into this layer's
        # backward pass: the layer assumes its output feeds a tanh.
        dl_ds = dtanh(self.current_output) * input
        dl_dx = self.weight.t().mv(dl_ds)
        self.dweight.add_(
            dl_ds.view(-1, 1).mm(self.previous_input.view(1, -1)))
        if self.dbias is not None:
            self.dbias.add_(dl_ds)
        return dl_dx

    def update_parameters(self, eta):
        self.weight = self.weight - eta * self.dweight
        if self.bias is not None:
            self.bias = self.bias - eta * self.dbias

    def parameters(self):
        return self.weight, self.bias
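# A minimal usage sketch (an addition, not part of the original code): this
# variant processes one sample at a time (it uses mv), and its backward
# applies the tanh derivative internally as noted above.
layer = Linear(4, 2)
x = Tensor(4).normal_()
y = layer.forward(x)                           # shape (2,)
dl_dx = layer.backward(Tensor(2).fill_(1.0))   # fake gradient, returns dL/dx
layer.update_parameters(eta=0.01)
layer.reset_gradient()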
import math

from torch import FloatTensor


class Linear(Module):
    '''
    Assumptions:
    Linear equation: Y = X * W + b.
    Data layout: rows represent samples, columns represent features.
    '''

    def __init__(self, input_dim, output_dim, bias=True, initOption='Normal'):
        super(Linear, self).__init__()
        self.name = 'Linear'
        self.input_dim, self.output_dim = input_dim, output_dim
        self.w = FloatTensor(input_dim, output_dim)
        self.gradW = FloatTensor(input_dim, output_dim)
        if bias:
            self.b = FloatTensor(output_dim)
            self.gradB = FloatTensor(output_dim)
        else:
            self.b = None
            self.gradB = None
        self.initOption = initOption
        self.initParameters()

    def initParameters(self):
        '''Different methods for parameter initialization.'''
        if self.initOption == 'Normal':
            self.w.normal_()
        if self.initOption == 'Zero':
            self.w.zero_()
        if self.initOption == 'He':
            # 'He' initialization is recommended for layers followed by a ReLU
            self.w.normal_().mul_(math.sqrt(2 / self.input_dim))
        if self.initOption == 'Xavier':
            # 'Xavier' initialization is recommended for layers followed by a tanh
            self.w.normal_().mul_(
                math.sqrt(2 / (self.input_dim + self.output_dim)))
        self.gradW.fill_(0)
        if self.b is not None:
            self.b.normal_()
            self.gradB.fill_(0)

    def forward(self, input):
        '''Forward pass: Y = X * W + b.'''
        self.input = input
        if self.b is not None:
            self.output = self.input.matmul(self.w).add(self.b)  # broadcasts b over the batch
        else:
            self.output = self.input.matmul(self.w)
        return self.output

    def backward(self, gradwrtoutput):
        '''
        Backpropagation, with gradwrtoutput of shape batch_size x output_dim:
        dW = X^T * dL/dY
        db = (dL/dY)^T * 1, i.e. dL/dY summed over the batch
        dX = dL/dY * W^T.
        '''
        self.gradW.add_(self.input.t().matmul(gradwrtoutput))
        if self.b is not None:
            self.gradB.add_(gradwrtoutput.sum(0))
        return gradwrtoutput.matmul(self.w.t())

    def zero_grad(self):
        '''Sets the gradients to zero.'''
        self.gradW.zero_()
        if self.b is not None:
            self.gradB.zero_()

    def param(self):
        '''Returns the parameters, each paired with its gradient.'''
        if self.b is not None:
            return [(self.w, self.gradW), (self.b, self.gradB)]
        else:
            return [(self.w, self.gradW)]
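# A minimal usage sketch (an addition, not part of the original code): this
# variant is row-major, i.e. `forward` expects input of shape
# (batch_size, input_dim) with one sample per row.
layer = Linear(3, 2, initOption='Xavier')
x = FloatTensor(5, 3).normal_()                        # 5 samples as rows
y = layer.forward(x)                                   # shape (5, 2)
dl_dx = layer.backward(FloatTensor(5, 2).fill_(1.0))   # fake dL/dY
layer.zero_grad()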
import math

from torch import FloatTensor


class Linear(Module):
    # One fully-connected layer.
    def __init__(self, in_dim, out_dim, eps=1., method='xavier'):
        super(Linear, self).__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        # define the weight, the bias and their gradients
        self.w = FloatTensor(out_dim, in_dim)
        self.dw = FloatTensor(out_dim, in_dim)
        self.b = FloatTensor(out_dim)
        self.db = FloatTensor(out_dim)
        # initialization: defaults to Xavier
        if method == 'zero':
            self.w = self.w.fill_(0)
            self.b = self.b.fill_(0)
        elif method == 'normal':
            self.w = self.w.normal_(mean=0, std=eps)
            self.b = self.b.normal_(mean=0, std=eps)
        else:
            temp_std = 1. / math.sqrt((self.in_dim + self.out_dim) / 2)
            self.w = self.w.normal_(mean=0, std=temp_std)
            self.b = self.b.normal_(mean=0, std=temp_std)
        # zero gradient initialization
        self.dw = self.dw.zero_()
        self.db = self.db.zero_()

    def forward(self, x):
        # y = w * x + b
        self.input = x.clone()
        self.output = self.w.matmul(self.input) + self.b
        # self.output = self.w @ self.input + self.b
        return self.output

    def backward(self, gradwrtoutput):
        temp_wt = self.w.t()
        # dw = dL/dy * x^T (outer product for a single sample)
        temp_dw = gradwrtoutput.view(-1, 1).mm(self.input.view(1, -1))
        self.dw.add_(temp_dw)
        # db = dL/dy
        temp_db = gradwrtoutput.clone()
        self.db.add_(temp_db)
        # dx = w^T * dL/dy
        temp_dx = temp_wt.matmul(gradwrtoutput)
        return temp_dx

    def param(self):
        return [self.w, self.dw, self.b, self.db]

    def zero_grad(self):
        self.dw.zero_()
        self.db.zero_()
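# A minimal usage sketch (an addition, not part of the original code): this
# variant also processes a single sample at a time, with x of shape (in_dim,).
layer = Linear(3, 2, method='xavier')
x = FloatTensor(3).normal_()
y = layer.forward(x)                                # shape (2,)
dl_dx = layer.backward(FloatTensor(2).fill_(1.0))   # fake dL/dy, returns dL/dx
layer.zero_grad()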