def __init__(self, input_shape, eps=1e-4, momentum=0.1, estimate=True,
             axes="per-activation"):
    """
    Args:
        input_shape: shape of the input; used to allocate the running statistics
        eps: small constant added to the variance for numerical stability
        momentum: momentum used when updating the running mean/variance
        estimate: if True, learnable scale (gamma) and shift (beta) parameters are created
        axes: ('per-activation', 'spatial' or a tuple of ints)
            The axes along which the input should be normalized.
            'per-activation' normalizes per activation and is equal to axes=(0,).
            'spatial' shares normalization factors across spatial dimensions
            (i.e., all dimensions past the second), which for 4D inputs would be
            equal to axes=(0, 2, 3).
    """
    self.axes = axes
    self.eps = eps
    self.momentum = momentum
    self.estimate = estimate
    self.running_mean = Constant(0.0)(input_shape)
    self.running_var = Constant(1.0)(input_shape)
    if self.estimate:
        self.gamma = Uniform(0.0, 1.0)(input_shape)
        self.beta = Constant(0.0)(input_shape)
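# Illustration only (not part of the layer): a NumPy sketch of what the two
# `axes` settings described in the docstring above mean for a 4D input of shape
# (batch, channels, height, width). All names below are hypothetical.
import numpy as np

x = np.random.randn(8, 3, 5, 5).astype("float32")

# 'per-activation' -> axes=(0,): one mean/variance per activation, shared over the batch only
mean_pa = x.mean(axis=0, keepdims=True)          # shape (1, 3, 5, 5)
var_pa = x.var(axis=0, keepdims=True)

# 'spatial' -> axes=(0, 2, 3): statistics shared across spatial positions, one per channel
mean_sp = x.mean(axis=(0, 2, 3), keepdims=True)  # shape (1, 3, 1, 1)
var_sp = x.var(axis=(0, 2, 3), keepdims=True)

eps = 1e-4
x_hat = (x - mean_sp) / np.sqrt(var_sp + eps)    # normalized input for the 'spatial' case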
def __init__(self, n_input_ch, n_output_ch, kernel, stride=(1, 1), pad=(0, 0),
             dilation=(1, 1), input_shape=None, weight_init=XavierNormal(),
             bias_init=Constant(0.0), use_bias=True):
    assert len(kernel) == 2
    assert len(stride) == 2
    self.n_input_ch = n_input_ch
    self.n_output_ch = n_output_ch
    self.kernel = kernel
    self.stride = stride
    self.pad = pad
    self.dilation = dilation
    self.input_shape = input_shape
    self.filter = theano.shared(
        weight_init((n_input_ch, n_output_ch) + kernel))
    self.use_bias = use_bias
    if use_bias:
        self.bias = theano.shared(bias_init(n_output_ch))
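# Illustration only: the output spatial size per dimension that a 2D convolution
# with these hyper-parameters produces, using the standard formula. This helper
# is hypothetical and not taken from this codebase.
def conv_output_length(size, kernel, stride, pad, dilation):
    # the effective kernel size grows with dilation
    effective = dilation * (kernel - 1) + 1
    return (size + 2 * pad - effective) // stride + 1

# e.g. a 32x32 input with a 3x3 kernel, stride 1, pad 1, dilation 1 keeps its size
assert conv_output_length(32, 3, 1, 1, 1) == 32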
def updates(self, params, cost: tt.Variable) -> OrderedDict:
    updates = OrderedDict()
    params = list(params)
    sum_grad = 0.0
    for p in params:
        d = None
        if hasattr(self, "state_keys"):
            d = {
                k: theano.shared(Constant(0.0)(p.get_value().shape))
                for k in self.state_keys
            }
        g = tt.grad(cost=cost, wrt=p)
        sum_grad += g.norm(self.norm)
        us = self.base_optimizer.update_one(p, g, d)
        updates.update(us)
    # rescale every update expression when the accumulated gradient norm
    # exceeds the clipping threshold
    scale = ifelse(sum_grad > self.threshold, self.threshold / sum_grad, 1.0)
    for p in params:
        updates[p] *= scale
    return updates


# class YellowFin(Optimizer):
#     def __init__(self, lr=0.1, mu=0.0, clip_thresh=None, weight_decay=0.0, beta=0.999,
#                  curv_win_width=20, zero_debias=True, delta_mu=0.0, auto_clip_fac=None):
#         self.base_optimizer = SGD(lr=lr, momentum=)
#         self.state_keys =
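# Illustration only: the scale factor computed above, shown with concrete numbers
# in NumPy. Here it is applied directly to raw gradients, which is the conventional
# reading of norm clipping; the wrapper above applies the same factor to the update
# expressions it collects. All values are hypothetical.
import numpy as np

grads = [np.array([3.0, 4.0]), np.array([0.0, 12.0])]
threshold = 10.0

total_norm = sum(np.linalg.norm(g, ord=2) for g in grads)   # 5 + 12 = 17
scale = threshold / total_norm if total_norm > threshold else 1.0
clipped = [g * scale for g in grads]                         # norms now sum to the threshold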
def __init__(self, n_input, n_output, weight_init=XavierNormal(1.0),
             bias_init=Constant(0.0)):
    self.n_input = n_input
    self.n_output = n_output
    self.weight = theano.shared(weight_init((n_input, n_output)), name="weight")
    self.bias = theano.shared(bias_init(n_output), name="bias")
def __init__(self, input_dim, output_dim, weight_init=XavierNormal(), bias_init=Constant(0.0), name="", impl=RNNImpl.auto, n_batch=1): self.n_batch = n_batch self.input_dim = input_dim self.hidden_dim = output_dim self.output_dim = output_dim self.params = [] # TODO: cuDNN conversion self.non_cudnn_params = [] def register(init, shape, name): v = theano.shared(init(shape), name=name) self.params.append(v) self.non_cudnn_params.append(v) return v self.W_i = register(weight_init, (input_dim, output_dim), name=name + ".W_i") self.b_wi = register(bias_init, (output_dim, ), name=name + ".b_wi") self.W_f = register(weight_init, (input_dim, output_dim), name=name + ".W_f") self.b_wf = register(bias_init, (output_dim, ), name=name + ".b_wf") self.W_c = register(weight_init, (input_dim, output_dim), name=name + ".W_c") self.b_wc = register(bias_init, (output_dim, ), name=name + ".b_wc") self.W_o = register(weight_init, (input_dim, output_dim), name=name + ".W_o") self.b_wo = register(bias_init, (output_dim, ), name=name + ".b_wo") self.R_i = register(weight_init, (output_dim, output_dim), name=name + ".R_i") self.b_ri = register(bias_init, (output_dim, ), name=name + ".b_ri") self.R_f = register(weight_init, (output_dim, output_dim), name=name + ".R_f") self.b_rf = register(bias_init, (output_dim, ), name=name + ".b_rf") self.R_c = register(weight_init, (output_dim, output_dim), name=name + ".R_c") self.b_rc = register(bias_init, (output_dim, ), name=name + ".b_rc") self.R_o = register(weight_init, (output_dim, output_dim), name=name + ".R_o") self.b_ro = register(bias_init, (output_dim, ), name=name + ".b_ro") self.impl = impl # NOTE this should be set after all the initialization of params
def updates(self, params, cost: tt.Variable) -> OrderedDict:
    updates = OrderedDict()
    for p in params:
        d = None
        if hasattr(self, "state_keys"):
            # one shared state variable per key (e.g. momentum buffers),
            # shaped like the parameter it belongs to
            d = {
                k: theano.shared(Constant(0.0)(p.get_value().shape))
                for k in self.state_keys
            }
        g = tt.grad(cost=cost, wrt=p)
        us = self.update_one(p, g, d)
        updates.update(us)
    return updates
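# Illustration only: a minimal sketch of what an optimizer plugging into the loop
# above could look like. `update_one` and `state_keys` are the hooks that loop
# consumes; the class name and hyper-parameters below are hypothetical.
from collections import OrderedDict


class MomentumSGDSketch:
    state_keys = ["velocity"]

    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum

    def update_one(self, p, g, d):
        # d["velocity"] is the shared state allocated by the base loop above
        v_new = self.momentum * d["velocity"] - self.lr * g
        return OrderedDict([(d["velocity"], v_new), (p, p + v_new)])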
def __init__(self, input_dim, output_dim, weight_init=XavierNormal(), bias_init=Constant(0.0), name="", impl=RNNImpl.auto, n_batch=1): self.n_batch = n_batch self.name = name self.input_dim = input_dim self.hidden_dim = output_dim self.output_dim = output_dim self.params = [] # TODO: cuDNN conversion self.non_cudnn_params = [] def register(init, shape, name): v = theano.shared(init(shape), name=name) self.params.append(v) self.non_cudnn_params.append(v) return v # NOTE: do not change this initialization order because of cuDNN transfer self.W_r = register(weight_init, (input_dim, output_dim), name=name + ".W_r") self.b_wr = register(bias_init, (output_dim, ), name=name + ".b_wr") self.W_i = register(weight_init, (input_dim, output_dim), name=name + ".W_i") self.b_wi = register(bias_init, (output_dim, ), name=name + ".b_wi") self.W_h = register(weight_init, (input_dim, output_dim), name=name + ".W_h") self.b_wh = register(bias_init, (output_dim, ), name=name + ".b_wh") self.R_r = register(weight_init, (output_dim, output_dim), name=name + ".R_r") self.b_rr = register(bias_init, (output_dim, ), name=name + ".b_rr") self.R_i = register(weight_init, (output_dim, output_dim), name=name + ".R_i") self.b_ru = register(bias_init, (output_dim, ), name=name + ".b_ru") self.R_h = register(weight_init, (output_dim, output_dim), name=name + ".R_h") self.b_rh = register(bias_init, (output_dim, ), name=name + ".b_rh") self.impl = impl # NOTE this should be set after all the initialization of params
def __init__(self, n_input, n_output, activation=tt.tanh, weight_init=XavierNormal(1.0),
             bias_init=Constant(0.0), impl=RNNImpl.auto):
    self.impl = impl
    self.n_input = n_input
    self.n_output = n_output
    self.activation = activation
    self.weight_hx = theano.shared(weight_init((n_input, n_output)), name="weight_hx")
    self.weight_hh = theano.shared(weight_init((n_output, n_output)), name="weight_hh")
    self.bias = theano.shared(bias_init(n_output), name="bias")
    self.state = None
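# Illustration only: the classic recurrence these parameters (weight_hx, weight_hh,
# bias) support, written out in NumPy. The layer's actual forward pass is not shown
# in this snippet, so this is a sketch of the standard Elman recurrence, not its
# implementation.
import numpy as np

n_input, n_output, n_steps = 4, 3, 5
W_hx = np.random.randn(n_input, n_output) * 0.1
W_hh = np.random.randn(n_output, n_output) * 0.1
b = np.zeros(n_output)

h = np.zeros(n_output)
for x_t in np.random.randn(n_steps, n_input):
    h = np.tanh(x_t @ W_hx + h @ W_hh + b)   # h_t = tanh(x_t W_hx + h_{t-1} W_hh + b)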
def _params_to_cudnn(self):
    from theano.gpuarray import dnn
    from theano.gpuarray.type import gpuarray_shared_constructor
    assert dnn.dnn_available(None)
    self._rnn_block = dnn.RNNBlock(theano.config.floatX,
                                   self.hidden_dim,
                                   num_layers=1,
                                   input_mode="linear",
                                   rnn_mode=self.rnn_type,
                                   direction_mode="unidirectional")
    param_size = self._rnn_block.get_param_size(
        [self.n_batch, self.input_dim])  # TODO: study about n_batch
    self.params = [gpuarray_shared_constructor(Constant(0.0)(param_size))]
    cs = self._rnn_block.split_params(self.params[0],
                                      layer=0,
                                      input_size=[self.n_batch, self.input_dim])
    # TODO: multi layer support
    for c, p in zip(cs, self.non_cudnn_params):
        c[:] = p.get_value(borrow=True, return_internal_type=True)
def __init__(self, n_input_ch, n_output_ch, kernel, weight_init=XavierNormal(),
             bias_init=Constant(0.0)):
    assert kernel % 2 == 1
    self.kernel = kernel
    self.pad = (kernel - 1) // 2  # "same" padding for an odd kernel
    self.n_output_ch = n_output_ch
    # both convolutions emit 3 * n_output_ch channels, i.e. three
    # n_output_ch-sized blocks packed into one convolution each
    self.conv_wx = Conv1D(n_input_ch, n_output_ch * 3, kernel=self.kernel,
                          pad=self.pad, weight_init=weight_init, bias_init=bias_init)
    self.conv_wh = Conv1D(n_output_ch, n_output_ch * 3, kernel=self.kernel,
                          pad=self.pad, weight_init=weight_init, bias_init=bias_init)
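# Illustration only: since both convolutions above emit 3 * n_output_ch channels,
# a forward pass would typically split that activation back into three
# n_output_ch-sized blocks (e.g. three gate pre-activations). This split is a
# hypothetical sketch; the layer's actual forward pass is not in this snippet.
import numpy as np

n_output_ch, length = 4, 10
wx = np.random.randn(1, 3 * n_output_ch, length)   # output of conv_wx on one sequence

block_0 = wx[:, 0 * n_output_ch:1 * n_output_ch]   # first  n_output_ch channels
block_1 = wx[:, 1 * n_output_ch:2 * n_output_ch]   # second n_output_ch channels
block_2 = wx[:, 2 * n_output_ch:3 * n_output_ch]   # third  n_output_ch channels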
def __init__(self, n_input_ch, n_output_ch, kernel, stride=1, pad=0, dilation=1,
             input_shape=None, weight_init=XavierNormal(), bias_init=Constant(0.0),
             use_bias=True):
    self.n_input_ch = n_input_ch
    self.n_output_ch = n_output_ch
    self.kernel = kernel
    self.stride = stride
    self.pad = pad
    self.dilation = dilation
    self.input_shape = input_shape
    # the 1D filter is stored as a 4D tensor with a singleton trailing axis
    w = weight_init(
        (n_input_ch, n_output_ch, kernel, 1)).transpose(1, 0, 2, 3)
    self.filter = theano.shared(w)
    self.use_bias = use_bias
    if use_bias:
        self.bias = theano.shared(bias_init(n_output_ch))